LLVM  14.0.0git
AMDGPULateCodeGenPrepare.cpp
Go to the documentation of this file.
1 //===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass does misc. AMDGPU optimizations on IR *just* before instruction
11 /// selection.
12 //
13 //===----------------------------------------------------------------------===//
14 
#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"
25 
26 #define DEBUG_TYPE "amdgpu-late-codegenprepare"
27 
28 using namespace llvm;
29 
30 // Scalar load widening needs running after load-store-vectorizer as that pass
31 // doesn't handle overlapping cases. In addition, this pass enhances the
32 // widening to handle cases where scalar sub-dword loads are naturally aligned
33 // only but not dword aligned.
34 static cl::opt<bool>
35  WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
36  cl::desc("Widen sub-dword constant address space loads in "
37  "AMDGPULateCodeGenPrepare"),
38  cl::ReallyHidden, cl::init(true));
39 
40 namespace {
41 
42 class AMDGPULateCodeGenPrepare
43  : public FunctionPass,
44  public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
45  Module *Mod = nullptr;
46  const DataLayout *DL = nullptr;
47 
48  AssumptionCache *AC = nullptr;
49  LegacyDivergenceAnalysis *DA = nullptr;
50 
51 public:
52  static char ID;
53 
54  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
55 
56  StringRef getPassName() const override {
57  return "AMDGPU IR late optimizations";
58  }
59 
60  void getAnalysisUsage(AnalysisUsage &AU) const override {
63  AU.setPreservesAll();
64  }
65 
66  bool doInitialization(Module &M) override;
67  bool runOnFunction(Function &F) override;
68 
69  bool visitInstruction(Instruction &) { return false; }
70 
71  // Check if the specified value is at least DWORD aligned.
72  bool isDWORDAligned(const Value *V) const {
73  KnownBits Known = computeKnownBits(V, *DL, 0, AC);
74  return Known.countMinTrailingZeros() >= 2;
75  }
76 
77  bool canWidenScalarExtLoad(LoadInst &LI) const;
78  bool visitLoadInst(LoadInst &LI);
79 };
80 
81 } // end anonymous namespace
82 
83 bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
84  Mod = &M;
85  DL = &Mod->getDataLayout();
86  return false;
87 }
88 
90  if (skipFunction(F))
91  return false;
92 
93  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
94  DA = &getAnalysis<LegacyDivergenceAnalysis>();
95 
96  bool Changed = false;
97  for (auto &BB : F)
99  Changed |= visit(I);
100 
101  return Changed;
102 }
103 
104 bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
105  unsigned AS = LI.getPointerAddressSpace();
106  // Skip non-constant address space.
107  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
109  return false;
110  // Skip non-simple loads.
111  if (!LI.isSimple())
112  return false;
113  auto *Ty = LI.getType();
114  // Skip aggregate types.
115  if (Ty->isAggregateType())
116  return false;
117  unsigned TySize = DL->getTypeStoreSize(Ty);
118  // Only handle sub-DWORD loads.
119  if (TySize >= 4)
120  return false;
121  // That load must be at least naturally aligned.
122  if (LI.getAlign() < DL->getABITypeAlign(Ty))
123  return false;
124  // It should be uniform, i.e. a scalar load.
125  return DA->isUniform(&LI);
126 }
127 
128 bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
129  if (!WidenLoads)
130  return false;
131 
132  // Skip if that load is already aligned on DWORD at least as it's handled in
133  // SDAG.
134  if (LI.getAlign() >= 4)
135  return false;
136 
137  if (!canWidenScalarExtLoad(LI))
138  return false;
139 
140  int64_t Offset = 0;
141  auto *Base =
143  // If that base is not DWORD aligned, it's not safe to perform the following
144  // transforms.
145  if (!isDWORDAligned(Base))
146  return false;
147 
148  int64_t Adjust = Offset & 0x3;
149  if (Adjust == 0) {
150  // With a zero adjust, the original alignment could be promoted with a
151  // better one.
152  LI.setAlignment(Align(4));
153  return true;
154  }
155 
156  IRBuilder<> IRB(&LI);
157  IRB.SetCurrentDebugLocation(LI.getDebugLoc());
158 
159  unsigned AS = LI.getPointerAddressSpace();
160  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
161  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
162 
163  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
164  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
165  auto *NewPtr = IRB.CreateBitCast(
166  IRB.CreateConstGEP1_64(
167  IRB.getInt8Ty(),
168  IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
169  Offset - Adjust),
170  Int32PtrTy);
171  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
172  NewLd->copyMetadata(LI);
173  NewLd->setMetadata(LLVMContext::MD_range, nullptr);
174 
175  unsigned ShAmt = Adjust * 8;
176  auto *NewVal = IRB.CreateBitCast(
177  IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
178  LI.replaceAllUsesWith(NewVal);
180 
181  return true;
182 }
183 
184 INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
185  "AMDGPU IR late optimizations", false, false)
188 INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
190 
191 char AMDGPULateCodeGenPrepare::ID = 0;
192 
194  return new AMDGPULateCodeGenPrepare();
195 }
llvm::RecursivelyDeleteTriviallyDeadInstructions
bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition: Local.cpp:511
AssumptionCache.h
llvm
This is an optimization pass for GlobalISel generic memory operations.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:113
llvm::Type::getInt8PtrTy
static PointerType * getInt8PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:293
llvm::Function
Definition: Function.h:62
llvm::createAMDGPULateCodeGenPreparePass
FunctionPass * createAMDGPULateCodeGenPreparePass()
Definition: AMDGPULateCodeGenPrepare.cpp:193
llvm::IRBuilder<>
ValueTracking.h
Local.h
late
aarch64 falkor hwpf fix late
Definition: AArch64FalkorHWPFFix.cpp:230
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
llvm::Instruction::copyMetadata
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Definition: Instruction.cpp:835
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:267
llvm::LoadInst::getAlign
Align getAlign() const
Return the alignment of the access that is being performed.
Definition: Instructions.h:223
llvm::Type::getInt32PtrTy
static PointerType * getInt32PtrTy(LLVMContext &C, unsigned AS=0)
Definition: Type.cpp:301
llvm::cl::ReallyHidden
@ ReallyHidden
Definition: CommandLine.h:144
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1336
KnownBits.h
llvm::KnownBits::countMinTrailingZeros
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition: KnownBits.h:226
CommandLine.h
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPULateCodeGenPrepare.cpp:26
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
false
Definition: StackSlotColoring.cpp:142
llvm::Instruction
Definition: Instruction.h:45
llvm::LegacyDivergenceAnalysis
Definition: LegacyDivergenceAnalysis.h:31
IR
Statically lint checks LLVM IR
Definition: Lint.cpp:746
Align
uint64_t Align
Definition: ELFObjHandler.cpp:83
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::cl::opt< bool >
llvm::LoadInst::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Definition: Instructions.h:273
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::LoadInst::setAlignment
void setAlignment(Align Align)
Definition: Instructions.h:227
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:441
llvm::make_early_inc_range
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:593
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:224
IRBuilder.h
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:362
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::AssumptionCacheTracker
An immutable pass that tracks lazily created AssumptionCache objects.
Definition: AssumptionCache.h:202
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:57
AMDGPU.h
InstVisitor.h
llvm::LoadInst::isSimple
bool isSimple() const
Definition: Instructions.h:259
llvm::AssumptionCache
A cache of @llvm.assume calls within a function.
Definition: AssumptionCache.h:42
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
llvm::Value::getContext
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:991
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::InstVisitor
Base class for instruction visitors.
Definition: InstVisitor.h:79
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:366
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::KnownBits
Definition: KnownBits.h:23
llvm::Type::getIntNTy
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition: Type.cpp:245
llvm::AnalysisUsage::setPreservesAll
void setPreservesAll()
Set by analyses that do not transform their input at all.
Definition: PassAnalysisSupport.h:130
llvm::M68kBeads::DA
@ DA
Definition: M68kBaseInfo.h:59
optimizations
AMDGPU IR late optimizations
Definition: AMDGPULateCodeGenPrepare.cpp:189
llvm::GetPointerBaseWithConstantOffset
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
Definition: ValueTracking.h:288
LegacyDivergenceAnalysis.h
WidenLoads
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
llvm::Instruction::getDebugLoc
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:370
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
BB
Common register allocation spilling lr str ldr sxth r3 ldr mla r4 can lr mov lr str ldr sxth r3 mla r4 and then merge mul and lr str ldr sxth r3 mla r4 It also increase the likelihood the store may become dead bb27 Successors according to LLVM BB
Definition: README.txt:39
llvm::AnalysisUsage::addRequired
AnalysisUsage & addRequired()
Definition: PassAnalysisSupport.h:75
llvm::cl::desc
Definition: CommandLine.h:412
Mod
Module * Mod
Definition: PassBuilderBindings.cpp:54
InitializePasses.h
llvm::Value
LLVM Value Representation.
Definition: Value.h:74
llvm::sampleprof::Base
@ Base
Definition: Discriminator.h:58
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:38