AMDGPULateCodeGenPrepare.cpp (LLVM 13.0.0git)
//===-- AMDGPULateCodeGenPrepare.cpp --------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass does misc. AMDGPU optimizations on IR *just* before instruction
/// selection.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/Local.h"

#define DEBUG_TYPE "amdgpu-late-codegenprepare"

using namespace llvm;

// Scalar load widening needs to run after the load-store vectorizer, as that
// pass doesn't handle overlapping cases. In addition, this pass enhances the
// widening to handle cases where scalar sub-dword loads are only naturally
// aligned, not dword aligned.
static cl::opt<bool>
    WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads",
               cl::desc("Widen sub-dword constant address space loads in "
                        "AMDGPULateCodeGenPrepare"),
               cl::ReallyHidden, cl::init(true));
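// Editorial overview: for a uniform sub-DWORD load from the constant address
// space that is only naturally aligned, visitLoadInst below replaces it with
// a DWORD-aligned i32 load covering the same bytes, then shifts and truncates
// the loaded value back to the original type.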
39 
40 namespace {
41 
42 class AMDGPULateCodeGenPrepare
43  : public FunctionPass,
44  public InstVisitor<AMDGPULateCodeGenPrepare, bool> {
45  Module *Mod = nullptr;
46  const DataLayout *DL = nullptr;
47 
48  AssumptionCache *AC = nullptr;
49  LegacyDivergenceAnalysis *DA = nullptr;
50 
51 public:
52  static char ID;
53 
54  AMDGPULateCodeGenPrepare() : FunctionPass(ID) {}
55 
56  StringRef getPassName() const override {
57  return "AMDGPU IR late optimizations";
58  }
59 
60  void getAnalysisUsage(AnalysisUsage &AU) const override {
63  AU.setPreservesAll();
64  }
65 
66  bool doInitialization(Module &M) override;
67  bool runOnFunction(Function &F) override;
68 
69  bool visitInstruction(Instruction &) { return false; }
70 
71  // Check if the specified value is at least DWORD aligned.
72  bool isDWORDAligned(const Value *V) const {
73  KnownBits Known = computeKnownBits(V, *DL, 0, AC);
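    // At least two known trailing zero bits means the pointer value is a
    // multiple of 4, i.e. DWORD aligned.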
    return Known.countMinTrailingZeros() >= 2;
  }

  bool canWidenScalarExtLoad(LoadInst &LI) const;
  bool visitLoadInst(LoadInst &LI);
};

} // end anonymous namespace

bool AMDGPULateCodeGenPrepare::doInitialization(Module &M) {
  Mod = &M;
  DL = &Mod->getDataLayout();
  return false;
}

bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  DA = &getAnalysis<LegacyDivergenceAnalysis>();

  bool Changed = false;
  for (auto &BB : F)
    for (auto BI = BB.begin(), BE = BB.end(); BI != BE; /*EMPTY*/) {
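      // Advance the iterator before visiting: visitLoadInst may erase the
      // current instruction via RecursivelyDeleteTriviallyDeadInstructions.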
      Instruction *I = &*BI++;
      Changed |= visit(*I);
    }

  return Changed;
}

bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
  unsigned AS = LI.getPointerAddressSpace();
  // Skip non-constant address spaces.
  if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
      AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT)
    return false;
  // Skip non-simple loads.
  if (!LI.isSimple())
    return false;
  auto *Ty = LI.getType();
  // Skip aggregate types.
  if (Ty->isAggregateType())
    return false;
  unsigned TySize = DL->getTypeStoreSize(Ty);
  // Only handle sub-DWORD loads.
  if (TySize >= 4)
    return false;
  // The load must be at least naturally aligned.
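  // (A naturally aligned sub-DWORD load never straddles a DWORD boundary, so
  // a single DWORD load can cover all of its bytes.)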
  if (LI.getAlign() < DL->getABITypeAlign(Ty))
    return false;
  // It should be uniform, i.e. a scalar load.
  return DA->isUniform(&LI);
}

bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
  if (!WidenLoads)
    return false;

  // Skip if the load is already aligned on DWORD at least, as it's handled in
  // SDAG.
  if (LI.getAlign() >= 4)
    return false;

  if (!canWidenScalarExtLoad(LI))
    return false;

  int64_t Offset = 0;
  auto *Base =
      GetPointerBaseWithConstantOffset(LI.getPointerOperand(), Offset, *DL);
  // If the base is not DWORD aligned, it's not safe to perform the following
  // transforms.
  if (!isDWORDAligned(Base))
    return false;

  int64_t Adjust = Offset & 0x3;
  if (Adjust == 0) {
    // With a zero adjust, the original alignment can simply be promoted to a
    // better one.
    LI.setAlignment(Align(4));
    return true;
  }

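  // Worked example (editorial): for an i16 load at Offset = 6 from a
  // DWORD-aligned Base, Adjust = 2; the code below loads an i32 at Base + 4
  // with align 4, shifts right by 16 bits, and truncates back to i16.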
  IRBuilder<> IRB(&LI);
  IRB.SetCurrentDebugLocation(LI.getDebugLoc());

  unsigned AS = LI.getPointerAddressSpace();
  unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
  auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);

  PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
  PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
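  // Offset - Adjust rounds the byte offset down to a DWORD boundary; since
  // Base is DWORD aligned, the rebased pointer is DWORD aligned as well.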
  auto *NewPtr = IRB.CreateBitCast(
      IRB.CreateConstGEP1_64(
          IRB.getInt8Ty(),
          IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
          Offset - Adjust),
      Int32PtrTy);
  LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
  NewLd->copyMetadata(LI);
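  // The original !range metadata described the narrow loaded value; it does
  // not hold for the widened 32-bit load, so drop it.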
  NewLd->setMetadata(LLVMContext::MD_range, nullptr);

  unsigned ShAmt = Adjust * 8;
  auto *NewVal = IRB.CreateBitCast(
      IRB.CreateTrunc(IRB.CreateLShr(NewLd, ShAmt), IntNTy), LI.getType());
  LI.replaceAllUsesWith(NewVal);
  RecursivelyDeleteTriviallyDeadInstructions(&LI);

  return true;
}

INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR late optimizations", false, false)

char AMDGPULateCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPULateCodeGenPreparePass() {
  return new AMDGPULateCodeGenPrepare();
}
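
// Editorial usage sketch (not part of the original file): the legacy pass
// manager registers this pass under the DEBUG_TYPE string above, so it can
// be exercised in isolation with opt (on LLVM 13, -enable-new-pm=0 may be
// needed, since the pass has no new-PM wrapper). Illustrative IR, assuming a
// DWORD-aligned kernarg pointer:
//
//   ; opt -S -mtriple=amdgcn-- -enable-new-pm=0 -amdgpu-late-codegenprepare
//   define amdgpu_kernel void @f(i16 addrspace(4)* align 4 %p,
//                                i16 addrspace(1)* %out) {
//     %gep = getelementptr i16, i16 addrspace(4)* %p, i64 3 ; byte offset 6
//     %v = load i16, i16 addrspace(4)* %gep, align 2
//     store i16 %v, i16 addrspace(1)* %out
//     ret void
//   }
//
// The load of %v should become a DWORD-aligned i32 load at %p + 4 followed
// by an lshr of 16 and a trunc back to i16.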