Go to the documentation of this file.
97 #include "llvm/IR/IntrinsicsAMDGPU.h"
108 #define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
110 using namespace llvm;
120 getFunctionToInstsMap(
User *U,
bool CollectKernelInsts);
126 class ReplaceLDSUseImpl {
138 FunctionToLDSToReplaceInst;
141 std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
143 std::vector<GlobalVariable *> LDSGlobals =
148 return shouldIgnorePointerReplacement(GV);
164 LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
165 return LDSToNonKernels[GV].
empty();
174 auto PointerEntry = LDSToPointer.
insert(std::make_pair(GV,
nullptr));
175 if (!PointerEntry.second)
176 return PointerEntry.first->second;
192 LDSToPointer[GV] = LDSPointer;
202 auto BasicBlockEntry = KernelToInitBB.
insert(std::make_pair(K,
nullptr));
203 if (!BasicBlockEntry.second)
204 return BasicBlockEntry.first->second;
211 Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
215 Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
220 KernelToInitBB[K] = NBB;
229 auto PointerEntry = KernelToLDSPointers.
insert(
231 if (!PointerEntry.second)
232 if (PointerEntry.first->second.contains(LDSPointer))
240 auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
246 KernelToLDSPointers[K].
insert(LDSPointer);
254 for (
auto *U : LDSUsers) {
259 auto FunctionToInsts =
260 AMDGPU::getFunctionToInstsMap(U,
false );
262 for (
const auto &FunctionToInst : FunctionToInsts) {
264 auto &Insts = FunctionToInst.second;
265 for (
auto *
I : Insts) {
278 auto *
CE = cast<ConstantExpr>(U);
284 for (
auto *II : UserInsts) {
285 auto *ReplaceInst = getReplacementInst(
F, GV, LDSPointer);
286 II->replaceUsesOfWith(GV, ReplaceInst);
299 auto LDSEntry = FunctionToLDSToReplaceInst.
insert(
301 if (!LDSEntry.second) {
302 auto ReplaceInstEntry =
303 LDSEntry.first->second.insert(std::make_pair(GV,
nullptr));
304 if (!ReplaceInstEntry.second)
305 return ReplaceInstEntry.first->second;
310 auto *EI = &(*(
F->getEntryBlock().getFirstInsertionPt()));
314 auto *V =
Builder.CreateBitCast(
316 Builder.getInt8Ty(), LDSMemBaseAddr,
322 FunctionToLDSToReplaceInst[
F][GV] = V;
328 ReplaceLDSUseImpl(
Module &M)
329 :
M(
M), Ctx(
M.getContext()),
DL(
M.getDataLayout()) {
338 bool replaceLDSUse();
357 for (
const auto &KernelToCallee : KernelToCallees) {
374 LDSPointer = createLDSPointer(GV);
377 initializeLDSPointer(K, GV, LDSPointer);
388 replaceLDSUseByPointer(GV, LDSPointer);
397 class CollectReachableCallees {
403 void collectAddressTakenFunctions() {
406 for (
const auto &GI : *ECNode) {
407 auto *CGN = GI.second;
408 auto *
F = CGN->getFunction();
411 AddressTakenFunctions.
insert(CGN);
433 while (!CGNStack.empty()) {
434 auto *CGN = CGNStack.pop_back_val();
436 if (!VisitedCGNodes.
insert(CGN).second)
441 if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
444 for (
const auto &GI : *CGN) {
445 auto *RCB = cast<CallBase>(GI.first.getValue());
446 auto *RCGN = GI.second;
448 if (
auto *DCallee = RCGN->getFunction()) {
449 ReachableCallees.
insert(DCallee);
450 }
else if (RCB->isIndirectCall()) {
451 auto *RCBFTy = RCB->getFunctionType();
452 for (
auto *ACGN : AddressTakenFunctions) {
453 auto *ACallee = ACGN->getFunction();
454 if (ACallee->getFunctionType() == RCBFTy) {
455 ReachableCallees.
insert(ACallee);
463 return ReachableCallees;
469 collectAddressTakenFunctions();
472 void collectReachableCallees(
479 KernelToCallees[K] = collectReachableCallees(K);
486 void collectReachableCallees(
489 CollectReachableCallees CRC{
M};
490 CRC.collectReachableCallees(KernelToCallees);
501 while (!UserStack.empty()) {
502 auto *U = UserStack.pop_back_val();
505 if (!VisitedUsers.
insert(U).second)
509 if (isa<GlobalValue>(U))
513 if (isa<Constant>(U)) {
520 Function *
F = cast<Instruction>(U)->getFunction();
529 getFunctionToInstsMap(
User *U,
bool CollectKernelInsts) {
534 UserStack.push_back(U);
536 while (!UserStack.empty()) {
539 if (!VisitedUsers.
insert(UU).second)
542 if (isa<GlobalValue>(UU))
545 if (isa<Constant>(UU)) {
550 auto *
I = cast<Instruction>(UU);
552 if (CollectKernelInsts) {
566 return FunctionToInsts;
573 bool ReplaceLDSUseImpl::replaceLDSUse() {
575 std::vector<GlobalVariable *> LDSGlobals =
576 collectLDSRequiringPointerReplace();
579 if (LDSGlobals.empty())
583 AMDGPU::collectReachableCallees(M, KernelToCallees);
585 if (KernelToCallees.empty()) {
595 bool Changed =
false;
596 for (
auto *GV : LDSGlobals)
597 Changed |= replaceLDSUse(GV);
602 class AMDGPUReplaceLDSUseWithPointer :
public ModulePass {
611 bool runOnModule(
Module &M)
override;
626 "Replace within non-kernel function use of LDS with pointer",
634 bool AMDGPUReplaceLDSUseWithPointer::runOnModule(
Module &M) {
635 ReplaceLDSUseImpl LDSUseReplacer{
M};
636 return LDSUseReplacer.replaceLDSUse();
640 return new AMDGPUReplaceLDSUseWithPointer();
645 ReplaceLDSUseImpl LDSUseReplacer{
M};
646 LDSUseReplacer.replaceLDSUse();
std::vector< GlobalVariable * > findVariablesToLower(Module &M, const Function *F)
Move/duplicate certain instructions close to their use
A set of analyses that are preserved following a run of a transformation pass.
This is an optimization pass for GlobalISel generic memory operations.
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
CallGraphNode * getExternalCallingNode() const
Returns the CallGraphNode which is used to represent undetermined calls into the callgraph.
A parsed version of the target data layout string, and methods for querying it.
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Replace within non-kernel function use of LDS with pointer
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
char & AMDGPUReplaceLDSUseWithPointerID
const BasicBlock & getEntryBlock() const
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
df_iterator< T > df_end(const T &G)
The basic data container for the call graph of a Module of IR.
INITIALIZE_PASS_BEGIN(AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE, "Replace within non-kernel function use of LDS with pointer", false, false) INITIALIZE_PASS_END(AMDGPUReplaceLDSUseWithPointer
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
void setUnnamedAddr(UnnamedAddr Val)
LLVM_NODISCARD T pop_back_val()
static IntegerType * getInt8Ty(LLVMContext &C)
LLVM Basic Block Representation.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
Represent the analysis usage information of a pass.
void convertConstantExprsToInstructions(Instruction *I, ConstantExpr *CE, SmallPtrSetImpl< Instruction * > *Insts=nullptr)
The given instruction I contains given constant expression CE as one of its operands,...
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
@ LOCAL_ADDRESS
Address space for local memory.
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
@ InternalLinkage
Rename collisions when linking (static functions).
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Target-Independent Code Generator Pass Configuration Options.
This is an important base class in LLVM.
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
This is an important class for using LLVM in a threaded context.
df_iterator< T > df_begin(const T &G)
print Print MemDeps of function
@ CE
Windows NT (Windows on ARM)
A Module instance is used to store all the information related to an LLVM module.
Class for arbitrary precision integers.
SmallVector< MachineOperand, 4 > Cond
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
StringRef getName() const
Return a constant reference to the value's name.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
LLVM_NODISCARD bool empty() const
iterator_range< df_iterator< T > > depth_first(const T &G)
Align getAlign(DataLayout const &DL, const GlobalVariable *GV)
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
ModulePass * createAMDGPUReplaceLDSUseWithPointerPass()
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
LLVM_NODISCARD bool empty() const
const BasicBlock * getParent() const
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
static IntegerType * getInt16Ty(LLVMContext &C)
PointerType * getType() const
Global values are always pointers.
A container for analyses that lazily runs them and caches their results.
Type * getValueType() const
AnalysisUsage & addRequired()
Instruction * SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, MDNode *BranchWeights, DominatorTree *DT, LoopInfo *LI=nullptr, BasicBlock *ThenBlock=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
void setAlignment(MaybeAlign Align)
bool isKernelCC(const Function *Func)
LLVM Value Representation.
iterator_range< user_iterator > users()
Add support for conditional and other related patterns Instead of
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.