97#include "llvm/IR/IntrinsicsAMDGPU.h"
108#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
120getFunctionToInstsMap(
User *U,
bool CollectKernelInsts);
126class ReplaceLDSUseImpl {
138 FunctionToLDSToReplaceInst;
141 std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
143 std::vector<GlobalVariable *> LDSGlobals =
148 return shouldIgnorePointerReplacement(GV);
164 LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
165 return LDSToNonKernels[GV].
empty();
174 auto PointerEntry = LDSToPointer.
insert(std::pair(GV,
nullptr));
175 if (!PointerEntry.second)
176 return PointerEntry.first->second;
185 GV->
getName() +
Twine(
".ptr"),
nullptr, GlobalVariable::NotThreadLocal,
192 LDSToPointer[GV] = LDSPointer;
202 auto BasicBlockEntry = KernelToInitBB.
insert(std::pair(K,
nullptr));
203 if (!BasicBlockEntry.second)
204 return BasicBlockEntry.first->second;
207 auto *EI = &(*(
K->getEntryBlock().getFirstInsertionPt()));
211 Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
215 Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
220 KernelToInitBB[
K] = NBB;
229 auto PointerEntry = KernelToLDSPointers.
insert(
231 if (!PointerEntry.second)
232 if (PointerEntry.first->second.contains(LDSPointer))
240 auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
246 KernelToLDSPointers[
K].
insert(LDSPointer);
254 for (
auto *U : LDSUsers) {
259 auto FunctionToInsts =
260 AMDGPU::getFunctionToInstsMap(U,
false );
262 for (
const auto &FunctionToInst : FunctionToInsts) {
264 auto &Insts = FunctionToInst.second;
265 for (
auto *
I : Insts) {
278 auto *
CE = cast<ConstantExpr>(U);
284 for (
auto *II : UserInsts) {
285 auto *ReplaceInst = getReplacementInst(
F, GV, LDSPointer);
286 II->replaceUsesOfWith(GV, ReplaceInst);
299 auto LDSEntry = FunctionToLDSToReplaceInst.
insert(
301 if (!LDSEntry.second) {
302 auto ReplaceInstEntry =
303 LDSEntry.first->second.insert(std::pair(GV,
nullptr));
304 if (!ReplaceInstEntry.second)
305 return ReplaceInstEntry.first->second;
310 auto *EI = &(*(
F->getEntryBlock().getFirstInsertionPt()));
316 Builder.getInt8Ty(), LDSMemBaseAddr,
322 FunctionToLDSToReplaceInst[
F][GV] =
V;
328 ReplaceLDSUseImpl(
Module &M)
329 :
M(
M), Ctx(
M.getContext()),
DL(
M.getDataLayout()) {
338 bool replaceLDSUse();
357 for (
const auto &KernelToCallee : KernelToCallees) {
374 LDSPointer = createLDSPointer(GV);
377 initializeLDSPointer(K, GV, LDSPointer);
388 replaceLDSUseByPointer(GV, LDSPointer);
397class CollectReachableCallees {
403 void collectAddressTakenFunctions() {
406 for (
const auto &GI : *ECNode) {
407 auto *CGN = GI.second;
408 auto *
F = CGN->getFunction();
411 AddressTakenFunctions.
insert(CGN);
433 while (!CGNStack.empty()) {
434 auto *CGN = CGNStack.pop_back_val();
436 if (!VisitedCGNodes.
insert(CGN).second)
441 if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
444 for (
const auto &GI : *CGN) {
445 auto *RCB = cast<CallBase>(*GI.first);
446 auto *RCGN = GI.second;
448 if (
auto *DCallee = RCGN->getFunction()) {
449 ReachableCallees.
insert(DCallee);
450 }
else if (RCB->isIndirectCall()) {
451 auto *RCBFTy = RCB->getFunctionType();
452 for (
auto *ACGN : AddressTakenFunctions) {
453 auto *ACallee = ACGN->getFunction();
454 if (ACallee->getFunctionType() == RCBFTy) {
455 ReachableCallees.
insert(ACallee);
463 return ReachableCallees;
469 collectAddressTakenFunctions();
472 void collectReachableCallees(
479 KernelToCallees[
K] = collectReachableCallees(K);
486void collectReachableCallees(
489 CollectReachableCallees CRC{
M};
490 CRC.collectReachableCallees(KernelToCallees);
501 while (!UserStack.empty()) {
502 auto *
U = UserStack.pop_back_val();
505 if (!VisitedUsers.
insert(U).second)
509 if (isa<GlobalValue>(U))
513 if (isa<Constant>(U)) {
520 Function *
F = cast<Instruction>(U)->getFunction();
529getFunctionToInstsMap(
User *U,
bool CollectKernelInsts) {
536 while (!UserStack.
empty()) {
539 if (!VisitedUsers.
insert(UU).second)
542 if (isa<GlobalValue>(UU))
545 if (isa<Constant>(UU)) {
550 auto *
I = cast<Instruction>(UU);
552 if (CollectKernelInsts) {
566 return FunctionToInsts;
573bool ReplaceLDSUseImpl::replaceLDSUse() {
575 std::vector<GlobalVariable *> LDSGlobals =
576 collectLDSRequiringPointerReplace();
579 if (LDSGlobals.empty())
583 AMDGPU::collectReachableCallees(M, KernelToCallees);
585 if (KernelToCallees.empty()) {
595 bool Changed =
false;
596 for (
auto *GV : LDSGlobals)
597 Changed |= replaceLDSUse(GV);
602class AMDGPUReplaceLDSUseWithPointer :
public ModulePass {
620char AMDGPUReplaceLDSUseWithPointer::ID = 0;
622 AMDGPUReplaceLDSUseWithPointer::ID;
626 "Replace within non-kernel function use of LDS with pointer",
634bool AMDGPUReplaceLDSUseWithPointer::runOnModule(
Module &M) {
635 ReplaceLDSUseImpl LDSUseReplacer{M};
636 return LDSUseReplacer.replaceLDSUse();
640 return new AMDGPUReplaceLDSUseWithPointer();
645 ReplaceLDSUseImpl LDSUseReplacer{M};
646 LDSUseReplacer.replaceLDSUse();
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Replace within non-kernel function use of LDS with pointer
SmallVector< MachineOperand, 4 > Cond
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
AMD GCN specific subclass of TargetSubtarget.
Move duplicate certain instructions close to their use
print Print MemDeps of function
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
This file defines generic set operations that may be used on sets of different types,...
Target-Independent Code Generator Pass Configuration Options pass.
Class for arbitrary precision integers.
A container for analyses that lazily runs them and caches their results.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM Basic Block Representation.
The basic data container for the call graph of a Module of IR.
CallGraphNode * getExternalCallingNode() const
Returns the CallGraphNode which is used to represent undetermined calls into the callgraph.
This is an important base class in LLVM.
static Constant * getIntegerValue(Type *Ty, const APInt &V)
Return the value for an integer or pointer constant, or a vector thereof, with the given scalar value...
A parsed version of the target data layout string, and methods for querying it.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalObject.
void setUnnamedAddr(UnnamedAddr Val)
PointerType * getType() const
Global values are always pointers.
@ InternalLinkage
Rename collisions when linking (static functions).
Type * getValueType() const
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
const BasicBlock * getParent() const
This is an important class for using LLVM in a threaded context.
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
virtual bool runOnModule(Module &M)=0
runOnModule - Virtual method overridden by subclasses to process the module being operated on.
A Module instance is used to store all the information related to an LLVM module.
static PassRegistry * getPassRegistry()
getPassRegistry - Access the global registry object, which is automatically initialized at applicatio...
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overridden by passes that need analysis information to do t...
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Target-Independent Code Generator Pass Configuration Options.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static IntegerType * getInt16Ty(LLVMContext &C)
static IntegerType * getInt8Ty(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
LLVM Value Representation.
iterator_range< user_iterator > users()
StringRef getName() const
Return a constant reference to the value's name.
@ LOCAL_ADDRESS
Address space for local memory.
std::vector< GlobalVariable * > findLDSVariablesToLower(Module &M, const Function *F)
Align getAlign(DataLayout const &DL, const GlobalVariable *GV)
bool isKernelCC(const Function *Func)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ CE
Windows NT (Windows on ARM)
This is an optimization pass for GlobalISel generic memory operations.
void set_intersect(S1Ty &S1, const S2Ty &S2)
set_intersect(A, B) - Compute A := A ^ B. Identical to set_intersection, except that it works on set<>...
void append_range(Container &C, Range &&R)
Wrapper function to append a range to a container.
df_iterator< T > df_begin(const T &G)
char & AMDGPUReplaceLDSUseWithPointerID
void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &)
ModulePass * createAMDGPUReplaceLDSUseWithPointerPass()
void convertConstantExprsToInstructions(Instruction *I, ConstantExpr *CE, SmallPtrSetImpl< Instruction * > *Insts=nullptr)
The given instruction I contains given constant expression CE as one of its operands,...
void erase_if(Container &C, UnaryPredicate P)
Provide a container algorithm similar to C++ Library Fundamentals v2's erase_if which is equivalent t...
df_iterator< T > df_end(const T &G)
iterator_range< df_iterator< T > > depth_first(const T &G)
Instruction * SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore, bool Unreachable, MDNode *BranchWeights, DominatorTree *DT, LoopInfo *LI=nullptr, BasicBlock *ThenBlock=nullptr)
Split the containing block at the specified instruction - everything before SplitBefore stays in the ...
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)