40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
48#define DEBUG_TYPE "amdgpu-promote-alloca"
55 DisablePromoteAllocaToVector(
"disable-promote-alloca-to-vector",
56 cl::desc(
"Disable promote alloca to vector"),
60 DisablePromoteAllocaToLDS(
"disable-promote-alloca-to-lds",
61 cl::desc(
"Disable promote alloca to LDS"),
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc(
"Maximum byte size to consider promote alloca to vector"),
70 "amdgpu-promote-alloca-to-vector-max-regs",
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc(
"Ratio of VGPRs to budget for promoting alloca to vectors"),
83 LoopUserWeight(
"promote-alloca-vector-loop-user-weight",
84 cl::desc(
"The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
90struct GEPToVectorIndex {
91 Value *VarIndex =
nullptr;
97struct MemTransferInfo {
103struct AllocaAnalysis {
108 bool HaveSelectOrPHI =
false;
121 explicit AllocaAnalysis(
AllocaInst *Alloca) : Alloca(Alloca) {}
125class AMDGPUPromoteAllocaImpl {
136 unsigned VGPRBudgetRatio;
137 unsigned MaxVectorRegs;
139 bool IsAMDGCN =
false;
140 bool IsAMDHSA =
false;
142 std::pair<Value *, Value *> getLocalSizeYZ(
IRBuilder<> &Builder);
145 bool collectAllocaUses(AllocaAnalysis &
AA)
const;
151 bool binaryOpIsDerivedFromSameAlloca(
Value *Alloca,
Value *Val,
156 bool hasSufficientLocalMem(
const Function &
F);
159 void analyzePromoteToVector(AllocaAnalysis &
AA)
const;
160 void promoteAllocaToVector(AllocaAnalysis &
AA);
161 void analyzePromoteToLDS(AllocaAnalysis &
AA)
const;
162 bool tryPromoteAllocaToLDS(AllocaAnalysis &
AA,
bool SufficientLDS);
164 void scoreAlloca(AllocaAnalysis &
AA)
const;
166 void setFunctionLimits(
const Function &
F);
172 IsAMDGCN = TT.isAMDGCN();
176 bool run(
Function &
F,
bool PromoteToLDS);
189 if (
auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
190 return AMDGPUPromoteAllocaImpl(
192 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
197 StringRef getPassName()
const override {
return "AMDGPU Promote Alloca"; }
206static unsigned getMaxVGPRs(
unsigned LDSBytes,
const TargetMachine &TM,
216 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
217 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
219 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
220 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(
F), LDSBytes,
F).first,
221 DynamicVGPRBlockSize);
226 if (!
F.hasFnAttribute(Attribute::AlwaysInline) &&
228 MaxVGPRs = std::min(MaxVGPRs, 32u);
234char AMDGPUPromoteAlloca::ID = 0;
237 "AMDGPU promote alloca to vector or LDS",
false,
false)
250 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(
F,
true);
262 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(
F,
false);
272 return new AMDGPUPromoteAlloca();
275bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &
AA)
const {
278 <<
" " << *Inst <<
"\n");
283 while (!WorkList.empty()) {
284 auto *Cur = WorkList.pop_back_val();
285 if (
find(
AA.Pointers, Cur) !=
AA.Pointers.end())
287 AA.Pointers.insert(Cur);
288 for (
auto &U : Cur->uses()) {
292 return RejectUser(Inst,
"pointer escapes via store");
295 AA.Uses.push_back(&U);
298 WorkList.push_back(Inst);
302 if (!binaryOpIsDerivedFromSameAlloca(
AA.Alloca, Cur,
SI, 1, 2))
303 return RejectUser(Inst,
"select from mixed objects");
304 WorkList.push_back(Inst);
305 AA.HaveSelectOrPHI =
true;
311 switch (
Phi->getNumIncomingValues()) {
315 if (!binaryOpIsDerivedFromSameAlloca(
AA.Alloca, Cur, Phi, 0, 1))
316 return RejectUser(Inst,
"phi from mixed objects");
319 return RejectUser(Inst,
"phi with too many operands");
322 WorkList.push_back(Inst);
323 AA.HaveSelectOrPHI =
true;
330void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &
AA)
const {
334 for (
auto *U :
AA.Uses) {
340 1 + (LoopUserWeight * LI.getLoopDepth(Inst->
getParent()));
341 LLVM_DEBUG(
dbgs() <<
" [+" << UserScore <<
"]:\t" << *Inst <<
"\n");
348void AMDGPUPromoteAllocaImpl::setFunctionLimits(
const Function &
F) {
352 const int R600MaxVectorRegs = 16;
353 MaxVectorRegs =
F.getFnAttributeAsParsedInteger(
354 "amdgpu-promote-alloca-to-vector-max-regs",
355 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
356 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
357 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
358 VGPRBudgetRatio =
F.getFnAttributeAsParsedInteger(
359 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
360 PromoteAllocaToVectorVGPRRatio);
361 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
362 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
365bool AMDGPUPromoteAllocaImpl::run(
Function &
F,
bool PromoteToLDS) {
367 DL = &
Mod->getDataLayout();
370 if (!
ST.isPromoteAllocaEnabled())
373 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(
F);
374 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM,
F);
375 setFunctionLimits(
F);
377 unsigned VectorizationBudget =
378 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
382 std::vector<AllocaAnalysis> Allocas;
387 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
392 AllocaAnalysis
AA{AI};
393 if (collectAllocaUses(
AA)) {
394 analyzePromoteToVector(
AA);
396 analyzePromoteToLDS(
AA);
397 if (
AA.Vector.Ty ||
AA.LDS.Enable) {
399 Allocas.push_back(std::move(
AA));
406 [](
const auto &
A,
const auto &
B) {
return A.Score >
B.Score; });
410 dbgs() <<
"Sorted Worklist:\n";
411 for (
const auto &
AA : Allocas)
412 dbgs() <<
" " << *
AA.Alloca <<
"\n";
417 for (AllocaAnalysis &
AA : Allocas) {
419 const unsigned AllocaCost =
420 DL->getTypeSizeInBits(
AA.Alloca->getAllocatedType());
422 if (AllocaCost <= VectorizationBudget) {
423 promoteAllocaToVector(
AA);
425 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
427 VectorizationBudget -= AllocaCost;
429 << VectorizationBudget <<
"\n");
433 << AllocaCost <<
", budget:" << VectorizationBudget
434 <<
"): " << *
AA.Alloca <<
"\n");
438 if (
AA.LDS.Enable && tryPromoteAllocaToLDS(
AA, SufficientLDS))
462 return I->getOperand(0) == AI &&
470 if (Ptr ==
AA.Alloca)
471 return B.getInt32(0);
474 auto I =
AA.Vector.GEPVectorIdx.find(
GEP);
475 assert(
I !=
AA.Vector.GEPVectorIdx.end() &&
"Must have entry for GEP!");
477 if (!
I->second.Full) {
478 Value *Result =
nullptr;
479 B.SetInsertPoint(
GEP);
481 if (
I->second.VarIndex) {
482 Result =
I->second.VarIndex;
483 Result =
B.CreateSExtOrTrunc(Result,
B.getInt32Ty());
485 if (
I->second.VarMul)
486 Result =
B.CreateMul(Result,
I->second.VarMul);
489 if (
I->second.ConstIndex) {
491 Result =
B.CreateAdd(Result,
I->second.ConstIndex);
493 Result =
I->second.ConstIndex;
497 Result =
B.getInt32(0);
499 I->second.Full = Result;
502 return I->second.Full;
505static std::optional<GEPToVectorIndex>
511 unsigned BW =
DL.getIndexTypeSizeInBits(
GEP->getType());
513 APInt ConstOffset(BW, 0);
534 if (!CurGEP->collectOffset(
DL, BW, VarOffsets, ConstOffset))
538 CurPtr = CurGEP->getPointerOperand();
541 assert(CurPtr == Alloca &&
"GEP not based on alloca");
543 int64_t VecElemSize =
DL.getTypeAllocSize(VecElemTy);
544 if (VarOffsets.
size() > 1)
553 GEPToVectorIndex Result;
555 if (!ConstOffset.
isZero())
556 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.
sextOrTrunc(BW));
558 if (VarOffsets.
empty())
561 const auto &VarOffset = VarOffsets.
front();
564 if (Rem != 0 || OffsetQuot.
isZero())
567 Result.VarIndex = VarOffset.first;
572 if (!OffsetQuot.
isOne())
573 Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.
sextOrTrunc(BW));
593 unsigned VecStoreSize,
594 unsigned ElementSize,
600 Builder.SetInsertPoint(Inst);
602 const auto CreateTempPtrIntCast = [&Builder,
DL](
Value *Val,
604 assert(
DL.getTypeStoreSize(Val->getType()) ==
DL.getTypeStoreSize(PtrTy));
605 const unsigned Size =
DL.getTypeStoreSizeInBits(PtrTy);
606 if (!PtrTy->isVectorTy())
607 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(
Size));
611 assert((
Size % NumPtrElts == 0) &&
"Vector size not divisble");
613 return Builder.CreateBitOrPointerCast(
617 Type *VecEltTy =
AA.Vector.Ty->getElementType();
620 case Instruction::Load: {
621 Value *CurVal = GetCurVal();
627 TypeSize AccessSize =
DL.getTypeStoreSize(AccessTy);
629 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
631 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
633 CurVal = CreateTempPtrIntCast(CurVal, CurVal->
getType());
634 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
643 const unsigned NumLoadedElts = AccessSize /
DL.getTypeStoreSize(VecEltTy);
645 assert(
DL.getTypeStoreSize(SubVecTy) ==
DL.getTypeStoreSize(AccessTy));
648 for (
unsigned K = 0; K < NumLoadedElts; ++K) {
650 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
651 SubVec = Builder.CreateInsertElement(
652 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
656 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
657 else if (SubVecTy->isPtrOrPtrVectorTy())
658 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
660 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
666 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
667 if (AccessTy != VecEltTy)
668 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
673 case Instruction::Store: {
680 Value *Val =
SI->getValueOperand();
684 TypeSize AccessSize =
DL.getTypeStoreSize(AccessTy);
686 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
688 Val = CreateTempPtrIntCast(Val, AccessTy);
689 else if (
AA.Vector.Ty->isPtrOrPtrVectorTy())
690 Val = CreateTempPtrIntCast(Val,
AA.Vector.Ty);
691 return Builder.CreateBitOrPointerCast(Val,
AA.Vector.Ty);
698 const unsigned NumWrittenElts =
699 AccessSize /
DL.getTypeStoreSize(VecEltTy);
700 const unsigned NumVecElts =
AA.Vector.Ty->getNumElements();
702 assert(
DL.getTypeStoreSize(SubVecTy) ==
DL.getTypeStoreSize(AccessTy));
704 if (SubVecTy->isPtrOrPtrVectorTy())
705 Val = CreateTempPtrIntCast(Val, SubVecTy);
707 Val = CreateTempPtrIntCast(Val, AccessTy);
709 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
711 Value *CurVec = GetCurVal();
712 for (
unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
715 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
716 CurVec = Builder.CreateInsertElement(
717 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
722 if (Val->
getType() != VecEltTy)
723 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
724 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
726 case Instruction::Call: {
730 unsigned NumCopied =
Length->getZExtValue() / ElementSize;
731 MemTransferInfo *TI = &
AA.Vector.TransferInfo[MTI];
736 for (
unsigned Idx = 0; Idx <
AA.Vector.Ty->getNumElements(); ++Idx) {
737 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
746 return Builder.CreateShuffleVector(GetCurVal(), Mask);
752 Value *Elt = MSI->getOperand(1);
753 const unsigned BytesPerElt =
DL.getTypeStoreSize(VecEltTy);
754 if (BytesPerElt > 1) {
755 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
761 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
762 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
764 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
767 return Builder.CreateVectorSplat(
AA.Vector.Ty->getElementCount(), Elt);
771 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
772 Intr->replaceAllUsesWith(
773 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
774 DL.getTypeAllocSize(
AA.Vector.Ty)));
803 TypeSize AccTS =
DL.getTypeStoreSize(AccessTy);
807 if (AccTS * 8 !=
DL.getTypeSizeInBits(AccessTy))
819template <
typename InstContainer>
831 auto &BlockUses = UsesByBlock[BB];
834 if (BlockUses.empty())
838 if (BlockUses.size() == 1) {
845 if (!BlockUses.contains(&Inst))
866AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(
Type *AllocaTy)
const {
867 if (DisablePromoteAllocaToVector) {
874 uint64_t NumElems = 1;
877 NumElems *= ArrayTy->getNumElements();
878 ElemTy = ArrayTy->getElementType();
884 NumElems *= InnerVectorTy->getNumElements();
885 ElemTy = InnerVectorTy->getElementType();
889 unsigned ElementSize =
DL->getTypeSizeInBits(ElemTy) / 8;
890 if (ElementSize > 0) {
891 unsigned AllocaSize =
DL->getTypeStoreSize(AllocaTy);
896 if (NumElems * ElementSize != AllocaSize)
897 NumElems = AllocaSize / ElementSize;
898 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
908 const unsigned MaxElements =
909 (MaxVectorRegs * 32) /
DL->getTypeSizeInBits(VectorTy->getElementType());
911 if (VectorTy->getNumElements() > MaxElements ||
912 VectorTy->getNumElements() < 2) {
914 <<
" has an unsupported number of elements\n");
918 Type *VecEltTy = VectorTy->getElementType();
919 unsigned ElementSizeInBits =
DL->getTypeSizeInBits(VecEltTy);
920 if (ElementSizeInBits !=
DL->getTypeAllocSizeInBits(VecEltTy)) {
921 LLVM_DEBUG(
dbgs() <<
" Cannot convert to vector if the allocation size "
922 "does not match the type's size\n");
929void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &
AA)
const {
930 if (
AA.HaveSelectOrPHI) {
931 LLVM_DEBUG(
dbgs() <<
" Cannot convert to vector due to select or phi\n");
935 Type *AllocaTy =
AA.Alloca->getAllocatedType();
936 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
941 LLVM_DEBUG(
dbgs() <<
" Cannot promote alloca to vector: " << Msg <<
"\n"
942 <<
" " << *Inst <<
"\n");
943 AA.Vector.Ty =
nullptr;
946 Type *VecEltTy =
AA.Vector.Ty->getElementType();
947 unsigned ElementSize =
DL->getTypeSizeInBits(VecEltTy) / 8;
949 for (
auto *U :
AA.Uses) {
958 return RejectUser(Inst,
"unsupported load/store as aggregate");
965 return RejectUser(Inst,
"not a simple load or store");
967 Ptr = Ptr->stripPointerCasts();
970 if (Ptr ==
AA.Alloca &&
971 DL->getTypeStoreSize(
AA.Alloca->getAllocatedType()) ==
972 DL->getTypeStoreSize(AccessTy)) {
973 AA.Vector.Worklist.push_back(Inst);
978 return RejectUser(Inst,
"not a supported access type");
980 AA.Vector.Worklist.push_back(Inst);
989 return RejectUser(Inst,
"cannot compute vector index for GEP");
991 AA.Vector.GEPVectorIdx[
GEP] = std::move(
Index.value());
992 AA.Vector.UsersToRemove.push_back(Inst);
998 AA.Vector.Worklist.push_back(Inst);
1003 if (TransferInst->isVolatile())
1004 return RejectUser(Inst,
"mem transfer inst is volatile");
1007 if (!Len || (
Len->getZExtValue() % ElementSize))
1008 return RejectUser(Inst,
"mem transfer inst length is non-constant or "
1009 "not a multiple of the vector element size");
1012 if (Ptr ==
AA.Alloca)
1013 return ConstantInt::get(Ptr->getContext(),
APInt(32, 0));
1016 const auto &GEPI =
AA.Vector.GEPVectorIdx.find(
GEP)->second;
1019 if (GEPI.ConstIndex)
1020 return GEPI.ConstIndex;
1021 return ConstantInt::get(Ptr->getContext(),
APInt(32, 0));
1024 MemTransferInfo *TI =
1025 &
AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1026 unsigned OpNum =
U->getOperandNo();
1028 Value *Dest = TransferInst->getDest();
1031 return RejectUser(Inst,
"could not calculate constant dest index");
1032 TI->DestIndex =
Index;
1035 Value *Src = TransferInst->getSource();
1038 return RejectUser(Inst,
"could not calculate constant src index");
1039 TI->SrcIndex =
Index;
1045 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1046 AA.Vector.Worklist.push_back(Inst);
1054 return RejectUser(Inst,
"assume-like intrinsic cannot have any users");
1055 AA.Vector.UsersToRemove.push_back(Inst);
1060 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1062 AA.Vector.UsersToRemove.push_back(Inst);
1066 return RejectUser(Inst,
"unhandled alloca user");
1070 for (
const auto &Entry :
AA.Vector.TransferInfo) {
1071 const MemTransferInfo &TI =
Entry.second;
1072 if (!TI.SrcIndex || !TI.DestIndex)
1073 return RejectUser(
Entry.first,
1074 "mem transfer inst between different objects");
1075 AA.Vector.Worklist.push_back(
Entry.first);
1079void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &
AA) {
1081 LLVM_DEBUG(
dbgs() <<
" type conversion: " << *
AA.Alloca->getAllocatedType()
1082 <<
" -> " << *
AA.Vector.Ty <<
'\n');
1083 const unsigned VecStoreSize =
DL->getTypeStoreSize(
AA.Vector.Ty);
1085 Type *VecEltTy =
AA.Vector.Ty->getElementType();
1086 const unsigned ElementSize =
DL->getTypeSizeInBits(VecEltTy) / 8;
1108 BasicBlock *BB = I->getParent();
1109 auto GetCurVal = [&]() -> Value * {
1110 if (Value *CurVal = Updater.FindValueForBlock(BB))
1113 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1114 return Placeholders.back();
1118 IRBuilder<> Builder(I);
1119 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1120 PoisonValue::get(AA.Vector.Ty),
"promotealloca.placeholder"));
1121 Placeholders.push_back(Placeholder);
1122 return Placeholders.back();
1126 ElementSize, GetCurVal);
1133 Placeholder->replaceAllUsesWith(
1135 Placeholder->eraseFromParent();
1141 I->eraseFromParent();
1146 I->dropDroppableUses();
1148 I->eraseFromParent();
1153 AA.Alloca->eraseFromParent();
1156std::pair<Value *, Value *>
1157AMDGPUPromoteAllocaImpl::getLocalSizeYZ(
IRBuilder<> &Builder) {
1167 ST.makeLIDRangeMetadata(LocalSizeY);
1168 ST.makeLIDRangeMetadata(LocalSizeZ);
1170 return std::pair(LocalSizeY, LocalSizeZ);
1211 F.removeFnAttr(
"amdgpu-no-dispatch-ptr");
1228 LoadXY->
setMetadata(LLVMContext::MD_invariant_load, MD);
1229 LoadZU->
setMetadata(LLVMContext::MD_invariant_load, MD);
1230 ST.makeLIDRangeMetadata(LoadZU);
1235 return std::pair(
Y, LoadZU);
1247 IntrID = IsAMDGCN ? (
Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1249 AttrName =
"amdgpu-no-workitem-id-x";
1252 IntrID = IsAMDGCN ? (
Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1254 AttrName =
"amdgpu-no-workitem-id-y";
1258 IntrID = IsAMDGCN ? (
Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1260 AttrName =
"amdgpu-no-workitem-id-z";
1268 ST.makeLIDRangeMetadata(CI);
1269 F->removeFnAttr(AttrName);
1279 switch (
II->getIntrinsicID()) {
1280 case Intrinsic::memcpy:
1281 case Intrinsic::memmove:
1282 case Intrinsic::memset:
1283 case Intrinsic::lifetime_start:
1284 case Intrinsic::lifetime_end:
1285 case Intrinsic::invariant_start:
1286 case Intrinsic::invariant_end:
1287 case Intrinsic::launder_invariant_group:
1288 case Intrinsic::strip_invariant_group:
1289 case Intrinsic::objectsize:
1296bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1318 if (OtherObj != BaseAlloca) {
1320 dbgs() <<
"Found a binary instruction with another alloca object\n");
1327void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &
AA)
const {
1328 if (DisablePromoteAllocaToLDS) {
1336 const Function &ContainingFunction = *
AA.Alloca->getFunction();
1346 <<
" promote alloca to LDS not supported with calling convention.\n");
1357 if (
find(
AA.LDS.Worklist,
User) ==
AA.LDS.Worklist.end())
1358 AA.LDS.Worklist.push_back(
User);
1363 if (UseInst->
getOpcode() == Instruction::PtrToInt)
1367 if (LI->isVolatile())
1373 if (
SI->isVolatile())
1379 if (RMW->isVolatile())
1385 if (CAS->isVolatile())
1393 if (!binaryOpIsDerivedFromSameAlloca(
AA.Alloca,
Use->get(), ICmp, 0, 1))
1397 if (
find(
AA.LDS.Worklist,
User) ==
AA.LDS.Worklist.end())
1398 AA.LDS.Worklist.push_back(ICmp);
1405 if (!
GEP->isInBounds())
1418 if (
find(
AA.LDS.Worklist,
User) ==
AA.LDS.Worklist.end())
1419 AA.LDS.Worklist.push_back(
User);
1422 AA.LDS.Enable =
true;
1425bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(
const Function &
F) {
1433 for (
Type *ParamTy : FTy->params()) {
1437 LLVM_DEBUG(
dbgs() <<
"Function has local memory argument. Promoting to "
1438 "local memory disabled.\n");
1443 LocalMemLimit =
ST.getAddressableLocalMemorySize();
1444 if (LocalMemLimit == 0)
1454 if (
Use->getFunction() == &
F)
1458 if (VisitedConstants.
insert(
C).second)
1470 if (visitUsers(&GV, &GV)) {
1478 while (!
Stack.empty()) {
1480 if (visitUsers(&GV,
C)) {
1501 LLVM_DEBUG(
dbgs() <<
"Function has a reference to externally allocated "
1502 "local memory. Promoting to local memory "
1517 CurrentLocalMemUsage = 0;
1523 for (
auto Alloc : AllocatedSizes) {
1524 CurrentLocalMemUsage =
alignTo(CurrentLocalMemUsage,
Alloc.second);
1525 CurrentLocalMemUsage +=
Alloc.first;
1528 unsigned MaxOccupancy =
1529 ST.getWavesPerEU(
ST.getFlatWorkGroupSizes(
F), CurrentLocalMemUsage,
F)
1533 unsigned MaxSizeWithWaveCount =
1534 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy,
F);
1537 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1540 LocalMemLimit = MaxSizeWithWaveCount;
1543 <<
" bytes of LDS\n"
1544 <<
" Rounding size to " << MaxSizeWithWaveCount
1545 <<
" with a maximum occupancy of " << MaxOccupancy <<
'\n'
1546 <<
" and " << (LocalMemLimit - CurrentLocalMemUsage)
1547 <<
" available for promotion\n");
1553bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &
AA,
1554 bool SufficientLDS) {
1564 const Function &ContainingFunction = *
AA.Alloca->getParent()->getParent();
1566 unsigned WorkGroupSize =
ST.getFlatWorkGroupSizes(ContainingFunction).second;
1568 Align Alignment =
DL.getValueOrABITypeAlignment(
1569 AA.Alloca->getAlign(),
AA.Alloca->getAllocatedType());
1577 uint32_t NewSize =
alignTo(CurrentLocalMemUsage, Alignment);
1578 uint32_t AllocSize =
1579 WorkGroupSize *
DL.getTypeAllocSize(
AA.Alloca->getAllocatedType());
1580 NewSize += AllocSize;
1582 if (NewSize > LocalMemLimit) {
1584 <<
" bytes of local memory not available to promote\n");
1588 CurrentLocalMemUsage = NewSize;
1597 Twine(
F->getName()) +
Twine(
'.') +
AA.Alloca->getName(),
nullptr,
1602 Value *TCntY, *TCntZ;
1604 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1605 Value *TIdX = getWorkitemID(Builder, 0);
1606 Value *TIdY = getWorkitemID(Builder, 1);
1607 Value *TIdZ = getWorkitemID(Builder, 2);
1619 AA.Alloca->mutateType(
Offset->getType());
1620 AA.Alloca->replaceAllUsesWith(
Offset);
1621 AA.Alloca->eraseFromParent();
1627 for (
Value *V :
AA.LDS.Worklist) {
1649 assert(
V->getType()->isPtrOrPtrVectorTy());
1651 Type *NewTy =
V->getType()->getWithNewType(NewPtrTy);
1652 V->mutateType(NewTy);
1662 for (
unsigned I = 0,
E =
Phi->getNumIncomingValues();
I !=
E; ++
I) {
1664 Phi->getIncomingValue(
I)))
1675 case Intrinsic::lifetime_start:
1676 case Intrinsic::lifetime_end:
1680 case Intrinsic::memcpy:
1681 case Intrinsic::memmove:
1687 case Intrinsic::memset: {
1695 case Intrinsic::invariant_start:
1696 case Intrinsic::invariant_end:
1697 case Intrinsic::launder_invariant_group:
1698 case Intrinsic::strip_invariant_group: {
1716 case Intrinsic::objectsize: {
1720 Intrinsic::objectsize,
1736 assert(
ID == Intrinsic::memcpy ||
ID == Intrinsic::memmove);
1740 ID,
MI->getRawDest(),
MI->getDestAlign(),
MI->getRawSource(),
1741 MI->getSourceAlign(),
MI->getLength(),
MI->isVolatile());
1743 for (
unsigned I = 0;
I != 2; ++
I) {
1745 B->addDereferenceableParamAttr(
I, Bytes);
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
AMD GCN specific subclass of TargetSubtarget.
uint64_t IntrinsicInst * II
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Remove Loads Into Fake Uses
static unsigned getNumElements(Type *Ty)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Target-Independent Code Generator Pass Configuration Options pass.
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Class for arbitrary precision integers.
static LLVM_ABI void sdivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder)
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
bool isOne() const
Determine if this is a value of 1.
an instruction to allocate memory on the stack
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
An instruction that atomically checks whether a specified value is in a memory location,...
an instruction that atomically reads a memory location, combines it with another value,...
LLVM Basic Block Representation.
const Function * getParent() const
Return the enclosing method, or null if none.
InstListType::iterator iterator
Instruction iterators...
Represents analyses that only rely on functions' control flow.
uint64_t getParamDereferenceableBytes(unsigned i) const
Extract the number of dereferenceable bytes for a call or parameter (0=unknown).
void addDereferenceableRetAttr(uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
static LLVM_ABI bool isBitOrNoopPointerCastable(Type *SrcTy, Type *DestTy, const DataLayout &DL)
Check whether a bitcast, inttoptr, or ptrtoint cast between these types is valid and a no-op.
This is the shared class of boolean and integer constants.
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Class to represent fixed width SIMD vectors.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
Class to represent function types.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool hasExternalLinkage() const
void setUnnamedAddr(UnnamedAddr Val)
unsigned getAddressSpace() const
@ InternalLinkage
Rename collisions when linking (static functions).
Type * getValueType() const
MaybeAlign getAlign() const
Returns the alignment of the given variable.
void setAlignment(Align Align)
Sets the alignment attribute of the GlobalVariable.
This instruction compares its operands according to the predicate given to the constructor.
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
BasicBlock * GetInsertBlock() const
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memset to the specified pointer and the specified value.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateConstInBoundsGEP1_64(Type *Ty, Value *Ptr, uint64_t Idx0, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
LLVM_ABI CallInst * CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Analysis pass that exposes the LoopInfo for a function.
The legacy pass manager's analysis pass to compute loop information.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
This class implements a map that also provides access to all stored values in a deterministic order.
std::pair< KeyT, ValueT > & front()
Value * getLength() const
Value * getRawDest() const
MaybeAlign getDestAlign() const
This class wraps the llvm.memset and llvm.memset.inline intrinsics.
This class wraps the llvm.memcpy/memmove intrinsics.
A Module instance is used to store all the information related to an LLVM module.
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Class to represent pointers.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Helper class for SSA formation on a set of values defined in multiple blocks.
void Initialize(Type *Ty, StringRef Name)
Reset this object to get ready for a new set of SSA updates with type 'Ty'.
Value * GetValueInMiddleOfBlock(BasicBlock *BB)
Construct SSA form, materializing a value that is live in the middle of the specified block.
void AddAvailableValue(BasicBlock *BB, Value *V)
Indicate that a rewritten value is available in the specified block with the specified value.
This class represents the LLVM 'select' instruction.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
static unsigned getPointerOperandIndex()
StringRef - Represent a constant reference to a string, i.e.
Primary interface to the complete machine description for the target machine.
const Triple & getTargetTriple() const
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
Triple - Helper class for working with autoconf configuration names.
bool isAMDGCN() const
Tests whether the target is AMDGCN.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
The instances of the Type class are immutable: once they are created, they are never changed.
bool isArrayTy() const
True if this is an instance of ArrayType.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isPointerTy() const
True if this is an instance of PointerType.
bool isAggregateType() const
Return true if the type is an aggregate type.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
A Use represents the edge between a Value definition and its users.
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void print(raw_ostream &O, bool IsForDebug=false) const
Implement operator<< on Value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVM_ABI LLVMContext & getContext() const
All values hold a context through their type.
void mutateType(Type *Ty)
Mutate the type of this Value to be of the specified type.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
static LLVM_ABI bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as an element type.
Type * getElementType() const
constexpr bool isKnownMultipleOf(ScalarTy RHS) const
This function tells the caller whether the element count is known at compile time to be a multiple of the scalar value RHS.
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
@ LOCAL_ADDRESS
Address space for local memory.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ C
The default llvm calling convention, compatible with C.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
This is an optimization pass for GlobalISel generic memory operations.
void stable_sort(R &&Range)
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
const Value * getPointerOperand(const Value *V)
A helper function that returns the pointer operand of a load, store or GEP instruction.
auto reverse(ContainerTy &&C)
void sort(IteratorTy Start, IteratorTy End)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
FunctionPass * createAMDGPUPromoteAlloca()
@ Mod
The access may modify the value stored in memory.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
char & AMDGPUPromoteAllocaID
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
This struct is a compact representation of a valid (non-zero power of two) alignment.
A MapVector that performs no allocations if smaller than a certain size.
Function object to check whether the second component of a container supported by std::get (like std:...