//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Eliminates allocas by either converting them into vectors or by migrating
// them to local address space.
//
// Two passes are exposed by this file:
// - "promote-alloca-to-vector", which runs early in the pipeline and only
//   promotes to vector. Promotion to vector is almost always profitable
//   except when the alloca is too big and the promotion would result in
//   very high register pressure.
// - "promote-alloca", which does both promotion to vector and LDS and runs
//   much later in the pipeline. This runs after SROA because promoting to
//   LDS is of course less profitable than getting rid of the alloca or
//   vectorizing it, thus we only want to do it when the only alternative is
//   lowering the alloca to stack.
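//
// For reference, a minimal standalone invocation sketch (assuming the new-PM
// names are registered with the usual "amdgpu-" prefix):
//
//   opt -passes=amdgpu-promote-alloca-to-vector -S in.ll
//   opt -passes=amdgpu-promote-alloca -S in.ll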
//
// Note that both of them exist for the old and new PMs. The new PM passes are
// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool>
    DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
                                 cl::desc("Disable promote alloca to vector"),
                                 cl::init(false));

static cl::opt<bool>
    DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
                              cl::desc("Disable promote alloca to LDS"),
                              cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));

static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
    "amdgpu-promote-alloca-to-vector-max-regs",
    cl::desc(
        "Maximum vector size (in 32b registers) to use when promoting alloca"),
    cl::init(32));

// Use up to 1/4 of available register budget for vectorization.
// FIXME: Increase the limit for whole function budgets? Perhaps x2?
static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
    "amdgpu-promote-alloca-to-vector-vgpr-ratio",
    cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
    cl::init(4));

static cl::opt<unsigned>
    LoopUserWeight("promote-alloca-vector-loop-user-weight",
                   cl::desc("The bonus weight of users of allocas within loop "
                            "when sorting profitable allocas"),
                   cl::init(4));

// We support vector indices of the form (A * stride) + B
// All parts are optional.
struct GEPToVectorIndex {
  Value *VarIndex = nullptr;         // defaults to 0
  ConstantInt *VarMul = nullptr;     // defaults to 1
  ConstantInt *ConstIndex = nullptr; // defaults to 0
  Value *Full = nullptr;
};
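
// For example (an illustrative sketch): for an alloca of [4 x [4 x i32]]
// promoted to <16 x i32>, the GEP
//   getelementptr [4 x [4 x i32]], ptr addrspace(5) %a, i32 0, i32 %i, i32 1
// has byte offset (%i * 16) + 4, which divides by the 4-byte element size
// into VarIndex = %i, VarMul = 4, ConstIndex = 1: vector index (%i * 4) + 1.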

struct MemTransferInfo {
  ConstantInt *SrcIndex = nullptr;
  ConstantInt *DestIndex = nullptr;
};

// Analysis for planning the different strategies of alloca promotion.
struct AllocaAnalysis {
  AllocaInst *Alloca = nullptr;
  DenseSet<Value *> Pointers;
  SmallVector<Use *> Uses;
  unsigned Score = 0;
  bool HaveSelectOrPHI = false;
  struct {
    FixedVectorType *Ty = nullptr;
    DenseMap<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
    SmallVector<Instruction *> UsersToRemove;
    SmallVector<Instruction *> Worklist;
    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
  } Vector;
  struct {
    bool Enable = false;
    SmallVector<User *> Worklist;
  } LDS;

  explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
};

// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  LoopInfo &LI;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;
  unsigned VGPRBudgetRatio;
  unsigned MaxVectorRegs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  bool collectAllocaUses(AllocaAnalysis &AA) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val
  /// should be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst, int OpIdx0,
                                       int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
  void analyzePromoteToVector(AllocaAnalysis &AA) const;
  void promoteAllocaToVector(AllocaAnalysis &AA);
  void analyzePromoteToLDS(AllocaAnalysis &AA) const;
  bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
                             SetVector<IntrinsicInst *> &DeferredIntrs);
  void
  finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);

  void scoreAlloca(AllocaAnalysis &AA) const;

  void setFunctionLimits(const Function &F);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {

    const Triple &TT = TM.getTargetTriple();
    IsAMDGCN = TT.isAMDGCN();
    IsAMDHSA = TT.getOS() == Triple::AMDHSA;
  }

  bool run(Function &F, bool PromoteToLDS);
};

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override {
    if (skipFunction(F))
      return false;
    if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
      return AMDGPUPromoteAllocaImpl(
                 TPC->getTM<TargetMachine>(),
                 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
          .run(F, /*PromoteToLDS*/ true);
    return false;
  }

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    AU.addRequired<LoopInfoWrapperPass>();
    FunctionPass::getAnalysisUsage(AU);
  }
};

static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
                            const Function &F) {
  if (!TM.getTargetTriple().isAMDGCN())
    return 128;

  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);

  unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
  // Temporarily check both the attribute and the subtarget feature, until the
  // latter is removed.
  if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
    DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();

  unsigned MaxVGPRs = ST.getMaxNumVGPRs(
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
      DynamicVGPRBlockSize);

  // A non-entry function has only 32 caller-preserved registers. Do not
  // promote an alloca which would force spilling unless we know the function
  // will be inlined.
  if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
      !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
    MaxVGPRs = std::min(MaxVGPRs, 32u);
  return MaxVGPRs;
}

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    return false;
  };

  SmallVector<Instruction *, 4> WorkList({AA.Alloca});
  while (!WorkList.empty()) {
    auto *Cur = WorkList.pop_back_val();
    if (find(AA.Pointers, Cur) != AA.Pointers.end())
      continue;
    AA.Pointers.insert(Cur);
    for (auto &U : Cur->uses()) {
      auto *Inst = cast<Instruction>(U.getUser());
      if (isa<StoreInst>(Inst)) {
        if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
          return RejectUser(Inst, "pointer escapes via store");
        }
      }
      AA.Uses.push_back(&U);

      if (isa<GetElementPtrInst>(U.getUser())) {
        WorkList.push_back(Inst);
      } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
        // Only promote a select if we know that the other select operand is
        // from another pointer that will also be promoted.
        if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
          return RejectUser(Inst, "select from mixed objects");
        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
        // Repeat for phis.

        // TODO: Handle more complex cases. We should be able to replace loops
        // over arrays.
        switch (Phi->getNumIncomingValues()) {
        case 1:
          break;
        case 2:
          if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
            return RejectUser(Inst, "phi from mixed objects");
          break;
        default:
          return RejectUser(Inst, "phi with too many operands");
        }

        WorkList.push_back(Inst);
        AA.HaveSelectOrPHI = true;
      }
    }
  }
  return true;
}

void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
  LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
  unsigned Score = 0;
  // Increment score by one for each user + a bonus for users within loops.
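  // E.g. with the default LoopUserWeight of 4, a load at loop depth 2 scores
  // 1 + 4 * 2 = 9, so allocas that are hot in loops sort first.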
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());
    if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
        isa<PHINode>(Inst))
      continue;
    unsigned UserScore =
        1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
    LLVM_DEBUG(dbgs() << "  [+" << UserScore << "]:\t" << *Inst << "\n");
    Score += UserScore;
  }
  LLVM_DEBUG(dbgs() << "  => Final Score:" << Score << "\n");
  AA.Score = Score;
}

void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
  // Load per-function limits, overriding with global options where
  // appropriate. R600 register tuples/aliasing are fragile with large vector
  // promotions, so apply an architecture-specific limit here.
  const int R600MaxVectorRegs = 16;
  MaxVectorRegs = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-max-regs",
      IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
  if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
    MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
  VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
      "amdgpu-promote-alloca-to-vector-vgpr-ratio",
      PromoteAllocaToVectorVGPRRatio);
  if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
    VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
}

bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
  MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
  setFunctionLimits(F);

  unsigned VectorizationBudget =
      (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                  : (MaxVGPRs * 32)) /
      VGPRBudgetRatio;
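  // The budget is in bits: e.g. with MaxVGPRs = 256 and the default ratio of
  // 4, it is 256 * 32 / 4 = 2048 bits, i.e. 64 VGPRs (256 bytes) worth of
  // promoted allocas for the function.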

  std::vector<AllocaAnalysis> Allocas;
  for (Instruction &I : F.getEntryBlock()) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
      // Array allocations are probably not worth handling, since an allocation
      // of the array type is the canonical form.
      if (!AI->isStaticAlloca() || AI->isArrayAllocation())
        continue;

      LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');

      AllocaAnalysis AA{AI};
      if (collectAllocaUses(AA)) {
        analyzePromoteToVector(AA);
        if (PromoteToLDS)
          analyzePromoteToLDS(AA);
        if (AA.Vector.Ty || AA.LDS.Enable) {
          scoreAlloca(AA);
          Allocas.push_back(std::move(AA));
        }
      }
    }
  }

  stable_sort(Allocas,
              [](const auto &A, const auto &B) { return A.Score > B.Score; });

  // clang-format off
  LLVM_DEBUG(
    dbgs() << "Sorted Worklist:\n";
    for (const auto &AA : Allocas)
      dbgs() << "  " << *AA.Alloca << "\n";
  );
  // clang-format on

  bool Changed = false;
  SetVector<IntrinsicInst *> DeferredIntrs;
  for (AllocaAnalysis &AA : Allocas) {
    if (AA.Vector.Ty) {
      const unsigned AllocaCost =
          DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
      // First, check if we have enough budget to vectorize this alloca.
      if (AllocaCost <= VectorizationBudget) {
        promoteAllocaToVector(AA);
        Changed = true;
        assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
               "Underflow!");
        VectorizationBudget -= AllocaCost;
        LLVM_DEBUG(dbgs() << "  Remaining vectorization budget:"
                          << VectorizationBudget << "\n");
        continue;
      }
      LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
                        << AllocaCost << ", budget:" << VectorizationBudget
                        << "): " << *AA.Alloca << "\n");
    }

    if (AA.LDS.Enable &&
        tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
      Changed = true;
  }
  finishDeferredAllocaToLDSPromotion(DeferredIntrs);

  // NOTE: promoteAllocaToVector removes the alloca, so Allocas contains
  // dangling pointers. If we want to reuse it past this point, the loop above
  // would need to be updated to remove successfully promoted allocas.

  return Changed;
}

// Checks if the instruction I is a memset user of the alloca AI that we can
// deal with. Currently, only non-volatile memsets that affect the whole alloca
// are handled.
static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
                              const DataLayout &DL) {
  using namespace PatternMatch;
  // For now we only care about non-volatile memsets that affect the whole type
  // (start at index 0 and fill the whole alloca).
  //
  // TODO: Now that we moved to PromoteAlloca we could handle any memsets
  // (except maybe volatile ones?) - we just need to use shufflevector if it
  // only affects a subset of the vector.
  const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
  return I->getOperand(0) == AI &&
         match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
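
// For instance, for a 16-byte alloca this accepts a call like
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %alloca, i8 0, i32 16,
//                                 i1 false)
// while memsets of a prefix, through a GEP, or volatile memsets are rejected.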

static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
  IRBuilder<> B(Ptr->getContext());

  Ptr = Ptr->stripPointerCasts();
  if (Ptr == AA.Alloca)
    return B.getInt32(0);

  auto *GEP = cast<GetElementPtrInst>(Ptr);
  auto I = AA.Vector.GEPVectorIdx.find(GEP);
  assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");

  if (!I->second.Full) {
    Value *Result = nullptr;
    B.SetInsertPoint(GEP);

    if (I->second.VarIndex) {
      Result = I->second.VarIndex;
      Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());

      if (I->second.VarMul)
        Result = B.CreateMul(Result, I->second.VarMul);
    }

    if (I->second.ConstIndex) {
      if (Result)
        Result = B.CreateAdd(Result, I->second.ConstIndex);
      else
        Result = I->second.ConstIndex;
    }

    if (!Result)
      Result = B.getInt32(0);

    I->second.Full = Result;
  }

  return I->second.Full;
}

static std::optional<GEPToVectorIndex>
computeGEPToVectorIndex(GetElementPtrInst *GEP, Value *Alloca,
                        Type *VecElemTy, const DataLayout &DL) {
  // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
  // helper.
  LLVMContext &Ctx = GEP->getContext();
  unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
  SmallMapVector<Value *, APInt, 4> VarOffsets;
  APInt ConstOffset(BW, 0);

  // Walk backwards through nested GEPs to collect both constant and variable
  // offsets, so that nested vector GEP chains can be lowered in one step.
  //
  // Given this IR fragment as input:
  //
  //   %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
  //   %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
  //   %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
  //   %3 = load i32, ptr addrspace(5) %2, align 4
  //
  // Combine both GEP operations in a single pass, producing:
  //   BasePtr     = %0
  //   ConstOffset = 4
  //   VarOffsets  = { %j -> element_size(<2 x i32>) }
  //
  // That lets us emit a single buffer_load directly into a VGPR, without ever
  // allocating scratch memory for the intermediate pointer.
  Value *CurPtr = GEP;
  while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
    if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
      return {};

    // Move to the next outer pointer.
    CurPtr = CurGEP->getPointerOperand();
  }

  assert(CurPtr == Alloca && "GEP not based on alloca");

  int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
  if (VarOffsets.size() > 1)
    return {};

  APInt IndexQuot;
  int64_t Rem;
  APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
  if (Rem != 0)
    return {};

  GEPToVectorIndex Result;

  if (!ConstOffset.isZero())
    Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));

  if (VarOffsets.empty())
    return Result;

  const auto &VarOffset = VarOffsets.front();
  APInt OffsetQuot;
  APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
  if (Rem != 0 || OffsetQuot.isZero())
    return {};

  Result.VarIndex = VarOffset.first;
  auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
  if (!OffsetType)
    return {};

  if (!OffsetQuot.isOne())
    Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));

  return Result;
}

/// Promotes a single user of the alloca to a vector form.
///
/// \param Inst          Instruction to be promoted.
/// \param DL            Module Data Layout.
/// \param AA            Alloca Analysis.
/// \param VecStoreSize  Size of the vector type in bytes.
/// \param ElementSize   Size of the vector element type in bytes.
/// \param GetCurVal     Callback returning the current value of the vector
///                      (e.g. the last stored value) at \p Inst.
/// \return the stored value if \p Inst would have written to the alloca, or
///         nullptr otherwise.
static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
                                        AllocaAnalysis &AA,
                                        unsigned VecStoreSize,
                                        unsigned ElementSize,
                                        function_ref<Value *()> GetCurVal) {
  // Note: we use InstSimplifyFolder because it can leverage the DataLayout
  // to do more folding, especially in the case of vector splats.
  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                        InstSimplifyFolder(DL));
  Builder.SetInsertPoint(Inst);

  const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                   Type *PtrTy) -> Value * {
    assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
    const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
    if (!PtrTy->isVectorTy())
      return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
    const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
    // If we want to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to first
    // cast the ptr vector to <2 x i64>.
    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
    Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
    return Builder.CreateBitOrPointerCast(
        Val, FixedVectorType::get(EltTy, NumPtrElts));
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();

  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    Value *CurVal = GetCurVal();
    Value *Index =
        calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);

    // We're loading the full vector.
    Type *AccessTy = Inst->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
        if (AccessTy->isPtrOrPtrVectorTy())
          CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
        else if (CurVal->getType()->isPtrOrPtrVectorTy())
          CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
        Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
        Inst->replaceAllUsesWith(NewVal);
        return nullptr;
      }
    }

    // Loading a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      // If idx is dynamic, then sandwich the load with bitcasts,
      // i.e. VectorTy     SubVecTy        AccessTy
      //      <64 x i8> -> <16 x i8>       <8 x i16>
      //      <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
      // Extracting a subvector with a dynamic index has a very large expansion
      // in the amdgpu backend. Limit to pow2.
      FixedVectorType *VectorTy = AA.Vector.Ty;
      TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
      uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value();
      bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
      unsigned TotalNumElts = VectorTy->getNumElements();
      bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
      if (!isa<ConstantInt>(Index) &&
          llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
          IsProperlyDivisible && IsAlignedLoad) {
        IntegerType *NewElemTy = Builder.getIntNTy(NumBits);
        const unsigned NewNumElts =
            DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
        const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
        FixedVectorType *BitCastTy =
            FixedVectorType::get(NewElemTy, NewNumElts);
        Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
        Value *NewIdx = Builder.CreateLShr(
            Index, ConstantInt::get(Index->getType(), LShrAmt));
        Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
        Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
        Inst->replaceAllUsesWith(BCOut);
        return nullptr;
      }

      Value *SubVec = PoisonValue::get(SubVecTy);
      for (unsigned K = 0; K < NumLoadedElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        SubVec = Builder.CreateInsertElement(
            SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
      }

      if (AccessTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
      else if (SubVecTy->isPtrOrPtrVectorTy())
        SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);

      SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
      Inst->replaceAllUsesWith(SubVec);
      return nullptr;
    }

    // We're loading one element.
    Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
    if (AccessTy != VecEltTy)
      ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);

    Inst->replaceAllUsesWith(ExtractElement);
    return nullptr;
  }
  case Instruction::Store: {
    // For stores, it's a bit trickier and it depends on whether we're storing
    // the full vector or not. If we're storing the full vector, we don't need
    // to know the current value. If this is a store of a single element, we
    // need to know the value.
    StoreInst *SI = cast<StoreInst>(Inst);
    Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
    Value *Val = SI->getValueOperand();

    // We're storing the full vector; we can handle this without knowing CurVal.
    Type *AccessTy = Val->getType();
    TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
    if (Constant *CI = dyn_cast<Constant>(Index)) {
      if (CI->isZeroValue() && AccessSize == VecStoreSize) {
        if (AccessTy->isPtrOrPtrVectorTy())
          Val = CreateTempPtrIntCast(Val, AccessTy);
        else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
          Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
        return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
      }
    }

    // Storing a subvector.
    if (isa<FixedVectorType>(AccessTy)) {
      assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
      const unsigned NumWrittenElts =
          AccessSize / DL.getTypeStoreSize(VecEltTy);
      const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
      auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
      assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));

      if (SubVecTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, SubVecTy);
      else if (AccessTy->isPtrOrPtrVectorTy())
        Val = CreateTempPtrIntCast(Val, AccessTy);

      Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);

      Value *CurVec = GetCurVal();
      for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
           K < NumElts; ++K) {
        Value *CurIdx =
            Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
        CurVec = Builder.CreateInsertElement(
            CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
      }
      return CurVec;
    }

    if (Val->getType() != VecEltTy)
      Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
  }
  case Instruction::Call: {
    if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
      // For memcpy, we need to know the current value.
      ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
      unsigned NumCopied = Length->getZExtValue() / ElementSize;
      MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
      unsigned SrcBegin = TI->SrcIndex->getZExtValue();
      unsigned DestBegin = TI->DestIndex->getZExtValue();

      SmallVector<int> Mask;
      for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
        if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
          Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
                             ? SrcBegin++
                             : PoisonMaskElem);
        } else {
          Mask.push_back(Idx);
        }
      }
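
      // E.g. copying elements [0,1] over elements [2,3] of a <4 x i32> gives
      // the shuffle mask <0, 1, 0, 1>: copied lanes read from their source
      // positions, all other lanes keep their own index.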
      return Builder.CreateShuffleVector(GetCurVal(), Mask);
    }

    if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
      // For memset, we don't need to know the previous value because we
      // currently only allow memsets that cover the whole alloca.
      Value *Elt = MSI->getOperand(1);
      const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
      if (BytesPerElt > 1) {
        Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);

        // If the element type of the vector is a pointer, we need to first
        // cast to an integer, then use an inttoptr.
        if (VecEltTy->isPointerTy()) {
          Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
          Elt = Builder.CreateBitCast(EltBytes, PtrInt);
          Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
        } else
          Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
      }

      return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        Intr->replaceAllUsesWith(
            Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
                            DL.getTypeAllocSize(AA.Vector.Ty)));
        return nullptr;
      }
    }

    llvm_unreachable("Unsupported call when promoting alloca to vector");
  }

  default:
    llvm_unreachable("Inconsistency in instructions promotable to vector");
  }

  llvm_unreachable("Did not return after promoting instruction!");
}

static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
                                  const DataLayout &DL) {
  // Access as a vector type can work if the size of the access vector is a
  // multiple of the size of the alloca's vector element type.
  //
  // Examples:
  //    - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
  //    - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
  //    - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
  //        - 3*32 is not a multiple of 64
  //
  // We could handle more complicated cases, but it'd make things a lot more
  // complicated.
  if (isa<FixedVectorType>(AccessTy)) {
    TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
    // If the type size and the store size don't match, we would need to do
    // more than just bitcast to translate between extracted/insertable
    // subvectors and the accessed value.
    if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
      return false;
    TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
    return AccTS.isKnownMultipleOf(VecTS);
  }

  return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
                                              DL);
}

/// Iterates over an instruction worklist that may contain multiple
/// instructions from the same basic block, but in a different order.
template <typename InstContainer>
static void forEachWorkListItem(const InstContainer &WorkList,
                                std::function<void(Instruction *)> Fn) {
  // Bucket up uses of the alloca by the block they occur in.
  // This is important because we have to handle multiple defs/uses in a block
  // ourselves: SSAUpdater is purely for cross-block references.
  DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
  for (Instruction *User : WorkList)
    UsesByBlock[User->getParent()].insert(User);

  for (Instruction *User : WorkList) {
    BasicBlock *BB = User->getParent();
    auto &BlockUses = UsesByBlock[BB];

    // Already processed, skip.
    if (BlockUses.empty())
      continue;

    // Only user in the block, directly process it.
    if (BlockUses.size() == 1) {
      Fn(User);
      continue;
    }

    // Multiple users in the block, do a linear scan to see users in order.
    for (Instruction &Inst : *BB) {
      if (!BlockUses.contains(&Inst))
        continue;

      Fn(&Inst);
    }

    // Clear the block so we know it's been processed.
    BlockUses.clear();
  }
}

/// Find an insert point after an alloca, after all other allocas clustered at
/// the start of the block.
static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
                                                    BasicBlock::iterator I) {
  for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
    ;
  return I;
}

FixedVectorType *
AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to vectors is disabled\n");
    return nullptr;
  }

  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    uint64_t NumElems = 1;
    Type *ElemTy;
    do {
      NumElems *= ArrayTy->getNumElements();
      ElemTy = ArrayTy->getElementType();
    } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));

    // Check for array of vectors
    auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
    if (InnerVectorTy) {
      NumElems *= InnerVectorTy->getNumElements();
      ElemTy = InnerVectorTy->getElementType();
    }

    if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
      unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
      if (ElementSize > 0) {
        unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
        // Expand vector if required to match padding of inner type,
        // i.e. odd size subvectors.
        // Storage size of new vector must match that of alloca for correct
        // behaviour of byte offsets and GEP computation.
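        // E.g. (a sketch) [2 x <3 x i32>]: each <3 x i32> has a 16-byte
        // alloc size, so the alloca stores 32 bytes, but 2 * 3 i32 elements
        // only cover 24; NumElems is expanded to 32 / 4 = 8, giving <8 x i32>.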
        if (NumElems * ElementSize != AllocaSize)
          NumElems = AllocaSize / ElementSize;
        if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
          VectorTy = FixedVectorType::get(ElemTy, NumElems);
      }
    }
  }
  if (!VectorTy) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return nullptr;
  }

  const unsigned MaxElements =
      (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());

  if (VectorTy->getNumElements() > MaxElements ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  " << *VectorTy
                      << " has an unsupported number of elements\n");
    return nullptr;
  }

  Type *VecEltTy = VectorTy->getElementType();
  unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
  if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector if the allocation size "
                         "does not match the type's size\n");
    return nullptr;
  }

  return VectorTy;
}

void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
  if (AA.HaveSelectOrPHI) {
    LLVM_DEBUG(dbgs() << "  Cannot convert to vector due to select or phi\n");
    return;
  }

  Type *AllocaTy = AA.Alloca->getAllocatedType();
  AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
  if (!AA.Vector.Ty)
    return;

  const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
    LLVM_DEBUG(dbgs() << "  Cannot promote alloca to vector: " << Msg << "\n"
                      << "    " << *Inst << "\n");
    AA.Vector.Ty = nullptr;
  };

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
  assert(ElementSize > 0);
  for (auto *U : AA.Uses) {
    Instruction *Inst = cast<Instruction>(U->getUser());

    if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
      assert(!isa<StoreInst>(Inst) ||
             U->getOperandNo() == StoreInst::getPointerOperandIndex());

      Type *AccessTy = getLoadStoreType(Inst);
      if (AccessTy->isAggregateType())
        return RejectUser(Inst, "unsupported load/store as aggregate");
      assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());

      // Check that this is a simple access of a vector element.
      bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
                                          : cast<StoreInst>(Inst)->isSimple();
      if (!IsSimple)
        return RejectUser(Inst, "not a simple load or store");

      Ptr = Ptr->stripPointerCasts();

      // Alloca already accessed as vector.
      if (Ptr == AA.Alloca &&
          DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
              DL->getTypeStoreSize(AccessTy)) {
        AA.Vector.Worklist.push_back(Inst);
        continue;
      }

      if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
        return RejectUser(Inst, "not a supported access type");

      AA.Vector.Worklist.push_back(Inst);
      continue;
    }

    if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
      // If we can't compute a vector index from this GEP, then we can't
      // promote this alloca to vector.
      auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
      if (!Index)
        return RejectUser(Inst, "cannot compute vector index for GEP");

      AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
        MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
      AA.Vector.Worklist.push_back(Inst);
      continue;
    }

    if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
      if (TransferInst->isVolatile())
        return RejectUser(Inst, "mem transfer inst is volatile");

      ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
      if (!Len || (Len->getZExtValue() % ElementSize))
        return RejectUser(Inst, "mem transfer inst length is non-constant or "
                                "not a multiple of the vector element size");

      auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
        if (Ptr == AA.Alloca)
          return ConstantInt::get(Ptr->getContext(), APInt(32, 0));

        auto *GEP = cast<GetElementPtrInst>(Ptr);
        const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
        if (GEPI.VarIndex)
          return nullptr;
        if (GEPI.ConstIndex)
          return GEPI.ConstIndex;
        return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
      };

      MemTransferInfo *TI =
          &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
      unsigned OpNum = U->getOperandNo();
      if (OpNum == 0) {
        Value *Dest = TransferInst->getDest();
        ConstantInt *Index = getConstIndexIntoAlloca(Dest);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant dest index");
        TI->DestIndex = Index;
      } else {
        assert(OpNum == 1);
        Value *Src = TransferInst->getSource();
        ConstantInt *Index = getConstIndexIntoAlloca(Src);
        if (!Index)
          return RejectUser(Inst, "could not calculate constant src index");
        TI->SrcIndex = Index;
      }
      continue;
    }

    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
        AA.Vector.Worklist.push_back(Inst);
        continue;
      }
    }

    // Ignore assume-like intrinsics and comparisons used in assumes.
    if (isAssumeLikeIntrinsic(Inst)) {
      if (!Inst->use_empty())
        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
          return isAssumeLikeIntrinsic(cast<Instruction>(U));
        })) {
      AA.Vector.UsersToRemove.push_back(Inst);
      continue;
    }

    return RejectUser(Inst, "unhandled alloca user");
  }

  // Follow-up check to ensure we've seen both sides of all transfer insts.
  for (const auto &Entry : AA.Vector.TransferInfo) {
    const MemTransferInfo &TI = Entry.second;
    if (!TI.SrcIndex || !TI.DestIndex)
      return RejectUser(Entry.first,
                        "mem transfer inst between different objects");
    AA.Vector.Worklist.push_back(Entry.first);
  }
}

void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
  LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
  LLVM_DEBUG(dbgs() << "  type conversion: " << *AA.Alloca->getAllocatedType()
                    << " -> " << *AA.Vector.Ty << '\n');
  const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);

  Type *VecEltTy = AA.Vector.Ty->getElementType();
  const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;

  // Alloca is uninitialized memory. Imitate that by making the first value
  // undef.
  SSAUpdater Updater;
  Updater.Initialize(AA.Vector.Ty, "promotealloca");

  BasicBlock *EntryBB = AA.Alloca->getParent();
  BasicBlock::iterator InitInsertPos =
      skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
  IRBuilder<> Builder(&*InitInsertPos);
  Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
  AllocaInitValue->takeName(AA.Alloca);

  Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);

  // First handle the initial worklist, in basic block order.
  //
  // Insert a placeholder whenever we need the vector value at the top of a
  // basic block.
  SmallVector<Instruction *> Placeholders;
  forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
    BasicBlock *BB = I->getParent();
    auto GetCurVal = [&]() -> Value * {
      if (Value *CurVal = Updater.FindValueForBlock(BB))
        return CurVal;

      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
        return Placeholders.back();

      // If the current value in the basic block is not yet known, insert a
      // placeholder that we will replace later.
      IRBuilder<> Builder(I);
      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
          PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
      Placeholders.push_back(Placeholder);
      return Placeholders.back();
    };

    Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
                                              ElementSize, GetCurVal);
    if (Result)
      Updater.AddAvailableValue(BB, Result);
  });

  // Now fixup the placeholders.
  SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
  for (auto [Index, Placeholder] : enumerate(Placeholders)) {
    Value *NewVal = Updater.GetValueInMiddleOfBlock(Placeholder->getParent());
    PlaceholderToNewVal[Index] = NewVal;
    Placeholder->replaceAllUsesWith(NewVal);
  }
  // Note: we cannot merge this loop with the previous one because it is
  // possible that the placeholder itself can be used in the SSAUpdater. The
  // replaceAllUsesWith doesn't replace those uses.
  for (auto [Index, Placeholder] : enumerate(Placeholders)) {
    if (!Placeholder->use_empty())
      Placeholder->replaceAllUsesWith(PlaceholderToNewVal[Index]);
    Placeholder->eraseFromParent();
  }

  // Delete all instructions.
  for (Instruction *I : AA.Vector.Worklist) {
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Delete all the users that are known to be removeable.
  for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
    I->dropDroppableUses();
    assert(I->use_empty());
    I->eraseFromParent();
  }

  // Alloca should now be dead too.
  assert(AA.Alloca->use_empty());
  AA.Alloca->eraseFromParent();
}
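
// As a sketch of the overall rewrite (exact names and value numbering depend
// on the input):
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %gep = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %i
//   store i32 %v, ptr addrspace(5) %gep
//   %r = load i32, ptr addrspace(5) %stack
//
// becomes roughly:
//
//   %stack = freeze <4 x i32> poison
//   %0 = insertelement <4 x i32> %stack, i32 %v, i32 %i
//   %r = extractelement <4 x i32> %0, i32 0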

std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    CallInst *LocalSizeY =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
    CallInst *LocalSizeZ =
        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  //   #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  //   #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  //   #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  //   #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  CallInst *DispatchPtr =
      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
  DispatchPtr->addRetAttr(Attribute::NonNull);
  F.removeFnAttr("amdgpu-no-dispatch-ptr");

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableRetAttr(64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
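
  // In byte terms, LoadXY covers packet bytes 4-7 ({workgroup_size_x,
  // workgroup_size_y}) and LoadZU covers bytes 8-11 ({workgroup_size_z,
  // reserved0}), so the y component is the upper half of LoadXY.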

  MDNode *MD = MDNode::get(Mod->getContext(), {});
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::pair(Y, LoadZU);
}

Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  Function *F = Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
  StringRef AttrName;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    AttrName = "amdgpu-no-workitem-id-x";
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    AttrName = "amdgpu-no-workitem-id-y";
    break;

  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    AttrName = "amdgpu-no-workitem-id-z";
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);
  F->removeFnAttr(AttrName);

  return CI;
}

static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp) || isa<ConstantAggregateZero>(OtherOp))
    return true;

  // TODO: getUnderlyingObject will not work on a vector getelementptr
  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}
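
// For example, "select i1 %c, ptr addrspace(5) %p, ptr addrspace(5) %q" is
// only safe to promote when %p and %q are derived from the same alloca (or
// the other operand is a constant null); mixed objects are rejected above.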

void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
  if (DisablePromoteAllocaToLDS) {
    LLVM_DEBUG(dbgs() << "  Promote alloca to LDS is disabled\n");
    return;
  }

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  const Function &ContainingFunction = *AA.Alloca->getFunction();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return;
  }

  for (Use *Use : AA.Uses) {
    auto *User = Use->getUser();

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return;

      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return;
      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return;
      continue;
    }

    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return;
      continue;
    }

    if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return;
      continue;
    }

    // Only promote a select if we know that the other select operand
    // is from another pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
        return;

      // May need to rewrite constant operands.
      if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
        AA.LDS.Worklist.push_back(ICmp);
      continue;
    }

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return;
    } else if (!isa<SelectInst>(UseInst) && !isa<PHINode>(UseInst)) {
      // Do not promote vector/aggregate type instructions. It is hard to track
      // their users.

      // Do not promote addrspacecast.
      //
      // TODO: If we know the address is only observed through flat pointers, we
      // could still promote.
      return;
    }

    if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
      AA.LDS.Worklist.push_back(User);
  }

  AA.LDS.Enable = true;
}

bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getAddressableLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallVector<const Constant *, 8> Stack;

  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getFunction() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());

    // HIP uses an extern unsized array in local address space for dynamically
    // allocated shared memory. In that case, we have to disable the promotion.
    if (GV->hasExternalLinkage() && AllocSize == 0) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
                           "local memory. Promoting to local memory "
                           "disabled.\n");
      return false;
    }

    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead
  llvm::sort(AllocatedSizes, llvm::less_second());

  // Check how much local memory is being used by global objects
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }

  unsigned MaxOccupancy =
      ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
          .second;

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount =
      ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program may already use more LDS than is usable at maximum occupancy.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
    AllocaAnalysis &AA, bool SufficientLDS,
    SetVector<IntrinsicInst *> &DeferredIntrs) {
  LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(AA.Alloca);

  const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment = DL.getValueOrABITypeAlignment(
      AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the
  // memory, we could end up using more than the maximum due to alignment
  // padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize =
      WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
  NewSize += AllocSize;
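  // Each workitem gets its own copy: e.g. a 64-byte alloca with a flat
  // workgroup size of 256 consumes 256 * 64 = 16384 bytes of LDS.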

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = AA.Alloca->getFunction();

  Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(AA.Alloca->getAlign());

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);
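  // TID is the linearized workitem id within the workgroup:
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
  // so each workitem addresses a distinct per-workitem slice of the array.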
1661
1662 LLVMContext &Context = Mod->getContext();
1664
1665 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1666 AA.Alloca->mutateType(Offset->getType());
1667 AA.Alloca->replaceAllUsesWith(Offset);
1668 AA.Alloca->eraseFromParent();
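// Illustrative shape of the rewrite (assumed IR, workgroup size 256):
//   %buf = alloca [4 x i32], align 4, addrspace(5)
// becomes, roughly:
//   @kern.buf = internal unnamed_addr addrspace(3) global [256 x [4 x i32]] poison, align 4
//   %buf = getelementptr inbounds [256 x [4 x i32]], ptr addrspace(3) @kern.buf, i32 0, i32 %tid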
1669
1670 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1671
1672 for (Value *V : AA.LDS.Worklist) {
1673 CallInst *Call = dyn_cast<CallInst>(V);
1674 if (!Call) {
1675 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1676 Value *LHS = CI->getOperand(0);
1677 Value *RHS = CI->getOperand(1);
1678
1679 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1680 if (isa<ConstantPointerNull>(LHS))
1681 CI->setOperand(0, Constant::getNullValue(NewTy));
1682
1683 if (isa<ConstantPointerNull>(RHS))
1684 CI->setOperand(1, Constant::getNullValue(NewTy));
1685
1686 continue;
1687 }
1688
1689 // The operand's value should be corrected on its own and we don't want to
1690 // touch the users.
1691 if (isa<AddrSpaceCastInst>(V))
1692 continue;
1693
1694 assert(V->getType()->isPtrOrPtrVectorTy());
1695
1696 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1697 V->mutateType(NewTy);
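// mutateType() rewrites V's type in place; any constant operands of selects
// and phis below still carry the old pointer type and must be replaced with
// nulls of the new type.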
1698
1699 // Adjust the types of any constant operands.
1700 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1701 if (isa<ConstantPointerNull>(SI->getOperand(1)))
1702 SI->setOperand(1, Constant::getNullValue(NewTy));
1703
1704 if (isa<ConstantPointerNull>(SI->getOperand(2)))
1705 SI->setOperand(2, Constant::getNullValue(NewTy));
1706 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1707 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1708 if (isa<ConstantPointerNull>(
1709 Phi->getIncomingValue(I)))
1710 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1711 }
1712 }
1713
1714 continue;
1715 }
1716
1717 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1718 Builder.SetInsertPoint(Intr);
1719 switch (Intr->getIntrinsicID()) {
1720 case Intrinsic::lifetime_start:
1721 case Intrinsic::lifetime_end:
1722 // These intrinsics are for address space 0 only
1723 Intr->eraseFromParent();
1724 continue;
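// Lifetime markers only describe stack objects; the LDS copy lives for the
// lifetime of the workgroup, so they are simply dropped.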
1725 case Intrinsic::memcpy:
1726 case Intrinsic::memmove:
1727 // These have 2 pointer operands. If the second pointer also needs to be
1728 // replaced, we defer processing of these intrinsics until all other
1729 // values are processed.
1730 DeferredIntrs.insert(Intr);
1731 continue;
1732 case Intrinsic::memset: {
1733 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1734 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1735 MemSet->getLength(), MemSet->getDestAlign(),
1736 MemSet->isVolatile());
1737 Intr->eraseFromParent();
1738 continue;
1739 }
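// The memset is recreated rather than patched so the builder re-mangles the
// intrinsic declaration for the new (LDS) destination address space.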
1740 case Intrinsic::invariant_start:
1741 case Intrinsic::invariant_end:
1742 case Intrinsic::launder_invariant_group:
1743 case Intrinsic::strip_invariant_group: {
1744 SmallVector<Value *> Args;
1745 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1746 Args.emplace_back(Intr->getArgOperand(0));
1747 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1748 Args.emplace_back(Intr->getArgOperand(0));
1749 Args.emplace_back(Intr->getArgOperand(1));
1750 }
1751 Args.emplace_back(Offset);
1752 Function *F = Intrinsic::getOrInsertDeclaration(
1753 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1754 CallInst *NewIntr =
1755 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1756 Intr->mutateType(NewIntr->getType());
1757 Intr->replaceAllUsesWith(NewIntr);
1758 Intr->eraseFromParent();
1759 continue;
1760 }
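// These intrinsics are overloaded on the pointer type, so a declaration
// mangled for the LDS pointer type must be fetched before rebuilding the
// call.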
1761 case Intrinsic::objectsize: {
1762 Value *Src = Intr->getOperand(0);
1763
1764 CallInst *NewCall = Builder.CreateIntrinsic(
1765 Intrinsic::objectsize,
1766 {Intr->getType(), NewPtrTy},
1767 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1768 Intr->replaceAllUsesWith(NewCall);
1769 Intr->eraseFromParent();
1770 continue;
1771 }
1772 default:
1773 Intr->print(errs());
1774 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1775 }
1776 }
1777
1778 return true;
1779}
1780
1781void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1782 SetVector<IntrinsicInst *> &DeferredIntrs) {
1783
1784 for (IntrinsicInst *Intr : DeferredIntrs) {
1785 IRBuilder<> Builder(Intr);
1786 Builder.SetInsertPoint(Intr);
1787 Intrinsic::ID ID = Intr->getIntrinsicID();
1788 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1789
1790 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1791 auto *B = Builder.CreateMemTransferInst(
1792 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1793 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1794
1795 for (unsigned I = 0; I != 2; ++I) {
1796 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1797 B->addDereferenceableParamAttr(I, Bytes);
1798 }
1799 }
1800
1801 Intr->eraseFromParent();
1802 }
1803}
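// Deferral matters when both pointer operands of a memcpy/memmove refer to
// promoted allocas, e.g. (hypothetical) memcpy(%a, %b, 16) with %a and %b
// both moving to LDS: rebuilding the transfer only after every pointer has
// been rewritten ensures the recreated intrinsic is mangled for both final
// address spaces.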