1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
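//
// For illustration only (an editorial sketch, not code from a test): promotion
// to vector conceptually turns element accesses of a private alloca such as
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %gep = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %idx
//   store i32 %val, ptr addrspace(5) %gep
//   %v = load i32, ptr addrspace(5) %gep
//
// into a <4 x i32> SSA value that is updated with insertelement and read with
// extractelement, so the alloca never has to be lowered to scratch memory.
//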
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/DenseSet.h"
32#include "llvm/ADT/MapVector.h"
33#include "llvm/ADT/STLExtras.h"
34#include "llvm/Analysis/InstSimplifyFolder.h"
35#include "llvm/Analysis/LoopInfo.h"
36#include "llvm/Analysis/ValueTracking.h"
37#include "llvm/CodeGen/TargetPassConfig.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Support/CommandLine.h"
46#include "llvm/Target/TargetMachine.h"
47#include "llvm/Transforms/Utils/SSAUpdater.h"
48
49#define DEBUG_TYPE "amdgpu-promote-alloca"
50
51using namespace llvm;
52
53namespace {
54
55static cl::opt<bool>
56 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
57 cl::desc("Disable promote alloca to vector"),
58 cl::init(false));
59
60static cl::opt<bool>
61 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
62 cl::desc("Disable promote alloca to LDS"),
63 cl::init(false));
64
65static cl::opt<unsigned> PromoteAllocaToVectorLimit(
66 "amdgpu-promote-alloca-to-vector-limit",
67 cl::desc("Maximum byte size to consider promote alloca to vector"),
68 cl::init(0));
69
70static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
71 "amdgpu-promote-alloca-to-vector-max-regs",
73 "Maximum vector size (in 32b registers) to use when promoting alloca"),
74 cl::init(32));
75
76// Use up to 1/4 of available register budget for vectorization.
77// FIXME: Increase the limit for whole function budgets? Perhaps x2?
78static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
79 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
80 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
81 cl::init(4));
82
83static cl::opt<unsigned>
84    LoopUserWeight("promote-alloca-vector-loop-user-weight",
85 cl::desc("The bonus weight of users of allocas within loop "
86 "when sorting profitable allocas"),
87 cl::init(4));
88
89// We support vector indices of the form (A * stride) + B
90// All parts are optional.
91struct GEPToVectorIndex {
92 Value *VarIndex = nullptr; // defaults to 0
93 ConstantInt *VarMul = nullptr; // defaults to 1
94 ConstantInt *ConstIndex = nullptr; // defaults to 0
95 Value *Full = nullptr;
96};
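// For example (an illustrative sketch that mirrors the IR shown later in
// computeGEPToVectorIndex): if the alloca is promoted to <20 x i32> and a use
// is reached through
//   %a = alloca [10 x <2 x i32>], align 8, addrspace(5)
//   %p = getelementptr [10 x <2 x i32>], ptr addrspace(5) %a, i32 0, i32 %j
//   %q = getelementptr i8, ptr addrspace(5) %p, i32 4
// then the vector index of %q is (%j * 2) + 1, i.e. VarIndex = %j, VarMul = 2
// and ConstIndex = 1; Full caches the materialized (A * stride) + B value.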
97
98struct MemTransferInfo {
99 ConstantInt *SrcIndex = nullptr;
100 ConstantInt *DestIndex = nullptr;
101};
102
103// Analysis for planning the different strategies of alloca promotion.
104struct AllocaAnalysis {
105 AllocaInst *Alloca = nullptr;
106 DenseSet<Value *> Pointers;
107  SmallVector<Use *> Uses;
108  unsigned Score = 0;
109 bool HaveSelectOrPHI = false;
110 struct {
111 FixedVectorType *Ty = nullptr;
112    SmallVector<Instruction *> Worklist;
113    SmallVector<Instruction *> UsersToRemove;
114    MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
115    MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
116 } Vector;
117 struct {
118 bool Enable = false;
119 SmallVector<User *> Worklist;
120 } LDS;
121
122 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
123};
124
125// Shared implementation which can do both promotion to vector and to LDS.
126class AMDGPUPromoteAllocaImpl {
127private:
128 const TargetMachine &TM;
129 LoopInfo &LI;
130 Module *Mod = nullptr;
131 const DataLayout *DL = nullptr;
132
133 // FIXME: This should be per-kernel.
134 uint32_t LocalMemLimit = 0;
135 uint32_t CurrentLocalMemUsage = 0;
136 unsigned MaxVGPRs;
137 unsigned VGPRBudgetRatio;
138 unsigned MaxVectorRegs;
139
140 bool IsAMDGCN = false;
141 bool IsAMDHSA = false;
142
143 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
144 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
145
146 bool collectAllocaUses(AllocaAnalysis &AA) const;
147
148 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
149 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
150 /// Returns true if both operands are derived from the same alloca. Val should
151 /// be the same value as one of the input operands of UseInst.
152 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
153 Instruction *UseInst, int OpIdx0,
154 int OpIdx1) const;
155
156 /// Check whether we have enough local memory for promotion.
157 bool hasSufficientLocalMem(const Function &F);
158
159 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
160 void analyzePromoteToVector(AllocaAnalysis &AA) const;
161 void promoteAllocaToVector(AllocaAnalysis &AA);
162 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
163 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS,
164 SetVector<IntrinsicInst *> &DeferredIntrs);
165 void
166 finishDeferredAllocaToLDSPromotion(SetVector<IntrinsicInst *> &DeferredIntrs);
167
168 void scoreAlloca(AllocaAnalysis &AA) const;
169
170 void setFunctionLimits(const Function &F);
171
172public:
173 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
174
175 const Triple &TT = TM.getTargetTriple();
176 IsAMDGCN = TT.isAMDGCN();
177 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
178 }
179
180 bool run(Function &F, bool PromoteToLDS);
181};
182
183// FIXME: This can create globals so should be a module pass.
184class AMDGPUPromoteAlloca : public FunctionPass {
185public:
186 static char ID;
187
188 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
189
190 bool runOnFunction(Function &F) override {
191 if (skipFunction(F))
192 return false;
193 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
194 return AMDGPUPromoteAllocaImpl(
195 TPC->getTM<TargetMachine>(),
196 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
197 .run(F, /*PromoteToLDS*/ true);
198 return false;
199 }
200
201 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
202
203 void getAnalysisUsage(AnalysisUsage &AU) const override {
204 AU.setPreservesCFG();
205    AU.addRequired<LoopInfoWrapperPass>();
206    FunctionPass::getAnalysisUsage(AU);
207  }
208};
209
210static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
211 const Function &F) {
212 if (!TM.getTargetTriple().isAMDGCN())
213 return 128;
214
215 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
216
217 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
218 // Temporarily check both the attribute and the subtarget feature, until the
219 // latter is removed.
220 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
221 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
222
223 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
224 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
225 DynamicVGPRBlockSize);
226
227 // A non-entry function has only 32 caller preserved registers.
228 // Do not promote alloca which will force spilling unless we know the function
229 // will be inlined.
230 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
231 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
232 MaxVGPRs = std::min(MaxVGPRs, 32u);
233 return MaxVGPRs;
234}
235
236} // end anonymous namespace
237
238char AMDGPUPromoteAlloca::ID = 0;
239
241 "AMDGPU promote alloca to vector or LDS", false, false)
242// Move LDS uses from functions to kernels before promote alloca for accurate
243// estimation of LDS available
244INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
245INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
246INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
247 "AMDGPU promote alloca to vector or LDS", false, false)
248
249char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
250
251PreservedAnalyses
252AMDGPUPromoteAllocaPass::run(Function &F, FunctionAnalysisManager &AM) {
253  auto &LI = AM.getResult<LoopAnalysis>(F);
254 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
255 if (Changed) {
256    PreservedAnalyses PA;
257    PA.preserveSet<CFGAnalyses>();
258    return PA;
259 }
260 return PreservedAnalyses::all();
261}
262
263PreservedAnalyses
264AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
265  auto &LI = AM.getResult<LoopAnalysis>(F);
266 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
267 if (Changed) {
268    PreservedAnalyses PA;
269    PA.preserveSet<CFGAnalyses>();
270    return PA;
271 }
272 return PreservedAnalyses::all();
273}
274
275FunctionPass *llvm::createAMDGPUPromoteAlloca() {
276  return new AMDGPUPromoteAlloca();
277}
278
279bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
280 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
281 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
282 << " " << *Inst << "\n");
283 return false;
284 };
285
286 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
287 while (!WorkList.empty()) {
288 auto *Cur = WorkList.pop_back_val();
289 if (find(AA.Pointers, Cur) != AA.Pointers.end())
290 continue;
291 AA.Pointers.insert(Cur);
292 for (auto &U : Cur->uses()) {
293 auto *Inst = cast<Instruction>(U.getUser());
294 if (isa<StoreInst>(Inst)) {
295 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
296 return RejectUser(Inst, "pointer escapes via store");
297 }
298 }
299 AA.Uses.push_back(&U);
300
301 if (isa<GetElementPtrInst>(U.getUser())) {
302 WorkList.push_back(Inst);
303 } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
304 // Only promote a select if we know that the other select operand is
305 // from another pointer that will also be promoted.
306 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
307 return RejectUser(Inst, "select from mixed objects");
308 WorkList.push_back(Inst);
309 AA.HaveSelectOrPHI = true;
310 } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
311 // Repeat for phis.
312
313 // TODO: Handle more complex cases. We should be able to replace loops
314 // over arrays.
315 switch (Phi->getNumIncomingValues()) {
316 case 1:
317 break;
318 case 2:
319 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
320 return RejectUser(Inst, "phi from mixed objects");
321 break;
322 default:
323 return RejectUser(Inst, "phi with too many operands");
324 }
325
326 WorkList.push_back(Inst);
327 AA.HaveSelectOrPHI = true;
328 }
329 }
330 }
331 return true;
332}
333
334void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
335 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
336 unsigned Score = 0;
337 // Increment score by one for each user + a bonus for users within loops.
338 for (auto *U : AA.Uses) {
339 Instruction *Inst = cast<Instruction>(U->getUser());
340 if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
341 isa<PHINode>(Inst))
342 continue;
343 unsigned UserScore =
344 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
345 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
346 Score += UserScore;
347 }
348 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
349 AA.Score = Score;
350}
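// Worked example (illustrative numbers only): with the default
// promote-alloca-vector-loop-user-weight of 4, a load at loop depth 2 adds
// 1 + 4 * 2 = 9 to the score while a store outside any loop adds 1, so allocas
// whose users sit in hot loops are promoted first when the vectorization
// budget is tight.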
351
352void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
353 // Load per function limits, overriding with global options where appropriate.
354 // R600 register tuples/aliasing are fragile with large vector promotions so
355 // apply architecture specific limit here.
356 const int R600MaxVectorRegs = 16;
357 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
358 "amdgpu-promote-alloca-to-vector-max-regs",
359 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
360 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
361 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
362 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
363 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
364 PromoteAllocaToVectorVGPRRatio);
365 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
366 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
367}
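// For illustration, the per-function overrides read above could be attached to
// a function like this (a hypothetical attribute group, shown only as a
// sketch):
//
//   define amdgpu_kernel void @kern() #0 { ... }
//   attributes #0 = { "amdgpu-promote-alloca-to-vector-max-regs"="16"
//                     "amdgpu-promote-alloca-to-vector-vgpr-ratio"="2" }
//
// When the corresponding cl::opt flags are given on the command line, they
// take precedence over the attributes (see the getNumOccurrences() checks).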
368
369bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
370 Mod = F.getParent();
371 DL = &Mod->getDataLayout();
372
373  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
374  if (!ST.enablePromoteAlloca())
375 return false;
376
377 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
378 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
379 setFunctionLimits(F);
380
381 unsigned VectorizationBudget =
382 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
383 : (MaxVGPRs * 32)) /
384 VGPRBudgetRatio;
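  // Worked example (illustrative numbers only): with MaxVGPRs = 256 and the
  // default vgpr-ratio of 4 this yields 256 * 32 / 4 = 2048 bits, i.e. up to
  // 256 bytes of allocas may be vectorized; a non-inlined non-entry function,
  // clamped to 32 VGPRs above, gets 32 * 32 / 4 = 256 bits.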
385
386 std::vector<AllocaAnalysis> Allocas;
387 for (Instruction &I : F.getEntryBlock()) {
388 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
389 // Array allocations are probably not worth handling, since an allocation
390 // of the array type is the canonical form.
391 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
392 continue;
393
394 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
395
396 AllocaAnalysis AA{AI};
397 if (collectAllocaUses(AA)) {
398 analyzePromoteToVector(AA);
399 if (PromoteToLDS)
400 analyzePromoteToLDS(AA);
401 if (AA.Vector.Ty || AA.LDS.Enable) {
402 scoreAlloca(AA);
403 Allocas.push_back(std::move(AA));
404 }
405 }
406 }
407 }
408
409 stable_sort(Allocas,
410 [](const auto &A, const auto &B) { return A.Score > B.Score; });
411
412 // clang-format off
413  LLVM_DEBUG(
414    dbgs() << "Sorted Worklist:\n";
415 for (const auto &AA : Allocas)
416 dbgs() << " " << *AA.Alloca << "\n";
417 );
418 // clang-format on
419
420 bool Changed = false;
421 SetVector<IntrinsicInst *> DeferredIntrs;
422 for (AllocaAnalysis &AA : Allocas) {
423 if (AA.Vector.Ty) {
424 const unsigned AllocaCost =
425 DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
426 // First, check if we have enough budget to vectorize this alloca.
427 if (AllocaCost <= VectorizationBudget) {
428 promoteAllocaToVector(AA);
429 Changed = true;
430 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
431 "Underflow!");
432 VectorizationBudget -= AllocaCost;
433 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
434 << VectorizationBudget << "\n");
435 continue;
436 } else {
437 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
438 << AllocaCost << ", budget:" << VectorizationBudget
439 << "): " << *AA.Alloca << "\n");
440 }
441 }
442
443 if (AA.LDS.Enable &&
444 tryPromoteAllocaToLDS(AA, SufficientLDS, DeferredIntrs))
445 Changed = true;
446 }
447 finishDeferredAllocaToLDSPromotion(DeferredIntrs);
448
449  // NOTE: promoteAllocaToVector removes the alloca, so Allocas contains
450 // dangling pointers. If we want to reuse it past this point, the loop above
451 // would need to be updated to remove successfully promoted allocas.
452
453 return Changed;
454}
455
456// Checks if the instruction I is a memset user of the alloca AI that we can
457// deal with. Currently, only non-volatile memsets that affect the whole alloca
458// are handled.
459static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
460                              const DataLayout &DL) {
461 using namespace PatternMatch;
462 // For now we only care about non-volatile memsets that affect the whole type
463 // (start at index 0 and fill the whole alloca).
464 //
465 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
466 // (except maybe volatile ones?) - we just need to use shufflevector if it
467 // only affects a subset of the vector.
468 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
469 return I->getOperand(0) == AI &&
470 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
471}
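// For illustration (a sketch, not from a test): for
//   %a = alloca [16 x i32], align 4, addrspace(5)
// a promotable memset must be non-volatile, start at the alloca itself and
// cover all 64 bytes, e.g.
//   call void @llvm.memset.p5.i32(ptr addrspace(5) %a, i8 0, i32 64, i1 false)
// whereas a volatile memset, or one covering only part of the alloca, is
// rejected by the check above.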
472
473static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
474 IRBuilder<> B(Ptr->getContext());
475
476 Ptr = Ptr->stripPointerCasts();
477 if (Ptr == AA.Alloca)
478 return B.getInt32(0);
479
480 auto *GEP = cast<GetElementPtrInst>(Ptr);
481 auto I = AA.Vector.GEPVectorIdx.find(GEP);
482 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
483
484 if (!I->second.Full) {
485 Value *Result = nullptr;
486 B.SetInsertPoint(GEP);
487
488 if (I->second.VarIndex) {
489 Result = I->second.VarIndex;
490 Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
491
492 if (I->second.VarMul)
493 Result = B.CreateMul(Result, I->second.VarMul);
494 }
495
496 if (I->second.ConstIndex) {
497 if (Result)
498 Result = B.CreateAdd(Result, I->second.ConstIndex);
499 else
500 Result = I->second.ConstIndex;
501 }
502
503 if (!Result)
504 Result = B.getInt32(0);
505
506 I->second.Full = Result;
507 }
508
509 return I->second.Full;
510}
511
512static std::optional<GEPToVectorIndex>
513computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
514                        Type *VecElemTy, const DataLayout &DL) {
515 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
516 // helper.
517 LLVMContext &Ctx = GEP->getContext();
518 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
519  SmallMapVector<Value *, APInt, 4> VarOffsets;
520  APInt ConstOffset(BW, 0);
521
522 // Walk backwards through nested GEPs to collect both constant and variable
523 // offsets, so that nested vector GEP chains can be lowered in one step.
524 //
525 // Given this IR fragment as input:
526 //
527 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
528 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
529 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
530 // %3 = load i32, ptr addrspace(5) %2, align 4
531 //
532 // Combine both GEP operations in a single pass, producing:
533 // BasePtr = %0
534 // ConstOffset = 4
535 // VarOffsets = { %j -> element_size(<2 x i32>) }
536 //
537 // That lets us emit a single buffer_load directly into a VGPR, without ever
538 // allocating scratch memory for the intermediate pointer.
539 Value *CurPtr = GEP;
540 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
541 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
542 return {};
543
544 // Move to the next outer pointer.
545 CurPtr = CurGEP->getPointerOperand();
546 }
547
548 assert(CurPtr == Alloca && "GEP not based on alloca");
549
550 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
551 if (VarOffsets.size() > 1)
552 return {};
553
554 APInt IndexQuot;
555 int64_t Rem;
556 APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
557 if (Rem != 0)
558 return {};
559
560 GEPToVectorIndex Result;
561
562 if (!ConstOffset.isZero())
563 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
564
565 if (VarOffsets.empty())
566 return Result;
567
568 const auto &VarOffset = VarOffsets.front();
569 APInt OffsetQuot;
570 APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
571 if (Rem != 0 || OffsetQuot.isZero())
572 return {};
573
574 Result.VarIndex = VarOffset.first;
575 auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
576 if (!OffsetType)
577 return {};
578
579 if (!OffsetQuot.isOne())
580 Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
581
582 return Result;
583}
584
585/// Promotes a single user of the alloca to a vector form.
586///
587/// \param Inst Instruction to be promoted.
588/// \param DL Module Data Layout.
589/// \param AA Alloca Analysis.
590/// \param VecStoreSize Size of \p VectorTy in bytes.
591/// \param ElementSize Size of \p VectorTy element type in bytes.
592/// \param GetCurVal Callback that returns the current value of the vector
593///        (e.g. the last stored value), materializing it at \p Inst if needed.
596/// \return the stored value if \p Inst would have written to the alloca, or
597/// nullptr otherwise.
598static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
599                                        AllocaAnalysis &AA,
600 unsigned VecStoreSize,
601 unsigned ElementSize,
602 function_ref<Value *()> GetCurVal) {
603 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
604 // to do more folding, especially in the case of vector splats.
605  IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
606                                        InstSimplifyFolder(DL));
607  Builder.SetInsertPoint(Inst);
608
609 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
610 Type *PtrTy) -> Value * {
611 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
612 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
613 if (!PtrTy->isVectorTy())
614 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
615 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
616    // If we want to cast, e.g., a <2 x ptr> into a <4 x i32>, we need to
617    // first cast the ptr vector to <2 x i64>.
618    assert((Size % NumPtrElts == 0) && "Vector size not divisible");
619 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
620 return Builder.CreateBitOrPointerCast(
621 Val, FixedVectorType::get(EltTy, NumPtrElts));
622 };
623
624 Type *VecEltTy = AA.Vector.Ty->getElementType();
625
626 switch (Inst->getOpcode()) {
627 case Instruction::Load: {
628 Value *CurVal = GetCurVal();
629    Value *Index =
630        calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);
631
632 // We're loading the full vector.
633 Type *AccessTy = Inst->getType();
634 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
635 if (Constant *CI = dyn_cast<Constant>(Index)) {
636 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
637 if (AccessTy->isPtrOrPtrVectorTy())
638 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
639 else if (CurVal->getType()->isPtrOrPtrVectorTy())
640 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
641 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
642 Inst->replaceAllUsesWith(NewVal);
643 return nullptr;
644 }
645 }
646
647 // Loading a subvector.
648 if (isa<FixedVectorType>(AccessTy)) {
649 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
650 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
651 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
652 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
653
654 // If idx is dynamic, then sandwich load with bitcasts.
655 // ie. VectorTy SubVecTy AccessTy
656 // <64 x i8> -> <16 x i8> <8 x i16>
657 // <64 x i8> -> <4 x i128> -> i128 -> <8 x i16>
658 // Extracting subvector with dynamic index has very large expansion in
659 // the amdgpu backend. Limit to pow2.
660 FixedVectorType *VectorTy = AA.Vector.Ty;
661 TypeSize NumBits = DL.getTypeStoreSize(SubVecTy) * 8u;
662 uint64_t LoadAlign = cast<LoadInst>(Inst)->getAlign().value();
663 bool IsAlignedLoad = NumBits <= (LoadAlign * 8u);
664 unsigned TotalNumElts = VectorTy->getNumElements();
665 bool IsProperlyDivisible = TotalNumElts % NumLoadedElts == 0;
666 if (!isa<ConstantInt>(Index) &&
667 llvm::isPowerOf2_32(SubVecTy->getNumElements()) &&
668 IsProperlyDivisible && IsAlignedLoad) {
669 IntegerType *NewElemTy = Builder.getIntNTy(NumBits);
670 const unsigned NewNumElts =
671 DL.getTypeStoreSize(VectorTy) * 8u / NumBits;
672 const unsigned LShrAmt = llvm::Log2_32(SubVecTy->getNumElements());
673 FixedVectorType *BitCastTy =
674 FixedVectorType::get(NewElemTy, NewNumElts);
675 Value *BCVal = Builder.CreateBitCast(CurVal, BitCastTy);
676 Value *NewIdx = Builder.CreateLShr(
677 Index, ConstantInt::get(Index->getType(), LShrAmt));
678 Value *ExtVal = Builder.CreateExtractElement(BCVal, NewIdx);
679 Value *BCOut = Builder.CreateBitCast(ExtVal, AccessTy);
680 Inst->replaceAllUsesWith(BCOut);
681 return nullptr;
682 }
683
684 Value *SubVec = PoisonValue::get(SubVecTy);
685 for (unsigned K = 0; K < NumLoadedElts; ++K) {
686 Value *CurIdx =
687 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
688 SubVec = Builder.CreateInsertElement(
689 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
690 }
691
692 if (AccessTy->isPtrOrPtrVectorTy())
693 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
694 else if (SubVecTy->isPtrOrPtrVectorTy())
695 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
696
697 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
698 Inst->replaceAllUsesWith(SubVec);
699 return nullptr;
700 }
701
702 // We're loading one element.
703 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
704 if (AccessTy != VecEltTy)
705 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
706
707 Inst->replaceAllUsesWith(ExtractElement);
708 return nullptr;
709 }
710 case Instruction::Store: {
711 // For stores, it's a bit trickier and it depends on whether we're storing
712 // the full vector or not. If we're storing the full vector, we don't need
713 // to know the current value. If this is a store of a single element, we
714 // need to know the value.
716 Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
717 Value *Val = SI->getValueOperand();
718
719 // We're storing the full vector, we can handle this without knowing CurVal.
720 Type *AccessTy = Val->getType();
721 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
722 if (Constant *CI = dyn_cast<Constant>(Index)) {
723 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
724 if (AccessTy->isPtrOrPtrVectorTy())
725 Val = CreateTempPtrIntCast(Val, AccessTy);
726 else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
727 Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
728 return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
729 }
730 }
731
732 // Storing a subvector.
733 if (isa<FixedVectorType>(AccessTy)) {
734 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
735 const unsigned NumWrittenElts =
736 AccessSize / DL.getTypeStoreSize(VecEltTy);
737 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
738 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
739 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
740
741 if (SubVecTy->isPtrOrPtrVectorTy())
742 Val = CreateTempPtrIntCast(Val, SubVecTy);
743 else if (AccessTy->isPtrOrPtrVectorTy())
744 Val = CreateTempPtrIntCast(Val, AccessTy);
745
746 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
747
748 Value *CurVec = GetCurVal();
749 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
750 K < NumElts; ++K) {
751 Value *CurIdx =
752 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
753 CurVec = Builder.CreateInsertElement(
754 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
755 }
756 return CurVec;
757 }
758
759 if (Val->getType() != VecEltTy)
760 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
761 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
762 }
763 case Instruction::Call: {
764 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
765 // For memcpy, we need to know curval.
766 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
767 unsigned NumCopied = Length->getZExtValue() / ElementSize;
768 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
769 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
770 unsigned DestBegin = TI->DestIndex->getZExtValue();
771
772 SmallVector<int> Mask;
773 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
774 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
775 Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
776                               ? SrcBegin++
777                               : PoisonMaskElem);
778 } else {
779 Mask.push_back(Idx);
780 }
781 }
782
783 return Builder.CreateShuffleVector(GetCurVal(), Mask);
784 }
785
786 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
787 // For memset, we don't need to know the previous value because we
788 // currently only allow memsets that cover the whole alloca.
789 Value *Elt = MSI->getOperand(1);
790 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
791 if (BytesPerElt > 1) {
792 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
793
794 // If the element type of the vector is a pointer, we need to first cast
795 // to an integer, then use a PtrCast.
796 if (VecEltTy->isPointerTy()) {
797 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
798 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
799 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
800 } else
801 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
802 }
803
804 return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
805 }
806
807 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
808 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
809 Intr->replaceAllUsesWith(
810 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
811 DL.getTypeAllocSize(AA.Vector.Ty)));
812 return nullptr;
813 }
814 }
815
816 llvm_unreachable("Unsupported call when promoting alloca to vector");
817 }
818
819 default:
820 llvm_unreachable("Inconsistency in instructions promotable to vector");
821 }
822
823 llvm_unreachable("Did not return after promoting instruction!");
824}
825
826static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
827 const DataLayout &DL) {
828 // Access as a vector type can work if the size of the access vector is a
829 // multiple of the size of the alloca's vector element type.
830 //
831 // Examples:
832 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
833 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
834 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
835 // - 3*32 is not a multiple of 64
836 //
837 // We could handle more complicated cases, but it'd make things a lot more
838 // complicated.
839 if (isa<FixedVectorType>(AccessTy)) {
840 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
841 // If the type size and the store size don't match, we would need to do more
842    // than just bitcast to translate between extracted/insertable subvectors
843 // and the accessed value.
844 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
845 return false;
846 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
847 return AccTS.isKnownMultipleOf(VecTS);
848 }
849
850  return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
851                                              DL);
852}
853
854/// Iterates over an instruction worklist that may contain multiple instructions
855/// from the same basic block, but in a different order.
856template <typename InstContainer>
857static void forEachWorkListItem(const InstContainer &WorkList,
858 std::function<void(Instruction *)> Fn) {
859 // Bucket up uses of the alloca by the block they occur in.
860 // This is important because we have to handle multiple defs/uses in a block
861 // ourselves: SSAUpdater is purely for cross-block references.
862  SmallDenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
863  for (Instruction *User : WorkList)
864 UsesByBlock[User->getParent()].insert(User);
865
866 for (Instruction *User : WorkList) {
867 BasicBlock *BB = User->getParent();
868 auto &BlockUses = UsesByBlock[BB];
869
870 // Already processed, skip.
871 if (BlockUses.empty())
872 continue;
873
874 // Only user in the block, directly process it.
875 if (BlockUses.size() == 1) {
876 Fn(User);
877 continue;
878 }
879
880 // Multiple users in the block, do a linear scan to see users in order.
881 for (Instruction &Inst : *BB) {
882 if (!BlockUses.contains(&Inst))
883 continue;
884
885 Fn(&Inst);
886 }
887
888 // Clear the block so we know it's been processed.
889 BlockUses.clear();
890 }
891}
892
893/// Find an insert point after an alloca, after all other allocas clustered at
894/// the start of the block.
895static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
896                                                    BasicBlock::iterator I) {
897  for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
898 ;
899 return I;
900}
901
902FixedVectorType *
903AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
904 if (DisablePromoteAllocaToVector) {
905 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
906 return nullptr;
907 }
908
909 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
910 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
911 uint64_t NumElems = 1;
912 Type *ElemTy;
913 do {
914 NumElems *= ArrayTy->getNumElements();
915 ElemTy = ArrayTy->getElementType();
916 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
917
918 // Check for array of vectors
919 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
920 if (InnerVectorTy) {
921 NumElems *= InnerVectorTy->getNumElements();
922 ElemTy = InnerVectorTy->getElementType();
923 }
924
925 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
926 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
927 if (ElementSize > 0) {
928 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
929 // Expand vector if required to match padding of inner type,
930 // i.e. odd size subvectors.
931 // Storage size of new vector must match that of alloca for correct
932 // behaviour of byte offsets and GEP computation.
933 if (NumElems * ElementSize != AllocaSize)
934 NumElems = AllocaSize / ElementSize;
935 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
936 VectorTy = FixedVectorType::get(ElemTy, NumElems);
937 }
938 }
939 }
940 if (!VectorTy) {
941 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
942 return nullptr;
943 }
944
945 const unsigned MaxElements =
946 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
947
948 if (VectorTy->getNumElements() > MaxElements ||
949 VectorTy->getNumElements() < 2) {
950 LLVM_DEBUG(dbgs() << " " << *VectorTy
951 << " has an unsupported number of elements\n");
952 return nullptr;
953 }
954
955 Type *VecEltTy = VectorTy->getElementType();
956 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
957 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
958 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
959 "does not match the type's size\n");
960 return nullptr;
961 }
962
963 return VectorTy;
964}
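// For example (illustrative): an alloca of [4 x [2 x float]] is flattened to
// <8 x float>, and an array of vectors such as [10 x <2 x i32>] becomes
// <20 x i32>. A candidate whose flattened element count exceeds
// MaxVectorRegs * 32 / element-size-in-bits, or that has fewer than two
// elements, is rejected instead.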
965
966void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
967 if (AA.HaveSelectOrPHI) {
968 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
969 return;
970 }
971
972 Type *AllocaTy = AA.Alloca->getAllocatedType();
973 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
974 if (!AA.Vector.Ty)
975 return;
976
977 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
978 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
979 << " " << *Inst << "\n");
980 AA.Vector.Ty = nullptr;
981 };
982
983 Type *VecEltTy = AA.Vector.Ty->getElementType();
984 unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
985 assert(ElementSize > 0);
986 for (auto *U : AA.Uses) {
987 Instruction *Inst = cast<Instruction>(U->getUser());
988
989 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
990 assert(!isa<StoreInst>(Inst) ||
991 U->getOperandNo() == StoreInst::getPointerOperandIndex());
992
993 Type *AccessTy = getLoadStoreType(Inst);
994 if (AccessTy->isAggregateType())
995 return RejectUser(Inst, "unsupported load/store as aggregate");
996 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
997
998 // Check that this is a simple access of a vector element.
999 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
1000 : cast<StoreInst>(Inst)->isSimple();
1001 if (!IsSimple)
1002 return RejectUser(Inst, "not a simple load or store");
1003
1004 Ptr = Ptr->stripPointerCasts();
1005
1006 // Alloca already accessed as vector.
1007 if (Ptr == AA.Alloca &&
1008 DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
1009 DL->getTypeStoreSize(AccessTy)) {
1010 AA.Vector.Worklist.push_back(Inst);
1011 continue;
1012 }
1013
1014 if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
1015 return RejectUser(Inst, "not a supported access type");
1016
1017 AA.Vector.Worklist.push_back(Inst);
1018 continue;
1019 }
1020
1021 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
1022 // If we can't compute a vector index from this GEP, then we can't
1023 // promote this alloca to vector.
1024 auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
1025 if (!Index)
1026 return RejectUser(Inst, "cannot compute vector index for GEP");
1027
1028 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
1029 AA.Vector.UsersToRemove.push_back(Inst);
1030 continue;
1031 }
1032
1033 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
1034 MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
1035 AA.Vector.Worklist.push_back(Inst);
1036 continue;
1037 }
1038
1039 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
1040 if (TransferInst->isVolatile())
1041 return RejectUser(Inst, "mem transfer inst is volatile");
1042
1043 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
1044 if (!Len || (Len->getZExtValue() % ElementSize))
1045 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1046 "not a multiple of the vector element size");
1047
1048 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1049 if (Ptr == AA.Alloca)
1050 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1051
1052      auto *GEP = cast<GetElementPtrInst>(Ptr);
1053      const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
1054 if (GEPI.VarIndex)
1055 return nullptr;
1056 if (GEPI.ConstIndex)
1057 return GEPI.ConstIndex;
1058 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1059 };
1060
1061 MemTransferInfo *TI =
1062 &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1063 unsigned OpNum = U->getOperandNo();
1064 if (OpNum == 0) {
1065 Value *Dest = TransferInst->getDest();
1066 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1067 if (!Index)
1068 return RejectUser(Inst, "could not calculate constant dest index");
1069 TI->DestIndex = Index;
1070 } else {
1071 assert(OpNum == 1);
1072 Value *Src = TransferInst->getSource();
1073 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1074 if (!Index)
1075 return RejectUser(Inst, "could not calculate constant src index");
1076 TI->SrcIndex = Index;
1077 }
1078 continue;
1079 }
1080
1081 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
1082 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1083 AA.Vector.Worklist.push_back(Inst);
1084 continue;
1085 }
1086 }
1087
1088 // Ignore assume-like intrinsics and comparisons used in assumes.
1089 if (isAssumeLikeIntrinsic(Inst)) {
1090 if (!Inst->use_empty())
1091 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1092 AA.Vector.UsersToRemove.push_back(Inst);
1093 continue;
1094 }
1095
1096 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
1097 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1098 })) {
1099 AA.Vector.UsersToRemove.push_back(Inst);
1100 continue;
1101 }
1102
1103 return RejectUser(Inst, "unhandled alloca user");
1104 }
1105
1106 // Follow-up check to ensure we've seen both sides of all transfer insts.
1107 for (const auto &Entry : AA.Vector.TransferInfo) {
1108 const MemTransferInfo &TI = Entry.second;
1109 if (!TI.SrcIndex || !TI.DestIndex)
1110 return RejectUser(Entry.first,
1111 "mem transfer inst between different objects");
1112 AA.Vector.Worklist.push_back(Entry.first);
1113 }
1114}
1115
1116void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1117 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1118 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1119 << " -> " << *AA.Vector.Ty << '\n');
1120 const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
1121
1122 Type *VecEltTy = AA.Vector.Ty->getElementType();
1123 const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
1124
1125  // Alloca is uninitialized memory. Imitate that by making the first value
1126  // a frozen poison value.
1127 SSAUpdater Updater;
1128 Updater.Initialize(AA.Vector.Ty, "promotealloca");
1129
1130 BasicBlock *EntryBB = AA.Alloca->getParent();
1131 BasicBlock::iterator InitInsertPos =
1132 skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
1133 IRBuilder<> Builder(&*InitInsertPos);
1134 Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
1135 AllocaInitValue->takeName(AA.Alloca);
1136
1137 Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
1138
1139 // First handle the initial worklist, in basic block order.
1140 //
1141 // Insert a placeholder whenever we need the vector value at the top of a
1142 // basic block.
1143 SmallVector<Instruction *> Placeholders;
1144 forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
1145 BasicBlock *BB = I->getParent();
1146 auto GetCurVal = [&]() -> Value * {
1147 if (Value *CurVal = Updater.FindValueForBlock(BB))
1148 return CurVal;
1149
1150 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1151 return Placeholders.back();
1152
1153 // If the current value in the basic block is not yet known, insert a
1154 // placeholder that we will replace later.
1155 IRBuilder<> Builder(I);
1156 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1157 PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
1158 Placeholders.push_back(Placeholder);
1159 return Placeholders.back();
1160 };
1161
1162 Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
1163 ElementSize, GetCurVal);
1164 if (Result)
1165 Updater.AddAvailableValue(BB, Result);
1166 });
1167
1168 // Now fixup the placeholders.
1169 SmallVector<Value *> PlaceholderToNewVal(Placeholders.size());
1170 for (auto [Index, Placeholder] : enumerate(Placeholders)) {
1171 Value *NewVal = Updater.GetValueInMiddleOfBlock(Placeholder->getParent());
1172 PlaceholderToNewVal[Index] = NewVal;
1173 Placeholder->replaceAllUsesWith(NewVal);
1174 }
1175 // Note: we cannot merge this loop with the previous one because it is
1176 // possible that the placeholder itself can be used in the SSAUpdater. The
1177 // replaceAllUsesWith doesn't replace those uses.
1178 for (auto [Index, Placeholder] : enumerate(Placeholders)) {
1179 if (!Placeholder->use_empty())
1180 Placeholder->replaceAllUsesWith(PlaceholderToNewVal[Index]);
1181 Placeholder->eraseFromParent();
1182 }
1183
1184 // Delete all instructions.
1185 for (Instruction *I : AA.Vector.Worklist) {
1186 assert(I->use_empty());
1187 I->eraseFromParent();
1188 }
1189
1190  // Delete all the users that are known to be removable.
1191 for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
1192 I->dropDroppableUses();
1193 assert(I->use_empty());
1194 I->eraseFromParent();
1195 }
1196
1197 // Alloca should now be dead too.
1198 assert(AA.Alloca->use_empty());
1199 AA.Alloca->eraseFromParent();
1200}
1201
1202std::pair<Value *, Value *>
1203AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1204 Function &F = *Builder.GetInsertBlock()->getParent();
1205  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1206
1207 if (!IsAMDHSA) {
1208 CallInst *LocalSizeY =
1209 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1210 CallInst *LocalSizeZ =
1211 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1212
1213 ST.makeLIDRangeMetadata(LocalSizeY);
1214 ST.makeLIDRangeMetadata(LocalSizeZ);
1215
1216 return std::pair(LocalSizeY, LocalSizeZ);
1217 }
1218
1219 // We must read the size out of the dispatch pointer.
1220 assert(IsAMDGCN);
1221
1222 // We are indexing into this struct, and want to extract the workgroup_size_*
1223 // fields.
1224 //
1225 // typedef struct hsa_kernel_dispatch_packet_s {
1226 // uint16_t header;
1227 // uint16_t setup;
1228 // uint16_t workgroup_size_x ;
1229 // uint16_t workgroup_size_y;
1230 // uint16_t workgroup_size_z;
1231 // uint16_t reserved0;
1232 // uint32_t grid_size_x ;
1233 // uint32_t grid_size_y ;
1234 // uint32_t grid_size_z;
1235 //
1236 // uint32_t private_segment_size;
1237 // uint32_t group_segment_size;
1238 // uint64_t kernel_object;
1239 //
1240 // #ifdef HSA_LARGE_MODEL
1241 // void *kernarg_address;
1242 // #elif defined HSA_LITTLE_ENDIAN
1243 // void *kernarg_address;
1244 // uint32_t reserved1;
1245 // #else
1246 // uint32_t reserved1;
1247 // void *kernarg_address;
1248 // #endif
1249 // uint64_t reserved2;
1250 // hsa_signal_t completion_signal; // uint64_t wrapper
1251 // } hsa_kernel_dispatch_packet_t
1252 //
1253 CallInst *DispatchPtr =
1254 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1255 DispatchPtr->addRetAttr(Attribute::NoAlias);
1256 DispatchPtr->addRetAttr(Attribute::NonNull);
1257 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1258
1259 // Size of the dispatch packet struct.
1260 DispatchPtr->addDereferenceableRetAttr(64);
1261
1262 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1263
1264 // We could do a single 64-bit load here, but it's likely that the basic
1265 // 32-bit and extract sequence is already present, and it is probably easier
1266 // to CSE this. The loads should be mergeable later anyway.
1267 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1268 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1269
1270 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1271 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1272
1273 MDNode *MD = MDNode::get(Mod->getContext(), {});
1274 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1275 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1276 ST.makeLIDRangeMetadata(LoadZU);
1277
1278 // Extract y component. Upper half of LoadZU should be zero already.
1279 Value *Y = Builder.CreateLShr(LoadXY, 16);
1280
1281 return std::pair(Y, LoadZU);
1282}
1283
1284Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1285 unsigned N) {
1286 Function *F = Builder.GetInsertBlock()->getParent();
1287  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
1288  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1289  StringRef AttrName;
1290
1291 switch (N) {
1292 case 0:
1293 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1294 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1295 AttrName = "amdgpu-no-workitem-id-x";
1296 break;
1297 case 1:
1298 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1299 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1300 AttrName = "amdgpu-no-workitem-id-y";
1301 break;
1302
1303 case 2:
1304 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1305 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1306 AttrName = "amdgpu-no-workitem-id-z";
1307 break;
1308 default:
1309 llvm_unreachable("invalid dimension");
1310 }
1311
1312 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1313 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1314 ST.makeLIDRangeMetadata(CI);
1315 F->removeFnAttr(AttrName);
1316
1317 return CI;
1318}
1319
1320static bool isCallPromotable(CallInst *CI) {
1321  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1322  if (!II)
1323 return false;
1324
1325 switch (II->getIntrinsicID()) {
1326 case Intrinsic::memcpy:
1327 case Intrinsic::memmove:
1328 case Intrinsic::memset:
1329 case Intrinsic::lifetime_start:
1330 case Intrinsic::lifetime_end:
1331 case Intrinsic::invariant_start:
1332 case Intrinsic::invariant_end:
1333 case Intrinsic::launder_invariant_group:
1334 case Intrinsic::strip_invariant_group:
1335 case Intrinsic::objectsize:
1336 return true;
1337 default:
1338 return false;
1339 }
1340}
1341
1342bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1343 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1344 int OpIdx1) const {
1345 // Figure out which operand is the one we might not be promoting.
1346 Value *OtherOp = Inst->getOperand(OpIdx0);
1347 if (Val == OtherOp)
1348 OtherOp = Inst->getOperand(OpIdx1);
1349
1350  if (isa<ConstantPointerNull>(OtherOp))
1351    return true;
1352
1353 // TODO: getUnderlyingObject will not work on a vector getelementptr
1354 Value *OtherObj = getUnderlyingObject(OtherOp);
1355 if (!isa<AllocaInst>(OtherObj))
1356 return false;
1357
1358 // TODO: We should be able to replace undefs with the right pointer type.
1359
1360 // TODO: If we know the other base object is another promotable
1361 // alloca, not necessarily this alloca, we can do this. The
1362 // important part is both must have the same address space at
1363 // the end.
1364 if (OtherObj != BaseAlloca) {
1365 LLVM_DEBUG(
1366 dbgs() << "Found a binary instruction with another alloca object\n");
1367 return false;
1368 }
1369
1370 return true;
1371}
1372
1373void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1374 if (DisablePromoteAllocaToLDS) {
1375 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1376 return;
1377 }
1378
1379 // Don't promote the alloca to LDS for shader calling conventions as the work
1380 // item ID intrinsics are not supported for these calling conventions.
1381 // Furthermore not all LDS is available for some of the stages.
1382 const Function &ContainingFunction = *AA.Alloca->getFunction();
1383 CallingConv::ID CC = ContainingFunction.getCallingConv();
1384
1385 switch (CC) {
1386  case CallingConv::AMDGPU_KERNEL:
1387  case CallingConv::SPIR_KERNEL:
1388    break;
1389 default:
1390 LLVM_DEBUG(
1391 dbgs()
1392 << " promote alloca to LDS not supported with calling convention.\n");
1393 return;
1394 }
1395
1396 for (Use *Use : AA.Uses) {
1397 auto *User = Use->getUser();
1398
1399 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1400 if (!isCallPromotable(CI))
1401 return;
1402
1403 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1404 AA.LDS.Worklist.push_back(User);
1405 continue;
1406 }
1407
1408    Instruction *UseInst = cast<Instruction>(User);
1409    if (UseInst->getOpcode() == Instruction::PtrToInt)
1410 return;
1411
1412 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1413 if (LI->isVolatile())
1414 return;
1415 continue;
1416 }
1417
1418 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1419 if (SI->isVolatile())
1420 return;
1421 continue;
1422 }
1423
1424 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1425 if (RMW->isVolatile())
1426 return;
1427 continue;
1428 }
1429
1430 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1431 if (CAS->isVolatile())
1432 return;
1433 continue;
1434 }
1435
1436 // Only promote a select if we know that the other select operand
1437 // is from another pointer that will also be promoted.
1438 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1439 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
1440 return;
1441
1442 // May need to rewrite constant operands.
1443 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1444 AA.LDS.Worklist.push_back(ICmp);
1445 continue;
1446 }
1447
1448    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1449      // Be conservative if an address could be computed outside the bounds of
1450 // the alloca.
1451 if (!GEP->isInBounds())
1452 return;
1453    } else if (!isa<SelectInst>(UseInst) && !isa<PHINode>(UseInst)) {
1454      // Do not promote vector/aggregate type instructions. It is hard to track
1455 // their users.
1456
1457 // Do not promote addrspacecast.
1458 //
1459 // TODO: If we know the address is only observed through flat pointers, we
1460 // could still promote.
1461 return;
1462 }
1463
1464 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1465 AA.LDS.Worklist.push_back(User);
1466 }
1467
1468 AA.LDS.Enable = true;
1469}
1470
1471bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1472
1473 FunctionType *FTy = F.getFunctionType();
1474  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1475
1476 // If the function has any arguments in the local address space, then it's
1477 // possible these arguments require the entire local memory space, so
1478 // we cannot use local memory in the pass.
1479 for (Type *ParamTy : FTy->params()) {
1480 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1481 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1482 LocalMemLimit = 0;
1483 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1484 "local memory disabled.\n");
1485 return false;
1486 }
1487 }
1488
1489 LocalMemLimit = ST.getAddressableLocalMemorySize();
1490 if (LocalMemLimit == 0)
1491 return false;
1492
1493  SmallVector<const Constant *, 16> Stack;
1494  SmallPtrSet<const Constant *, 8> VisitedConstants;
1495  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1496
1497 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1498 for (const User *U : Val->users()) {
1499 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1500 if (Use->getFunction() == &F)
1501 return true;
1502 } else {
1503 const Constant *C = cast<Constant>(U);
1504 if (VisitedConstants.insert(C).second)
1505 Stack.push_back(C);
1506 }
1507 }
1508
1509 return false;
1510 };
1511
1512 for (GlobalVariable &GV : Mod->globals()) {
1513    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1514      continue;
1515
1516 if (visitUsers(&GV, &GV)) {
1517 UsedLDS.insert(&GV);
1518 Stack.clear();
1519 continue;
1520 }
1521
1522 // For any ConstantExpr uses, we need to recursively search the users until
1523 // we see a function.
1524 while (!Stack.empty()) {
1525 const Constant *C = Stack.pop_back_val();
1526 if (visitUsers(&GV, C)) {
1527 UsedLDS.insert(&GV);
1528 Stack.clear();
1529 break;
1530 }
1531 }
1532 }
1533
1534 const DataLayout &DL = Mod->getDataLayout();
1535 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1536 AllocatedSizes.reserve(UsedLDS.size());
1537
1538 for (const GlobalVariable *GV : UsedLDS) {
1539 Align Alignment =
1540 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1541 uint64_t AllocSize = GV->getGlobalSize(DL);
1542
1543 // HIP uses an extern unsized array in local address space for dynamically
1544 // allocated shared memory. In that case, we have to disable the promotion.
1545 if (GV->hasExternalLinkage() && AllocSize == 0) {
1546 LocalMemLimit = 0;
1547 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1548 "local memory. Promoting to local memory "
1549 "disabled.\n");
1550 return false;
1551 }
1552
1553 AllocatedSizes.emplace_back(AllocSize, Alignment);
1554 }
1555
1556 // Sort to try to estimate the worst case alignment padding
1557 //
1558 // FIXME: We should really do something to fix the addresses to a more optimal
1559 // value instead
1560 llvm::sort(AllocatedSizes, llvm::less_second());
1561
1562 // Check how much local memory is being used by global objects
1563 CurrentLocalMemUsage = 0;
1564
1565 // FIXME: Try to account for padding here. The real padding and address is
1566 // currently determined from the inverse order of uses in the function when
1567 // legalizing, which could also potentially change. We try to estimate the
1568 // worst case here, but we probably should fix the addresses earlier.
1569 for (auto Alloc : AllocatedSizes) {
1570 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1571 CurrentLocalMemUsage += Alloc.first;
1572 }
1573
1574 unsigned MaxOccupancy =
1575 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1576 .second;
1577
1578 // Round up to the next tier of usage.
1579 unsigned MaxSizeWithWaveCount =
1580 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1581
1582 // Program may already use more LDS than is usable at maximum occupancy.
1583 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1584 return false;
1585
1586 LocalMemLimit = MaxSizeWithWaveCount;
1587
1588 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1589 << " bytes of LDS\n"
1590 << " Rounding size to " << MaxSizeWithWaveCount
1591 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1592 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1593 << " available for promotion\n");
1594
1595 return true;
1596}
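// Worked example (illustrative numbers only): on a subtarget with 64 KiB of
// addressable LDS, if LDS globals reachable from this kernel already occupy
// 4 KiB and the resulting occupancy tier still allows the full 64 KiB, then
// LocalMemLimit is 65536 and roughly 60 KiB remain for tryPromoteAllocaToLDS
// to hand out, one alloca at a time, in blocks of
// WorkGroupSize * alloca-type-size bytes.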
1597
1598// FIXME: Should try to pick the most likely to be profitable allocas first.
1599bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(
1600 AllocaAnalysis &AA, bool SufficientLDS,
1601 SetVector<IntrinsicInst *> &DeferredIntrs) {
1602 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1603
1604 // Not likely to have sufficient local memory for promotion.
1605 if (!SufficientLDS)
1606 return false;
1607
1608 const DataLayout &DL = Mod->getDataLayout();
1609 IRBuilder<> Builder(AA.Alloca);
1610
1611 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1612 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1613 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1614
1615 Align Alignment = DL.getValueOrABITypeAlignment(
1616 AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());
1617
1618 // FIXME: This computed padding is likely wrong since it depends on inverse
1619 // usage order.
1620 //
1621  // FIXME: It is also possible that if we're allowed to use all of the memory
1622  // we could end up using more than the maximum due to alignment padding.
1623
1624 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1625 uint32_t AllocSize =
1626 WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
1627 NewSize += AllocSize;
1628
1629 if (NewSize > LocalMemLimit) {
1630 LLVM_DEBUG(dbgs() << " " << AllocSize
1631 << " bytes of local memory not available to promote\n");
1632 return false;
1633 }
1634
1635 CurrentLocalMemUsage = NewSize;
1636
1637 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1638
1639 Function *F = AA.Alloca->getFunction();
1640
1641 Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
1642  GlobalVariable *GV = new GlobalVariable(
1643      *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1644      Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1645      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1646  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1647 GV->setAlignment(AA.Alloca->getAlign());
1648
1649 Value *TCntY, *TCntZ;
1650
1651 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1652 Value *TIdX = getWorkitemID(Builder, 0);
1653 Value *TIdY = getWorkitemID(Builder, 1);
1654 Value *TIdZ = getWorkitemID(Builder, 2);
1655
1656 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1657 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1658 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1659 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1660 TID = Builder.CreateAdd(TID, TIdZ);
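  // The sequence above computes the linearized workitem id within the
  // workgroup, TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ, which
  // selects this lane's private slice of the per-workgroup LDS array created
  // above.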
1661
1662 LLVMContext &Context = Mod->getContext();
1663  Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1664
1665 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1666 AA.Alloca->mutateType(Offset->getType());
1667 AA.Alloca->replaceAllUsesWith(Offset);
1668 AA.Alloca->eraseFromParent();
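// Rough shape of the rewrite (illustrative IR, names and sizes invented):
//   %buf = alloca [4 x i32], align 4, addrspace(5)        ; in kernel @k
// becomes
//   @k.buf = internal addrspace(3) global [256 x [4 x i32]] poison
// with every former use of %buf routed through
//   getelementptr inbounds [256 x [4 x i32]], ptr addrspace(3) @k.buf, i32 0, i32 %TID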
1669
1670 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1671
1672 for (Value *V : AA.LDS.Worklist) {
1673 CallInst *Call = dyn_cast<CallInst>(V);
1674 if (!Call) {
1675 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1676 Value *LHS = CI->getOperand(0);
1677 Value *RHS = CI->getOperand(1);
1678
1679 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1680 if (isa<Constant>(LHS))
1681 CI->setOperand(0, Constant::getNullValue(NewTy));
1682
1683 if (isa<Constant>(RHS))
1684 CI->setOperand(1, Constant::getNullValue(NewTy));
1685
1686 continue;
1687 }
1688
1689 // The operand's value should be corrected on its own and we don't want to
1690 // touch the users.
1691 if (isa<AddrSpaceCastInst>(V))
1692 continue;
1693
1694 assert(V->getType()->isPtrOrPtrVectorTy());
1695
1696 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1697 V->mutateType(NewTy);
1698
1699 // Adjust the types of any constant operands.
1700 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1701 if (isa<Constant>(SI->getOperand(1)))
1702 SI->setOperand(1, Constant::getNullValue(NewTy));
1703
1704 if (isa<Constant>(SI->getOperand(2)))
1705 SI->setOperand(2, Constant::getNullValue(NewTy));
1706 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1707 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1708 if (isa<Constant>(
1709 Phi->getIncomingValue(I)))
1710 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1711 }
1712 }
1713
1714 continue;
1715 }
1716
1717 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1718 Builder.SetInsertPoint(Intr);
1719 switch (Intr->getIntrinsicID()) {
1720 case Intrinsic::lifetime_start:
1721 case Intrinsic::lifetime_end:
1722 // These intrinsics are for address space 0 only
1723 Intr->eraseFromParent();
1724 continue;
1725 case Intrinsic::memcpy:
1726 case Intrinsic::memmove:
1727 // These have 2 pointer operands. If the second pointer also needs to be
1728 // replaced, we defer processing of these intrinsics until all other
1729 // values are processed.
1730 DeferredIntrs.insert(Intr);
1731 continue;
1732 case Intrinsic::memset: {
1733 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1734 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1735 MemSet->getLength(), MemSet->getDestAlign(),
1736 MemSet->isVolatile());
1737 Intr->eraseFromParent();
1738 continue;
1739 }
1740 case Intrinsic::invariant_start:
1741 case Intrinsic::invariant_end:
1742 case Intrinsic::launder_invariant_group:
1743 case Intrinsic::strip_invariant_group: {
1744 SmallVector<Value *> Args;
1745 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1746 Args.emplace_back(Intr->getArgOperand(0));
1747 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1748 Args.emplace_back(Intr->getArgOperand(0));
1749 Args.emplace_back(Intr->getArgOperand(1));
1750 }
1751 Args.emplace_back(Offset);
1752 Function *F = Intrinsic::getOrInsertDeclaration(
1753 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1754 CallInst *NewIntr =
1755 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1756 Intr->mutateType(NewIntr->getType());
1757 Intr->replaceAllUsesWith(NewIntr);
1758 Intr->eraseFromParent();
1759 continue;
1760 }
1761 case Intrinsic::objectsize: {
1762 Value *Src = Intr->getOperand(0);
1763
1764 CallInst *NewCall = Builder.CreateIntrinsic(
1765 Intrinsic::objectsize,
1766 {Intr->getType(), NewPtrTy},
1767 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1768 Intr->replaceAllUsesWith(NewCall);
1769 Intr->eraseFromParent();
1770 continue;
1771 }
1772 default:
1773 Intr->print(errs());
1774 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1775 }
1776 }
1777
1778 return true;
1779}
1780
1781void AMDGPUPromoteAllocaImpl::finishDeferredAllocaToLDSPromotion(
1782 SetVector<IntrinsicInst *> &DeferredIntrs) {
1783
1784 for (IntrinsicInst *Intr : DeferredIntrs) {
1785 IRBuilder<> Builder(Intr);
1786 Builder.SetInsertPoint(Intr);
1787 Intrinsic::ID ID = Intr->getIntrinsicID();
1788 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1789
1790 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1791 auto *B = Builder.CreateMemTransferInst(
1792 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1793 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1794
1795 for (unsigned I = 0; I != 2; ++I) {
1796 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1797 B->addDereferenceableParamAttr(I, Bytes);
1798 }
1799 }
1800
1801 Intr->eraseFromParent();
1802 }
1803}