1//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// Eliminates allocas by either converting them into vectors or by migrating
10// them to local address space.
11//
12// Two passes are exposed by this file:
13// - "promote-alloca-to-vector", which runs early in the pipeline and only
14// promotes to vector. Promotion to vector is almost always profitable
15// except when the alloca is too big and the promotion would result in
16// very high register pressure.
17// - "promote-alloca", which does both promotion to vector and LDS and runs
18// much later in the pipeline. This runs after SROA because promoting to
19// LDS is of course less profitable than getting rid of the alloca or
20// vectorizing it, thus we only want to do it when the only alternative is
21// lowering the alloca to stack.
22//
23// Note that both of them exist for the old and new PMs. The new PM passes are
24// declared in AMDGPU.h and the legacy PM ones are declared here.
25//
26//===----------------------------------------------------------------------===//
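//
// Illustrative usage (an assumption, not taken from this file): both flavours
// can be exercised from opt's new pass manager pipeline, e.g.
//
//   opt -mtriple=amdgcn-- -passes=amdgpu-promote-alloca-to-vector -S in.ll
//   opt -mtriple=amdgcn-- -passes=amdgpu-promote-alloca -S in.ll
//
// The pass-name spellings above are assumed from the descriptions in this
// header; AMDGPUPassRegistry.def has the authoritative registrations.
//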
27
28#include "AMDGPU.h"
29#include "GCNSubtarget.h"
30#include "Utils/AMDGPUBaseInfo.h"
31#include "llvm/ADT/STLExtras.h"
32#include "llvm/ADT/MapVector.h"
33#include "llvm/Analysis/InstSimplifyFolder.h"
34#include "llvm/Analysis/LoopInfo.h"
35#include "llvm/Analysis/ValueTracking.h"
36#include "llvm/CodeGen/TargetPassConfig.h"
37#include "llvm/IR/DerivedTypes.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/IntrinsicInst.h"
40#include "llvm/IR/IntrinsicsAMDGPU.h"
41#include "llvm/IR/IntrinsicsR600.h"
42#include "llvm/IR/PatternMatch.h"
43#include "llvm/InitializePasses.h"
44#include "llvm/Pass.h"
45#include "llvm/Target/TargetMachine.h"
46#include "llvm/Transforms/Utils/SSAUpdater.h"
47
48#define DEBUG_TYPE "amdgpu-promote-alloca"
49
50using namespace llvm;
51
52namespace {
53
54static cl::opt<bool>
55 DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
56 cl::desc("Disable promote alloca to vector"),
57 cl::init(false));
58
59static cl::opt<bool>
60 DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
61 cl::desc("Disable promote alloca to LDS"),
62 cl::init(false));
63
64static cl::opt<unsigned> PromoteAllocaToVectorLimit(
65 "amdgpu-promote-alloca-to-vector-limit",
66 cl::desc("Maximum byte size to consider promote alloca to vector"),
67 cl::init(0));
68
69static cl::opt<unsigned> PromoteAllocaToVectorMaxRegs(
70 "amdgpu-promote-alloca-to-vector-max-regs",
72 "Maximum vector size (in 32b registers) to use when promoting alloca"),
73 cl::init(32));
74
75// Use up to 1/4 of available register budget for vectorization.
76// FIXME: Increase the limit for whole function budgets? Perhaps x2?
77static cl::opt<unsigned> PromoteAllocaToVectorVGPRRatio(
78 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
79 cl::desc("Ratio of VGPRs to budget for promoting alloca to vectors"),
80 cl::init(4));
81
82static cl::opt<unsigned>
83 LoopUserWeight("promote-alloca-vector-loop-user-weight",
84 cl::desc("The bonus weight of users of allocas within loop "
85 "when sorting profitable allocas"),
86 cl::init(4));
87
88// We support vector indices of the form (A * stride) + B
89// All parts are optional.
90struct GEPToVectorIndex {
91 Value *VarIndex = nullptr; // defaults to 0
92 ConstantInt *VarMul = nullptr; // defaults to 1
93 ConstantInt *ConstIndex = nullptr; // defaults to 0
94 Value *Full = nullptr;
95};
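// Illustrative decomposition (assumed example, not from a test case): for a
// GEP whose byte offset folds to (%i * 8) + 4 over a vector with 4-byte
// elements, computeGEPToVectorIndex divides by the element size and records
//   VarIndex   = %i
//   VarMul     = 2
//   ConstIndex = 1
// and Full later caches the materialized i32 index "(%i * 2) + 1".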
96
97struct MemTransferInfo {
98 ConstantInt *SrcIndex = nullptr;
99 ConstantInt *DestIndex = nullptr;
100};
101
102// Analysis for planning the different strategies of alloca promotion.
103struct AllocaAnalysis {
104 AllocaInst *Alloca = nullptr;
105 DenseSet<Value *> Pointers;
106 SmallVector<Use *> Uses;
107 unsigned Score = 0;
108 bool HaveSelectOrPHI = false;
109 struct {
110 FixedVectorType *Ty = nullptr;
111 MapVector<GetElementPtrInst *, GEPToVectorIndex> GEPVectorIdx;
112 SmallVector<Instruction *> UsersToRemove;
113 SmallVector<Instruction *> Worklist;
114 MapVector<MemTransferInst *, MemTransferInfo> TransferInfo;
115 } Vector;
116 struct {
117 bool Enable = false;
118 SmallVector<User *> Worklist;
119 } LDS;
120
121 explicit AllocaAnalysis(AllocaInst *Alloca) : Alloca(Alloca) {}
122};
123
124// Shared implementation which can do both promotion to vector and to LDS.
125class AMDGPUPromoteAllocaImpl {
126private:
127 const TargetMachine &TM;
128 LoopInfo &LI;
129 Module *Mod = nullptr;
130 const DataLayout *DL = nullptr;
131
132 // FIXME: This should be per-kernel.
133 uint32_t LocalMemLimit = 0;
134 uint32_t CurrentLocalMemUsage = 0;
135 unsigned MaxVGPRs;
136 unsigned VGPRBudgetRatio;
137 unsigned MaxVectorRegs;
138
139 bool IsAMDGCN = false;
140 bool IsAMDHSA = false;
141
142 std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
143 Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
144
145 bool collectAllocaUses(AllocaAnalysis &AA) const;
146
147 /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
148 /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
149 /// Returns true if both operands are derived from the same alloca. Val should
150 /// be the same value as one of the input operands of UseInst.
151 bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
152 Instruction *UseInst, int OpIdx0,
153 int OpIdx1) const;
154
155 /// Check whether we have enough local memory for promotion.
156 bool hasSufficientLocalMem(const Function &F);
157
158 FixedVectorType *getVectorTypeForAlloca(Type *AllocaTy) const;
159 void analyzePromoteToVector(AllocaAnalysis &AA) const;
160 void promoteAllocaToVector(AllocaAnalysis &AA);
161 void analyzePromoteToLDS(AllocaAnalysis &AA) const;
162 bool tryPromoteAllocaToLDS(AllocaAnalysis &AA, bool SufficientLDS);
163
164 void scoreAlloca(AllocaAnalysis &AA) const;
165
166 void setFunctionLimits(const Function &F);
167
168public:
169 AMDGPUPromoteAllocaImpl(TargetMachine &TM, LoopInfo &LI) : TM(TM), LI(LI) {
170
171 const Triple &TT = TM.getTargetTriple();
172 IsAMDGCN = TT.isAMDGCN();
173 IsAMDHSA = TT.getOS() == Triple::AMDHSA;
174 }
175
176 bool run(Function &F, bool PromoteToLDS);
177};
178
179// FIXME: This can create globals so should be a module pass.
180class AMDGPUPromoteAlloca : public FunctionPass {
181public:
182 static char ID;
183
184 AMDGPUPromoteAlloca() : FunctionPass(ID) {}
185
186 bool runOnFunction(Function &F) override {
187 if (skipFunction(F))
188 return false;
189 if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
190 return AMDGPUPromoteAllocaImpl(
191 TPC->getTM<TargetMachine>(),
192 getAnalysis<LoopInfoWrapperPass>().getLoopInfo())
193 .run(F, /*PromoteToLDS*/ true);
194 return false;
195 }
196
197 StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
198
199 void getAnalysisUsage(AnalysisUsage &AU) const override {
200 AU.setPreservesCFG();
201 AU.addRequired<LoopInfoWrapperPass>();
202 FunctionPass::getAnalysisUsage(AU);
203 }
204};
205
206static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
207 const Function &F) {
208 if (!TM.getTargetTriple().isAMDGCN())
209 return 128;
210
211 const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
212
213 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
214 // Temporarily check both the attribute and the subtarget feature, until the
215 // latter is removed.
216 if (DynamicVGPRBlockSize == 0 && ST.isDynamicVGPREnabled())
217 DynamicVGPRBlockSize = ST.getDynamicVGPRBlockSize();
218
219 unsigned MaxVGPRs = ST.getMaxNumVGPRs(
220 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first,
221 DynamicVGPRBlockSize);
222
223 // A non-entry function has only 32 caller preserved registers.
224 // Do not promote an alloca that would force spilling unless we know the function
225 // will be inlined.
226 if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
227 !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
228 MaxVGPRs = std::min(MaxVGPRs, 32u);
229 return MaxVGPRs;
230}
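// For intuition only (numbers are assumptions about a typical subtarget): an
// entry function granted 256 VGPRs keeps MaxVGPRs = 256, whereas a non-inlined
// callee is clamped to the 32 caller-preserved VGPRs, shrinking the
// vectorization budget computed in run() by a factor of 8.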
231
232} // end anonymous namespace
233
234char AMDGPUPromoteAlloca::ID = 0;
235
237 "AMDGPU promote alloca to vector or LDS", false, false)
238// Move LDS uses from functions to kernels before promote alloca for accurate
239// estimation of LDS available
240INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
242INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
243 "AMDGPU promote alloca to vector or LDS", false, false)
244
245char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
246
247PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
248 FunctionAnalysisManager &AM) {
249 auto &LI = AM.getResult<LoopAnalysis>(F);
250 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/true);
251 if (Changed) {
252 PreservedAnalyses PA;
253 PA.preserveSet<CFGAnalyses>();
254 return PA;
255 }
256 return PreservedAnalyses::all();
257}
258
259PreservedAnalyses
260AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
261 auto &LI = AM.getResult<LoopAnalysis>(F);
262 bool Changed = AMDGPUPromoteAllocaImpl(TM, LI).run(F, /*PromoteToLDS=*/false);
263 if (Changed) {
264 PreservedAnalyses PA;
265 PA.preserveSet<CFGAnalyses>();
266 return PA;
267 }
268 return PreservedAnalyses::all();
269}
270
271FunctionPass *llvm::createAMDGPUPromoteAlloca() {
272 return new AMDGPUPromoteAlloca();
273}
274
275bool AMDGPUPromoteAllocaImpl::collectAllocaUses(AllocaAnalysis &AA) const {
276 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
277 LLVM_DEBUG(dbgs() << " Cannot promote alloca: " << Msg << "\n"
278 << " " << *Inst << "\n");
279 return false;
280 };
281
282 SmallVector<Instruction *, 4> WorkList({AA.Alloca});
283 while (!WorkList.empty()) {
284 auto *Cur = WorkList.pop_back_val();
285 if (find(AA.Pointers, Cur) != AA.Pointers.end())
286 continue;
287 AA.Pointers.insert(Cur);
288 for (auto &U : Cur->uses()) {
289 auto *Inst = cast<Instruction>(U.getUser());
290 if (isa<StoreInst>(Inst)) {
291 if (U.getOperandNo() != StoreInst::getPointerOperandIndex()) {
292 return RejectUser(Inst, "pointer escapes via store");
293 }
294 }
295 AA.Uses.push_back(&U);
296
297 if (isa<GetElementPtrInst>(U.getUser())) {
298 WorkList.push_back(Inst);
299 } else if (auto *SI = dyn_cast<SelectInst>(Inst)) {
300 // Only promote a select if we know that the other select operand is
301 // from another pointer that will also be promoted.
302 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, SI, 1, 2))
303 return RejectUser(Inst, "select from mixed objects");
304 WorkList.push_back(Inst);
305 AA.HaveSelectOrPHI = true;
306 } else if (auto *Phi = dyn_cast<PHINode>(Inst)) {
307 // Repeat for phis.
308
309 // TODO: Handle more complex cases. We should be able to replace loops
310 // over arrays.
311 switch (Phi->getNumIncomingValues()) {
312 case 1:
313 break;
314 case 2:
315 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Cur, Phi, 0, 1))
316 return RejectUser(Inst, "phi from mixed objects");
317 break;
318 default:
319 return RejectUser(Inst, "phi with too many operands");
320 }
321
322 WorkList.push_back(Inst);
323 AA.HaveSelectOrPHI = true;
324 }
325 }
326 }
327 return true;
328}
329
330void AMDGPUPromoteAllocaImpl::scoreAlloca(AllocaAnalysis &AA) const {
331 LLVM_DEBUG(dbgs() << "Scoring: " << *AA.Alloca << "\n");
332 unsigned Score = 0;
333 // Increment score by one for each user + a bonus for users within loops.
334 for (auto *U : AA.Uses) {
335 Instruction *Inst = cast<Instruction>(U->getUser());
336 if (isa<GetElementPtrInst>(Inst) || isa<SelectInst>(Inst) ||
337 isa<PHINode>(Inst))
338 continue;
339 unsigned UserScore =
340 1 + (LoopUserWeight * LI.getLoopDepth(Inst->getParent()));
341 LLVM_DEBUG(dbgs() << " [+" << UserScore << "]:\t" << *Inst << "\n");
342 Score += UserScore;
343 }
344 LLVM_DEBUG(dbgs() << " => Final Score:" << Score << "\n");
345 AA.Score = Score;
346}
347
348void AMDGPUPromoteAllocaImpl::setFunctionLimits(const Function &F) {
349 // Load per function limits, overriding with global options where appropriate.
350 // R600 register tuples/aliasing are fragile with large vector promotions so
351 // apply architecture specific limit here.
352 const int R600MaxVectorRegs = 16;
353 MaxVectorRegs = F.getFnAttributeAsParsedInteger(
354 "amdgpu-promote-alloca-to-vector-max-regs",
355 IsAMDGCN ? PromoteAllocaToVectorMaxRegs : R600MaxVectorRegs);
356 if (PromoteAllocaToVectorMaxRegs.getNumOccurrences())
357 MaxVectorRegs = PromoteAllocaToVectorMaxRegs;
358 VGPRBudgetRatio = F.getFnAttributeAsParsedInteger(
359 "amdgpu-promote-alloca-to-vector-vgpr-ratio",
360 PromoteAllocaToVectorVGPRRatio);
361 if (PromoteAllocaToVectorVGPRRatio.getNumOccurrences())
362 VGPRBudgetRatio = PromoteAllocaToVectorVGPRRatio;
363}
364
365bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
366 Mod = F.getParent();
367 DL = &Mod->getDataLayout();
368
369 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
370 if (!ST.isPromoteAllocaEnabled())
371 return false;
372
373 bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
374 MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
375 setFunctionLimits(F);
376
377 unsigned VectorizationBudget =
378 (PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
379 : (MaxVGPRs * 32)) /
380 VGPRBudgetRatio;
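// Worked example with the default options (illustrative): MaxVGPRs = 256 and
// a VGPR ratio of 4 give (256 * 32) / 4 = 2048 bits of budget, i.e. up to 64
// dwords of allocas may be vectorized; -amdgpu-promote-alloca-to-vector-limit=N
// caps the budget at (N * 8) / ratio bits instead.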
381
382 std::vector<AllocaAnalysis> Allocas;
383 for (Instruction &I : F.getEntryBlock()) {
384 if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
385 // Array allocations are probably not worth handling, since an allocation
386 // of the array type is the canonical form.
387 if (!AI->isStaticAlloca() || AI->isArrayAllocation())
388 continue;
389
390 LLVM_DEBUG(dbgs() << "Analyzing: " << *AI << '\n');
391
392 AllocaAnalysis AA{AI};
393 if (collectAllocaUses(AA)) {
394 analyzePromoteToVector(AA);
395 if (PromoteToLDS)
396 analyzePromoteToLDS(AA);
397 if (AA.Vector.Ty || AA.LDS.Enable) {
398 scoreAlloca(AA);
399 Allocas.push_back(std::move(AA));
400 }
401 }
402 }
403 }
404
405 stable_sort(Allocas,
406 [](const auto &A, const auto &B) { return A.Score > B.Score; });
407
408 // clang-format off
409 LLVM_DEBUG(
410 dbgs() << "Sorted Worklist:\n";
411 for (const auto &AA : Allocas)
412 dbgs() << " " << *AA.Alloca << "\n";
413 );
414 // clang-format on
415
416 bool Changed = false;
417 for (AllocaAnalysis &AA : Allocas) {
418 if (AA.Vector.Ty) {
419 const unsigned AllocaCost =
420 DL->getTypeSizeInBits(AA.Alloca->getAllocatedType());
421 // First, check if we have enough budget to vectorize this alloca.
422 if (AllocaCost <= VectorizationBudget) {
423 promoteAllocaToVector(AA);
424 Changed = true;
425 assert((VectorizationBudget - AllocaCost) < VectorizationBudget &&
426 "Underflow!");
427 VectorizationBudget -= AllocaCost;
428 LLVM_DEBUG(dbgs() << " Remaining vectorization budget:"
429 << VectorizationBudget << "\n");
430 continue;
431 } else {
432 LLVM_DEBUG(dbgs() << "Alloca too big for vectorization (size:"
433 << AllocaCost << ", budget:" << VectorizationBudget
434 << "): " << *AA.Alloca << "\n");
435 }
436 }
437
438 if (AA.LDS.Enable && tryPromoteAllocaToLDS(AA, SufficientLDS))
439 Changed = true;
440 }
441
442 // NOTE: promoteAllocaToVector removes the alloca, so Allocas contains
443 // dangling pointers. If we want to reuse it past this point, the loop above
444 // would need to be updated to remove successfully promoted allocas.
445
446 return Changed;
447}
448
449// Checks if the instruction I is a memset user of the alloca AI that we can
450// deal with. Currently, only non-volatile memsets that affect the whole alloca
451// are handled.
452static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
453 const DataLayout &DL) {
454 using namespace PatternMatch;
455 // For now we only care about non-volatile memsets that affect the whole type
456 // (start at index 0 and fill the whole alloca).
457 //
458 // TODO: Now that we moved to PromoteAlloca we could handle any memsets
459 // (except maybe volatile ones?) - we just need to use shufflevector if it
460 // only affects a subset of the vector.
461 const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
462 return I->getOperand(0) == AI &&
463 match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
464}
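// For example (illustrative IR): with "%a = alloca [16 x i32], addrspace(5)"
// (64 bytes), a call such as
//   call void @llvm.memset.p5.i64(ptr addrspace(5) %a, i8 0, i64 64, i1 false)
// is accepted, while a 32-byte memset or a memset through a GEP into the
// middle of %a is rejected by the check above.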
465
466static Value *calculateVectorIndex(Value *Ptr, AllocaAnalysis &AA) {
467 IRBuilder<> B(Ptr->getContext());
468
469 Ptr = Ptr->stripPointerCasts();
470 if (Ptr == AA.Alloca)
471 return B.getInt32(0);
472
473 auto *GEP = cast<GetElementPtrInst>(Ptr);
474 auto I = AA.Vector.GEPVectorIdx.find(GEP);
475 assert(I != AA.Vector.GEPVectorIdx.end() && "Must have entry for GEP!");
476
477 if (!I->second.Full) {
478 Value *Result = nullptr;
479 B.SetInsertPoint(GEP);
480
481 if (I->second.VarIndex) {
482 Result = I->second.VarIndex;
483 Result = B.CreateSExtOrTrunc(Result, B.getInt32Ty());
484
485 if (I->second.VarMul)
486 Result = B.CreateMul(Result, I->second.VarMul);
487 }
488
489 if (I->second.ConstIndex) {
490 if (Result)
491 Result = B.CreateAdd(Result, I->second.ConstIndex);
492 else
493 Result = I->second.ConstIndex;
494 }
495
496 if (!Result)
497 Result = B.getInt32(0);
498
499 I->second.Full = Result;
500 }
501
502 return I->second.Full;
503}
504
505static std::optional<GEPToVectorIndex>
506computeGEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
507 Type *VecElemTy, const DataLayout &DL) {
508 // TODO: Extracting a "multiple of X" from a GEP might be a useful generic
509 // helper.
510 LLVMContext &Ctx = GEP->getContext();
511 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
512 SmallMapVector<Value *, APInt, 4> VarOffsets;
513 APInt ConstOffset(BW, 0);
514
515 // Walk backwards through nested GEPs to collect both constant and variable
516 // offsets, so that nested vector GEP chains can be lowered in one step.
517 //
518 // Given this IR fragment as input:
519 //
520 // %0 = alloca [10 x <2 x i32>], align 8, addrspace(5)
521 // %1 = getelementptr [10 x <2 x i32>], ptr addrspace(5) %0, i32 0, i32 %j
522 // %2 = getelementptr i8, ptr addrspace(5) %1, i32 4
523 // %3 = load i32, ptr addrspace(5) %2, align 4
524 //
525 // Combine both GEP operations in a single pass, producing:
526 // BasePtr = %0
527 // ConstOffset = 4
528 // VarOffsets = { %j -> element_size(<2 x i32>) }
529 //
530 // That lets us emit a single buffer_load directly into a VGPR, without ever
531 // allocating scratch memory for the intermediate pointer.
532 Value *CurPtr = GEP;
533 while (auto *CurGEP = dyn_cast<GetElementPtrInst>(CurPtr)) {
534 if (!CurGEP->collectOffset(DL, BW, VarOffsets, ConstOffset))
535 return {};
536
537 // Move to the next outer pointer.
538 CurPtr = CurGEP->getPointerOperand();
539 }
540
541 assert(CurPtr == Alloca && "GEP not based on alloca");
542
543 int64_t VecElemSize = DL.getTypeAllocSize(VecElemTy);
544 if (VarOffsets.size() > 1)
545 return {};
546
547 APInt IndexQuot;
548 int64_t Rem;
549 APInt::sdivrem(ConstOffset, VecElemSize, IndexQuot, Rem);
550 if (Rem != 0)
551 return {};
552
553 GEPToVectorIndex Result;
554
555 if (!ConstOffset.isZero())
556 Result.ConstIndex = ConstantInt::get(Ctx, IndexQuot.sextOrTrunc(BW));
557
558 if (VarOffsets.empty())
559 return Result;
560
561 const auto &VarOffset = VarOffsets.front();
562 APInt OffsetQuot;
563 APInt::sdivrem(VarOffset.second, VecElemSize, OffsetQuot, Rem);
564 if (Rem != 0 || OffsetQuot.isZero())
565 return {};
566
567 Result.VarIndex = VarOffset.first;
568 auto *OffsetType = dyn_cast<IntegerType>(Result.VarIndex->getType());
569 if (!OffsetType)
570 return {};
571
572 if (!OffsetQuot.isOne())
573 Result.VarMul = ConstantInt::get(Ctx, OffsetQuot.sextOrTrunc(BW));
574
575 return Result;
576}
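// Worked example (illustrative IR, assumed values): with
//   %a = alloca [16 x i32], addrspace(5)     ; VecElemTy == i32
//   %p = getelementptr i8, ptr addrspace(5) %a, i32 %k
// collectOffset yields VarOffsets = { %k -> 1 } and ConstOffset = 0; the byte
// stride 1 is not a multiple of the 4-byte element size, so std::nullopt is
// returned. With "getelementptr i32, ptr addrspace(5) %a, i32 %k" the stride
// is 4, giving VarIndex = %k with no VarMul and no ConstIndex.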
577
578/// Promotes a single user of the alloca to a vector form.
579///
580/// \param Inst Instruction to be promoted.
581/// \param DL Module Data Layout.
582/// \param AA Alloca Analysis.
583/// \param VecStoreSize Size of \p VectorTy in bytes.
584/// \param ElementSize Size of \p VectorTy element type in bytes.
585/// \param GetCurVal Callback returning the current value of the vector
586/// (e.g. the last stored value). It is only invoked when rewriting
587/// \p Inst actually requires the current value.
588///
589/// \return the stored value if \p Inst would have written to the alloca, or
590/// nullptr otherwise.
591static Value *promoteAllocaUserToVector(Instruction *Inst, const DataLayout &DL,
592 AllocaAnalysis &AA,
593 unsigned VecStoreSize,
594 unsigned ElementSize,
595 function_ref<Value *()> GetCurVal) {
596 // Note: we use InstSimplifyFolder because it can leverage the DataLayout
597 // to do more folding, especially in the case of vector splats.
598 IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
599 InstSimplifyFolder(DL));
600 Builder.SetInsertPoint(Inst);
601
602 const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
603 Type *PtrTy) -> Value * {
604 assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
605 const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
606 if (!PtrTy->isVectorTy())
607 return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
608 const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
609 // If we want to cast, e.g., a <2 x ptr> into a <4 x i32>, we need to first
610 // cast the ptr vector to <2 x i64>.
611 assert((Size % NumPtrElts == 0) && "Vector size not divisible");
612 Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
613 return Builder.CreateBitOrPointerCast(
614 Val, FixedVectorType::get(EltTy, NumPtrElts));
615 };
616
617 Type *VecEltTy = AA.Vector.Ty->getElementType();
618
619 switch (Inst->getOpcode()) {
620 case Instruction::Load: {
621 Value *CurVal = GetCurVal();
622 Value *Index =
623 calculateVectorIndex(cast<LoadInst>(Inst)->getPointerOperand(), AA);
624
625 // We're loading the full vector.
626 Type *AccessTy = Inst->getType();
627 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
628 if (Constant *CI = dyn_cast<Constant>(Index)) {
629 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
630 if (AccessTy->isPtrOrPtrVectorTy())
631 CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
632 else if (CurVal->getType()->isPtrOrPtrVectorTy())
633 CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
634 Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
635 Inst->replaceAllUsesWith(NewVal);
636 return nullptr;
637 }
638 }
639
640 // Loading a subvector.
641 if (isa<FixedVectorType>(AccessTy)) {
642 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
643 const unsigned NumLoadedElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
644 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
645 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
646
647 Value *SubVec = PoisonValue::get(SubVecTy);
648 for (unsigned K = 0; K < NumLoadedElts; ++K) {
649 Value *CurIdx =
650 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
651 SubVec = Builder.CreateInsertElement(
652 SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
653 }
654
655 if (AccessTy->isPtrOrPtrVectorTy())
656 SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
657 else if (SubVecTy->isPtrOrPtrVectorTy())
658 SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
659
660 SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
661 Inst->replaceAllUsesWith(SubVec);
662 return nullptr;
663 }
664
665 // We're loading one element.
666 Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
667 if (AccessTy != VecEltTy)
668 ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
669
670 Inst->replaceAllUsesWith(ExtractElement);
671 return nullptr;
672 }
673 case Instruction::Store: {
674 // For stores, it's a bit trickier and it depends on whether we're storing
675 // the full vector or not. If we're storing the full vector, we don't need
676 // to know the current value. If this is a store of a single element, we
677 // need to know the value.
678 StoreInst *SI = cast<StoreInst>(Inst);
679 Value *Index = calculateVectorIndex(SI->getPointerOperand(), AA);
680 Value *Val = SI->getValueOperand();
681
682 // We're storing the full vector, we can handle this without knowing CurVal.
683 Type *AccessTy = Val->getType();
684 TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
685 if (Constant *CI = dyn_cast<Constant>(Index)) {
686 if (CI->isZeroValue() && AccessSize == VecStoreSize) {
687 if (AccessTy->isPtrOrPtrVectorTy())
688 Val = CreateTempPtrIntCast(Val, AccessTy);
689 else if (AA.Vector.Ty->isPtrOrPtrVectorTy())
690 Val = CreateTempPtrIntCast(Val, AA.Vector.Ty);
691 return Builder.CreateBitOrPointerCast(Val, AA.Vector.Ty);
692 }
693 }
694
695 // Storing a subvector.
696 if (isa<FixedVectorType>(AccessTy)) {
697 assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
698 const unsigned NumWrittenElts =
699 AccessSize / DL.getTypeStoreSize(VecEltTy);
700 const unsigned NumVecElts = AA.Vector.Ty->getNumElements();
701 auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
702 assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
703
704 if (SubVecTy->isPtrOrPtrVectorTy())
705 Val = CreateTempPtrIntCast(Val, SubVecTy);
706 else if (AccessTy->isPtrOrPtrVectorTy())
707 Val = CreateTempPtrIntCast(Val, AccessTy);
708
709 Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
710
711 Value *CurVec = GetCurVal();
712 for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
713 K < NumElts; ++K) {
714 Value *CurIdx =
715 Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
716 CurVec = Builder.CreateInsertElement(
717 CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
718 }
719 return CurVec;
720 }
721
722 if (Val->getType() != VecEltTy)
723 Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
724 return Builder.CreateInsertElement(GetCurVal(), Val, Index);
725 }
726 case Instruction::Call: {
727 if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
728 // For memcpy, we need to know curval.
729 ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
730 unsigned NumCopied = Length->getZExtValue() / ElementSize;
731 MemTransferInfo *TI = &AA.Vector.TransferInfo[MTI];
732 unsigned SrcBegin = TI->SrcIndex->getZExtValue();
733 unsigned DestBegin = TI->DestIndex->getZExtValue();
734
735 SmallVector<int> Mask;
736 for (unsigned Idx = 0; Idx < AA.Vector.Ty->getNumElements(); ++Idx) {
737 if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
738 Mask.push_back(SrcBegin < AA.Vector.Ty->getNumElements()
739 ? SrcBegin++
740 : PoisonMaskElem);
741 } else {
742 Mask.push_back(Idx);
743 }
744 }
745
746 return Builder.CreateShuffleVector(GetCurVal(), Mask);
747 }
748
749 if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
750 // For memset, we don't need to know the previous value because we
751 // currently only allow memsets that cover the whole alloca.
752 Value *Elt = MSI->getOperand(1);
753 const unsigned BytesPerElt = DL.getTypeStoreSize(VecEltTy);
754 if (BytesPerElt > 1) {
755 Value *EltBytes = Builder.CreateVectorSplat(BytesPerElt, Elt);
756
757 // If the element type of the vector is a pointer, we need to first cast
758 // to an integer, then use a PtrCast.
759 if (VecEltTy->isPointerTy()) {
760 Type *PtrInt = Builder.getIntNTy(BytesPerElt * 8);
761 Elt = Builder.CreateBitCast(EltBytes, PtrInt);
762 Elt = Builder.CreateIntToPtr(Elt, VecEltTy);
763 } else
764 Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
765 }
766
767 return Builder.CreateVectorSplat(AA.Vector.Ty->getElementCount(), Elt);
768 }
769
770 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
771 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
772 Intr->replaceAllUsesWith(
773 Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
774 DL.getTypeAllocSize(AA.Vector.Ty)));
775 return nullptr;
776 }
777 }
778
779 llvm_unreachable("Unsupported call when promoting alloca to vector");
780 }
781
782 default:
783 llvm_unreachable("Inconsistency in instructions promotable to vector");
784 }
785
786 llvm_unreachable("Did not return after promoting instruction!");
787}
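// Sketch of the rewrite performed above (illustrative IR): with
// AA.Vector.Ty == <4 x i32> and a GEP that resolves to element 2,
//   store i32 %v, ptr addrspace(5) %gep
// becomes
//   %new = insertelement <4 x i32> %cur, i32 %v, i32 2
// where %cur comes from GetCurVal, the returned %new is registered with the
// SSAUpdater by the caller, and the corresponding load becomes an
// extractelement of the current vector value.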
788
789static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
790 const DataLayout &DL) {
791 // Access as a vector type can work if the size of the access vector is a
792 // multiple of the size of the alloca's vector element type.
793 //
794 // Examples:
795 // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
796 // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
797 // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
798 // - 3*32 is not a multiple of 64
799 //
800 // We could handle more complicated cases, but it'd make things a lot more
801 // complicated.
802 if (isa<FixedVectorType>(AccessTy)) {
803 TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
804 // If the type size and the store size don't match, we would need to do
805 // more than just a bitcast to translate between extracted/insertable
806 // subvectors and the accessed value.
807 if (AccTS * 8 != DL.getTypeSizeInBits(AccessTy))
808 return false;
809 TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
810 return AccTS.isKnownMultipleOf(VecTS);
811 }
812
813 return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
814 DL);
815}
816
817/// Iterates over an instruction worklist that may contain multiple instructions
818/// from the same basic block, but in a different order.
819template <typename InstContainer>
820static void forEachWorkListItem(const InstContainer &WorkList,
821 std::function<void(Instruction *)> Fn) {
822 // Bucket up uses of the alloca by the block they occur in.
823 // This is important because we have to handle multiple defs/uses in a block
824 // ourselves: SSAUpdater is purely for cross-block references.
825 DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
826 for (Instruction *User : WorkList)
827 UsesByBlock[User->getParent()].insert(User);
828
829 for (Instruction *User : WorkList) {
830 BasicBlock *BB = User->getParent();
831 auto &BlockUses = UsesByBlock[BB];
832
833 // Already processed, skip.
834 if (BlockUses.empty())
835 continue;
836
837 // Only user in the block, directly process it.
838 if (BlockUses.size() == 1) {
839 Fn(User);
840 continue;
841 }
842
843 // Multiple users in the block, do a linear scan to see users in order.
844 for (Instruction &Inst : *BB) {
845 if (!BlockUses.contains(&Inst))
846 continue;
847
848 Fn(&Inst);
849 }
850
851 // Clear the block so we know it's been processed.
852 BlockUses.clear();
853 }
854}
855
856/// Find an insert point after an alloca, after all other allocas clustered at
857/// the start of the block.
858static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB,
859 BasicBlock::iterator I) {
860 for (BasicBlock::iterator E = BB.end(); I != E && isa<AllocaInst>(*I); ++I)
861 ;
862 return I;
863}
864
865FixedVectorType *
866AMDGPUPromoteAllocaImpl::getVectorTypeForAlloca(Type *AllocaTy) const {
867 if (DisablePromoteAllocaToVector) {
868 LLVM_DEBUG(dbgs() << " Promote alloca to vectors is disabled\n");
869 return nullptr;
870 }
871
872 auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
873 if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
874 uint64_t NumElems = 1;
875 Type *ElemTy;
876 do {
877 NumElems *= ArrayTy->getNumElements();
878 ElemTy = ArrayTy->getElementType();
879 } while ((ArrayTy = dyn_cast<ArrayType>(ElemTy)));
880
881 // Check for array of vectors
882 auto *InnerVectorTy = dyn_cast<FixedVectorType>(ElemTy);
883 if (InnerVectorTy) {
884 NumElems *= InnerVectorTy->getNumElements();
885 ElemTy = InnerVectorTy->getElementType();
886 }
887
888 if (VectorType::isValidElementType(ElemTy) && NumElems > 0) {
889 unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8;
890 if (ElementSize > 0) {
891 unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy);
892 // Expand vector if required to match padding of inner type,
893 // i.e. odd size subvectors.
894 // Storage size of new vector must match that of alloca for correct
895 // behaviour of byte offsets and GEP computation.
896 if (NumElems * ElementSize != AllocaSize)
897 NumElems = AllocaSize / ElementSize;
898 if (NumElems > 0 && (AllocaSize % ElementSize) == 0)
899 VectorTy = FixedVectorType::get(ElemTy, NumElems);
900 }
901 }
902 }
903 if (!VectorTy) {
904 LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
905 return nullptr;
906 }
907
908 const unsigned MaxElements =
909 (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType());
910
911 if (VectorTy->getNumElements() > MaxElements ||
912 VectorTy->getNumElements() < 2) {
913 LLVM_DEBUG(dbgs() << " " << *VectorTy
914 << " has an unsupported number of elements\n");
915 return nullptr;
916 }
917
918 Type *VecEltTy = VectorTy->getElementType();
919 unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy);
920 if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) {
921 LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size "
922 "does not match the type's size\n");
923 return nullptr;
924 }
925
926 return VectorTy;
927}
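// Examples of the mapping performed above (illustrative):
//   alloca [8 x i32]         -> <8 x i32>
//   alloca [4 x <2 x half>]  -> <8 x half>   (arrays of vectors are flattened)
// A result with fewer than 2 elements, or with more elements than
// (MaxVectorRegs * 32) / element-bits allows, is rejected.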
928
929void AMDGPUPromoteAllocaImpl::analyzePromoteToVector(AllocaAnalysis &AA) const {
930 if (AA.HaveSelectOrPHI) {
931 LLVM_DEBUG(dbgs() << " Cannot convert to vector due to select or phi\n");
932 return;
933 }
934
935 Type *AllocaTy = AA.Alloca->getAllocatedType();
936 AA.Vector.Ty = getVectorTypeForAlloca(AllocaTy);
937 if (!AA.Vector.Ty)
938 return;
939
940 const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
941 LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
942 << " " << *Inst << "\n");
943 AA.Vector.Ty = nullptr;
944 };
945
946 Type *VecEltTy = AA.Vector.Ty->getElementType();
947 unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
948 assert(ElementSize > 0);
949 for (auto *U : AA.Uses) {
950 Instruction *Inst = cast<Instruction>(U->getUser());
951
952 if (Value *Ptr = getLoadStorePointerOperand(Inst)) {
953 assert(!isa<StoreInst>(Inst) ||
954 U->getOperandNo() == StoreInst::getPointerOperandIndex());
955
956 Type *AccessTy = getLoadStoreType(Inst);
957 if (AccessTy->isAggregateType())
958 return RejectUser(Inst, "unsupported load/store as aggregate");
959 assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
960
961 // Check that this is a simple access of a vector element.
962 bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
963 : cast<StoreInst>(Inst)->isSimple();
964 if (!IsSimple)
965 return RejectUser(Inst, "not a simple load or store");
966
967 Ptr = Ptr->stripPointerCasts();
968
969 // Alloca already accessed as vector.
970 if (Ptr == AA.Alloca &&
971 DL->getTypeStoreSize(AA.Alloca->getAllocatedType()) ==
972 DL->getTypeStoreSize(AccessTy)) {
973 AA.Vector.Worklist.push_back(Inst);
974 continue;
975 }
976
977 if (!isSupportedAccessType(AA.Vector.Ty, AccessTy, *DL))
978 return RejectUser(Inst, "not a supported access type");
979
980 AA.Vector.Worklist.push_back(Inst);
981 continue;
982 }
983
984 if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
985 // If we can't compute a vector index from this GEP, then we can't
986 // promote this alloca to vector.
987 auto Index = computeGEPToVectorIndex(GEP, AA.Alloca, VecEltTy, *DL);
988 if (!Index)
989 return RejectUser(Inst, "cannot compute vector index for GEP");
990
991 AA.Vector.GEPVectorIdx[GEP] = std::move(Index.value());
992 AA.Vector.UsersToRemove.push_back(Inst);
993 continue;
994 }
995
996 if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
997 MSI && isSupportedMemset(MSI, AA.Alloca, *DL)) {
998 AA.Vector.Worklist.push_back(Inst);
999 continue;
1000 }
1001
1002 if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
1003 if (TransferInst->isVolatile())
1004 return RejectUser(Inst, "mem transfer inst is volatile");
1005
1006 ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
1007 if (!Len || (Len->getZExtValue() % ElementSize))
1008 return RejectUser(Inst, "mem transfer inst length is non-constant or "
1009 "not a multiple of the vector element size");
1010
1011 auto getConstIndexIntoAlloca = [&](Value *Ptr) -> ConstantInt * {
1012 if (Ptr == AA.Alloca)
1013 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1014
1015 auto *GEP = cast<GetElementPtrInst>(Ptr);
1016 const auto &GEPI = AA.Vector.GEPVectorIdx.find(GEP)->second;
1017 if (GEPI.VarIndex)
1018 return nullptr;
1019 if (GEPI.ConstIndex)
1020 return GEPI.ConstIndex;
1021 return ConstantInt::get(Ptr->getContext(), APInt(32, 0));
1022 };
1023
1024 MemTransferInfo *TI =
1025 &AA.Vector.TransferInfo.try_emplace(TransferInst).first->second;
1026 unsigned OpNum = U->getOperandNo();
1027 if (OpNum == 0) {
1028 Value *Dest = TransferInst->getDest();
1029 ConstantInt *Index = getConstIndexIntoAlloca(Dest);
1030 if (!Index)
1031 return RejectUser(Inst, "could not calculate constant dest index");
1032 TI->DestIndex = Index;
1033 } else {
1034 assert(OpNum == 1);
1035 Value *Src = TransferInst->getSource();
1036 ConstantInt *Index = getConstIndexIntoAlloca(Src);
1037 if (!Index)
1038 return RejectUser(Inst, "could not calculate constant src index");
1039 TI->SrcIndex = Index;
1040 }
1041 continue;
1042 }
1043
1044 if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
1045 if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
1046 AA.Vector.Worklist.push_back(Inst);
1047 continue;
1048 }
1049 }
1050
1051 // Ignore assume-like intrinsics and comparisons used in assumes.
1052 if (isAssumeLikeIntrinsic(Inst)) {
1053 if (!Inst->use_empty())
1054 return RejectUser(Inst, "assume-like intrinsic cannot have any users");
1055 AA.Vector.UsersToRemove.push_back(Inst);
1056 continue;
1057 }
1058
1059 if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
1060 return isAssumeLikeIntrinsic(cast<Instruction>(U));
1061 })) {
1062 AA.Vector.UsersToRemove.push_back(Inst);
1063 continue;
1064 }
1065
1066 return RejectUser(Inst, "unhandled alloca user");
1067 }
1068
1069 // Follow-up check to ensure we've seen both sides of all transfer insts.
1070 for (const auto &Entry : AA.Vector.TransferInfo) {
1071 const MemTransferInfo &TI = Entry.second;
1072 if (!TI.SrcIndex || !TI.DestIndex)
1073 return RejectUser(Entry.first,
1074 "mem transfer inst between different objects");
1075 AA.Vector.Worklist.push_back(Entry.first);
1076 }
1077}
1078
1079void AMDGPUPromoteAllocaImpl::promoteAllocaToVector(AllocaAnalysis &AA) {
1080 LLVM_DEBUG(dbgs() << "Promoting to vectors: " << *AA.Alloca << '\n');
1081 LLVM_DEBUG(dbgs() << " type conversion: " << *AA.Alloca->getAllocatedType()
1082 << " -> " << *AA.Vector.Ty << '\n');
1083 const unsigned VecStoreSize = DL->getTypeStoreSize(AA.Vector.Ty);
1084
1085 Type *VecEltTy = AA.Vector.Ty->getElementType();
1086 const unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
1087
1088 // Alloca is uninitialized memory. Imitate that by making the first value
1089 // undef.
1090 SSAUpdater Updater;
1091 Updater.Initialize(AA.Vector.Ty, "promotealloca");
1092
1093 BasicBlock *EntryBB = AA.Alloca->getParent();
1094 BasicBlock::iterator InitInsertPos =
1095 skipToNonAllocaInsertPt(*EntryBB, AA.Alloca->getIterator());
1096 IRBuilder<> Builder(&*InitInsertPos);
1097 Value *AllocaInitValue = Builder.CreateFreeze(PoisonValue::get(AA.Vector.Ty));
1098 AllocaInitValue->takeName(AA.Alloca);
1099
1100 Updater.AddAvailableValue(AA.Alloca->getParent(), AllocaInitValue);
1101
1102 // First handle the initial worklist, in basic block order.
1103 //
1104 // Insert a placeholder whenever we need the vector value at the top of a
1105 // basic block.
1106 SmallVector<Instruction *> Placeholders;
1107 forEachWorkListItem(AA.Vector.Worklist, [&](Instruction *I) {
1108 BasicBlock *BB = I->getParent();
1109 auto GetCurVal = [&]() -> Value * {
1110 if (Value *CurVal = Updater.FindValueForBlock(BB))
1111 return CurVal;
1112
1113 if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
1114 return Placeholders.back();
1115
1116 // If the current value in the basic block is not yet known, insert a
1117 // placeholder that we will replace later.
1118 IRBuilder<> Builder(I);
1119 auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
1120 PoisonValue::get(AA.Vector.Ty), "promotealloca.placeholder"));
1121 Placeholders.push_back(Placeholder);
1122 return Placeholders.back();
1123 };
1124
1125 Value *Result = promoteAllocaUserToVector(I, *DL, AA, VecStoreSize,
1126 ElementSize, GetCurVal);
1127 if (Result)
1128 Updater.AddAvailableValue(BB, Result);
1129 });
1130
1131 // Now fixup the placeholders.
1132 for (Instruction *Placeholder : Placeholders) {
1133 Placeholder->replaceAllUsesWith(
1134 Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
1135 Placeholder->eraseFromParent();
1136 }
1137
1138 // Delete all instructions.
1139 for (Instruction *I : AA.Vector.Worklist) {
1140 assert(I->use_empty());
1141 I->eraseFromParent();
1142 }
1143
1144 // Delete all the users that are known to be removeable.
1145 for (Instruction *I : reverse(AA.Vector.UsersToRemove)) {
1146 I->dropDroppableUses();
1147 assert(I->use_empty());
1148 I->eraseFromParent();
1149 }
1150
1151 // Alloca should now be dead too.
1152 assert(AA.Alloca->use_empty());
1153 AA.Alloca->eraseFromParent();
1154}
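// End-to-end sketch (illustrative IR, not from a test). Before:
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//   %p     = getelementptr [4 x i32], ptr addrspace(5) %stack, i32 0, i32 %i
//   store i32 %v, ptr addrspace(5) %p
//   %r     = load i32, ptr addrspace(5) %p
// After promotion the alloca is gone and the values live in vector registers:
//   %init = freeze <4 x i32> poison
//   %vec  = insertelement <4 x i32> %init, i32 %v, i32 %i
//   %r    = extractelement <4 x i32> %vec, i32 %i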
1155
1156std::pair<Value *, Value *>
1157AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
1158 Function &F = *Builder.GetInsertBlock()->getParent();
1159 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1160
1161 if (!IsAMDHSA) {
1162 CallInst *LocalSizeY =
1163 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {});
1164 CallInst *LocalSizeZ =
1165 Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {});
1166
1167 ST.makeLIDRangeMetadata(LocalSizeY);
1168 ST.makeLIDRangeMetadata(LocalSizeZ);
1169
1170 return std::pair(LocalSizeY, LocalSizeZ);
1171 }
1172
1173 // We must read the size out of the dispatch pointer.
1174 assert(IsAMDGCN);
1175
1176 // We are indexing into this struct, and want to extract the workgroup_size_*
1177 // fields.
1178 //
1179 // typedef struct hsa_kernel_dispatch_packet_s {
1180 // uint16_t header;
1181 // uint16_t setup;
1182 // uint16_t workgroup_size_x ;
1183 // uint16_t workgroup_size_y;
1184 // uint16_t workgroup_size_z;
1185 // uint16_t reserved0;
1186 // uint32_t grid_size_x ;
1187 // uint32_t grid_size_y ;
1188 // uint32_t grid_size_z;
1189 //
1190 // uint32_t private_segment_size;
1191 // uint32_t group_segment_size;
1192 // uint64_t kernel_object;
1193 //
1194 // #ifdef HSA_LARGE_MODEL
1195 // void *kernarg_address;
1196 // #elif defined HSA_LITTLE_ENDIAN
1197 // void *kernarg_address;
1198 // uint32_t reserved1;
1199 // #else
1200 // uint32_t reserved1;
1201 // void *kernarg_address;
1202 // #endif
1203 // uint64_t reserved2;
1204 // hsa_signal_t completion_signal; // uint64_t wrapper
1205 // } hsa_kernel_dispatch_packet_t
1206 //
1207 CallInst *DispatchPtr =
1208 Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {});
1209 DispatchPtr->addRetAttr(Attribute::NoAlias);
1210 DispatchPtr->addRetAttr(Attribute::NonNull);
1211 F.removeFnAttr("amdgpu-no-dispatch-ptr");
1212
1213 // Size of the dispatch packet struct.
1214 DispatchPtr->addDereferenceableRetAttr(64);
1215
1216 Type *I32Ty = Type::getInt32Ty(Mod->getContext());
1217
1218 // We could do a single 64-bit load here, but it's likely that the basic
1219 // 32-bit and extract sequence is already present, and it is probably easier
1220 // to CSE this. The loads should be mergeable later anyway.
1221 Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 1);
1222 LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
1223
1224 Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, DispatchPtr, 2);
1225 LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
1226
1227 MDNode *MD = MDNode::get(Mod->getContext(), {});
1228 LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
1229 LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
1230 ST.makeLIDRangeMetadata(LoadZU);
1231
1232 // Extract y component. Upper half of LoadZU should be zero already.
1233 Value *Y = Builder.CreateLShr(LoadXY, 16);
1234
1235 return std::pair(Y, LoadZU);
1236}
1237
1238Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
1239 unsigned N) {
1240 Function *F = Builder.GetInsertBlock()->getParent();
1241 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
1242 Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
1243 StringRef AttrName;
1244
1245 switch (N) {
1246 case 0:
1247 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
1248 : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
1249 AttrName = "amdgpu-no-workitem-id-x";
1250 break;
1251 case 1:
1252 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
1253 : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
1254 AttrName = "amdgpu-no-workitem-id-y";
1255 break;
1256
1257 case 2:
1258 IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
1259 : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
1260 AttrName = "amdgpu-no-workitem-id-z";
1261 break;
1262 default:
1263 llvm_unreachable("invalid dimension");
1264 }
1265
1266 Function *WorkitemIdFn = Intrinsic::getOrInsertDeclaration(Mod, IntrID);
1267 CallInst *CI = Builder.CreateCall(WorkitemIdFn);
1268 ST.makeLIDRangeMetadata(CI);
1269 F->removeFnAttr(AttrName);
1270
1271 return CI;
1272}
1273
1274static bool isCallPromotable(CallInst *CI) {
1275 IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
1276 if (!II)
1277 return false;
1278
1279 switch (II->getIntrinsicID()) {
1280 case Intrinsic::memcpy:
1281 case Intrinsic::memmove:
1282 case Intrinsic::memset:
1283 case Intrinsic::lifetime_start:
1284 case Intrinsic::lifetime_end:
1285 case Intrinsic::invariant_start:
1286 case Intrinsic::invariant_end:
1287 case Intrinsic::launder_invariant_group:
1288 case Intrinsic::strip_invariant_group:
1289 case Intrinsic::objectsize:
1290 return true;
1291 default:
1292 return false;
1293 }
1294}
1295
1296bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
1297 Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
1298 int OpIdx1) const {
1299 // Figure out which operand is the one we might not be promoting.
1300 Value *OtherOp = Inst->getOperand(OpIdx0);
1301 if (Val == OtherOp)
1302 OtherOp = Inst->getOperand(OpIdx1);
1303
1304 if (isa<ConstantPointerNull>(OtherOp))
1305 return true;
1306
1307 // TODO: getUnderlyingObject will not work on a vector getelementptr
1308 Value *OtherObj = getUnderlyingObject(OtherOp);
1309 if (!isa<AllocaInst>(OtherObj))
1310 return false;
1311
1312 // TODO: We should be able to replace undefs with the right pointer type.
1313
1314 // TODO: If we know the other base object is another promotable
1315 // alloca, not necessarily this alloca, we can do this. The
1316 // important part is both must have the same address space at
1317 // the end.
1318 if (OtherObj != BaseAlloca) {
1319 LLVM_DEBUG(
1320 dbgs() << "Found a binary instruction with another alloca object\n");
1321 return false;
1322 }
1323
1324 return true;
1325}
1326
1327void AMDGPUPromoteAllocaImpl::analyzePromoteToLDS(AllocaAnalysis &AA) const {
1328 if (DisablePromoteAllocaToLDS) {
1329 LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
1330 return;
1331 }
1332
1333 // Don't promote the alloca to LDS for shader calling conventions as the work
1334 // item ID intrinsics are not supported for these calling conventions.
1335 // Furthermore not all LDS is available for some of the stages.
1336 const Function &ContainingFunction = *AA.Alloca->getFunction();
1337 CallingConv::ID CC = ContainingFunction.getCallingConv();
1338
1339 switch (CC) {
1340 case CallingConv::AMDGPU_KERNEL:
1341 case CallingConv::SPIR_KERNEL:
1342 break;
1343 default:
1344 LLVM_DEBUG(
1345 dbgs()
1346 << " promote alloca to LDS not supported with calling convention.\n");
1347 return;
1348 }
1349
1350 for (Use *Use : AA.Uses) {
1351 auto *User = Use->getUser();
1352
1353 if (CallInst *CI = dyn_cast<CallInst>(User)) {
1354 if (!isCallPromotable(CI))
1355 return;
1356
1357 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1358 AA.LDS.Worklist.push_back(User);
1359 continue;
1360 }
1361
1362 Instruction *UseInst = cast<Instruction>(User);
1363 if (UseInst->getOpcode() == Instruction::PtrToInt)
1364 return;
1365
1366 if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
1367 if (LI->isVolatile())
1368 return;
1369 continue;
1370 }
1371
1372 if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
1373 if (SI->isVolatile())
1374 return;
1375 continue;
1376 }
1377
1378 if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
1379 if (RMW->isVolatile())
1380 return;
1381 continue;
1382 }
1383
1384 if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
1385 if (CAS->isVolatile())
1386 return;
1387 continue;
1388 }
1389
1390 // Only promote a select if we know that the other select operand
1391 // is from another pointer that will also be promoted.
1392 if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
1393 if (!binaryOpIsDerivedFromSameAlloca(AA.Alloca, Use->get(), ICmp, 0, 1))
1394 return;
1395
1396 // May need to rewrite constant operands.
1397 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1398 AA.LDS.Worklist.push_back(ICmp);
1399 continue;
1400 }
1401
1402 if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
1403 // Be conservative if an address could be computed outside the bounds of
1404 // the alloca.
1405 if (!GEP->isInBounds())
1406 return;
1407 } else if (!isa<SelectInst>(UseInst) && !isa<PHINode>(UseInst)) {
1408 // Do not promote vector/aggregate type instructions. It is hard to track
1409 // their users.
1410
1411 // Do not promote addrspacecast.
1412 //
1413 // TODO: If we know the address is only observed through flat pointers, we
1414 // could still promote.
1415 return;
1416 }
1417
1418 if (find(AA.LDS.Worklist, User) == AA.LDS.Worklist.end())
1419 AA.LDS.Worklist.push_back(User);
1420 }
1421
1422 AA.LDS.Enable = true;
1423}
1424
1425bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
1426
1427 FunctionType *FTy = F.getFunctionType();
1428 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
1429
1430 // If the function has any arguments in the local address space, then it's
1431 // possible these arguments require the entire local memory space, so
1432 // we cannot use local memory in the pass.
1433 for (Type *ParamTy : FTy->params()) {
1434 PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
1435 if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
1436 LocalMemLimit = 0;
1437 LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
1438 "local memory disabled.\n");
1439 return false;
1440 }
1441 }
1442
1443 LocalMemLimit = ST.getAddressableLocalMemorySize();
1444 if (LocalMemLimit == 0)
1445 return false;
1446
1447 SmallPtrSet<const GlobalVariable *, 8> UsedLDS;
1448 SmallPtrSet<const Constant *, 8> VisitedConstants;
1449 SmallVector<const Constant *, 8> Stack;
1450
1451 auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
1452 for (const User *U : Val->users()) {
1453 if (const Instruction *Use = dyn_cast<Instruction>(U)) {
1454 if (Use->getFunction() == &F)
1455 return true;
1456 } else {
1457 const Constant *C = cast<Constant>(U);
1458 if (VisitedConstants.insert(C).second)
1459 Stack.push_back(C);
1460 }
1461 }
1462
1463 return false;
1464 };
1465
1466 for (GlobalVariable &GV : Mod->globals()) {
1467 if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
1468 continue;
1469
1470 if (visitUsers(&GV, &GV)) {
1471 UsedLDS.insert(&GV);
1472 Stack.clear();
1473 continue;
1474 }
1475
1476 // For any ConstantExpr uses, we need to recursively search the users until
1477 // we see a function.
1478 while (!Stack.empty()) {
1479 const Constant *C = Stack.pop_back_val();
1480 if (visitUsers(&GV, C)) {
1481 UsedLDS.insert(&GV);
1482 Stack.clear();
1483 break;
1484 }
1485 }
1486 }
1487
1488 const DataLayout &DL = Mod->getDataLayout();
1489 SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
1490 AllocatedSizes.reserve(UsedLDS.size());
1491
1492 for (const GlobalVariable *GV : UsedLDS) {
1493 Align Alignment =
1494 DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
1495 uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
1496
1497 // HIP uses an extern unsized array in local address space for dynamically
1498 // allocated shared memory. In that case, we have to disable the promotion.
1499 if (GV->hasExternalLinkage() && AllocSize == 0) {
1500 LocalMemLimit = 0;
1501 LLVM_DEBUG(dbgs() << "Function has a reference to externally allocated "
1502 "local memory. Promoting to local memory "
1503 "disabled.\n");
1504 return false;
1505 }
1506
1507 AllocatedSizes.emplace_back(AllocSize, Alignment);
1508 }
1509
1510 // Sort to try to estimate the worst case alignment padding
1511 //
1512 // FIXME: We should really do something to fix the addresses to a more optimal
1513 // value instead
1514 llvm::sort(AllocatedSizes, llvm::less_second());
1515
1516 // Check how much local memory is being used by global objects
1517 CurrentLocalMemUsage = 0;
1518
1519 // FIXME: Try to account for padding here. The real padding and address is
1520 // currently determined from the inverse order of uses in the function when
1521 // legalizing, which could also potentially change. We try to estimate the
1522 // worst case here, but we probably should fix the addresses earlier.
1523 for (auto Alloc : AllocatedSizes) {
1524 CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
1525 CurrentLocalMemUsage += Alloc.first;
1526 }
1527
1528 unsigned MaxOccupancy =
1529 ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1530 .second;
1531
1532 // Round up to the next tier of usage.
1533 unsigned MaxSizeWithWaveCount =
1534 ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
1535
1536 // Program may already use more LDS than is usable at maximum occupancy.
1537 if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
1538 return false;
1539
1540 LocalMemLimit = MaxSizeWithWaveCount;
1541
1542 LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
1543 << " bytes of LDS\n"
1544 << " Rounding size to " << MaxSizeWithWaveCount
1545 << " with a maximum occupancy of " << MaxOccupancy << '\n'
1546 << " and " << (LocalMemLimit - CurrentLocalMemUsage)
1547 << " available for promotion\n");
1548
1549 return true;
1550}
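// Rough example of the accounting above (numbers are assumptions): if the
// module's LDS globals already use 16384 bytes and that usage still permits,
// say, 8 waves per EU, LocalMemLimit is rounded up to the LDS tier usable at 8
// waves; later promotions may then claim up to
// LocalMemLimit - CurrentLocalMemUsage additional bytes.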
1551
1552// FIXME: Should try to pick the most likely to be profitable allocas first.
1553bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaAnalysis &AA,
1554 bool SufficientLDS) {
1555 LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << *AA.Alloca << '\n');
1556
1557 // Not likely to have sufficient local memory for promotion.
1558 if (!SufficientLDS)
1559 return false;
1560
1561 const DataLayout &DL = Mod->getDataLayout();
1562 IRBuilder<> Builder(AA.Alloca);
1563
1564 const Function &ContainingFunction = *AA.Alloca->getParent()->getParent();
1565 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
1566 unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
1567
1568 Align Alignment = DL.getValueOrABITypeAlignment(
1569 AA.Alloca->getAlign(), AA.Alloca->getAllocatedType());
1570
1571 // FIXME: This computed padding is likely wrong since it depends on inverse
1572 // usage order.
1573 //
1574 // FIXME: It is also possible that if we're allowed to use all of the memory,
1575 // we could end up using more than the maximum due to alignment padding.
1576
1577 uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
1578 uint32_t AllocSize =
1579 WorkGroupSize * DL.getTypeAllocSize(AA.Alloca->getAllocatedType());
1580 NewSize += AllocSize;
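// For instance (illustrative numbers): a 64-byte alloca in a kernel with a
// flat workgroup size of 256 adds 256 * 64 = 16384 bytes on top of the
// existing, alignment-padded LDS usage.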
1581
1582 if (NewSize > LocalMemLimit) {
1583 LLVM_DEBUG(dbgs() << " " << AllocSize
1584 << " bytes of local memory not available to promote\n");
1585 return false;
1586 }
1587
1588 CurrentLocalMemUsage = NewSize;
1589
1590 LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
1591
1592 Function *F = AA.Alloca->getFunction();
1593
1594 Type *GVTy = ArrayType::get(AA.Alloca->getAllocatedType(), WorkGroupSize);
1595 GlobalVariable *GV = new GlobalVariable(
1596 *Mod, GVTy, false, GlobalValue::InternalLinkage, PoisonValue::get(GVTy),
1597 Twine(F->getName()) + Twine('.') + AA.Alloca->getName(), nullptr,
1598 GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
1599 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1600 GV->setAlignment(AA.Alloca->getAlign());
1601
1602 Value *TCntY, *TCntZ;
1603
1604 std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
1605 Value *TIdX = getWorkitemID(Builder, 0);
1606 Value *TIdY = getWorkitemID(Builder, 1);
1607 Value *TIdZ = getWorkitemID(Builder, 2);
1608
1609 Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
1610 Tmp0 = Builder.CreateMul(Tmp0, TIdX);
1611 Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
1612 Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
1613 TID = Builder.CreateAdd(TID, TIdZ);
1614
1615 LLVMContext &Context = Mod->getContext();
1616 Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
1617
1618 Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
1619 AA.Alloca->mutateType(Offset->getType());
1620 AA.Alloca->replaceAllUsesWith(Offset);
1621 AA.Alloca->eraseFromParent();
1622
1623 SmallVector<IntrinsicInst *> DeferredIntrs;
1624
1625 PointerType *NewPtrTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
1626
1627 for (Value *V : AA.LDS.Worklist) {
1628 CallInst *Call = dyn_cast<CallInst>(V);
1629 if (!Call) {
1630 if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
1631 Value *LHS = CI->getOperand(0);
1632 Value *RHS = CI->getOperand(1);
1633
1634 Type *NewTy = LHS->getType()->getWithNewType(NewPtrTy);
1635 if (isa<ConstantPointerNull>(LHS))
1636 CI->setOperand(0, Constant::getNullValue(NewTy));
1637
1638 if (isa<ConstantPointerNull>(RHS))
1639 CI->setOperand(1, Constant::getNullValue(NewTy));
1639 CI->setOperand(1, Constant::getNullValue(NewTy));
1640
1641 continue;
1642 }
1643
1644 // The operand's value should be corrected on its own and we don't want to
1645 // touch the users.
1647 continue;
1648
1649 assert(V->getType()->isPtrOrPtrVectorTy());
1650
1651 Type *NewTy = V->getType()->getWithNewType(NewPtrTy);
1652 V->mutateType(NewTy);
1653
1654 // Adjust the types of any constant operands.
1655 if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
1656 if (isa<ConstantPointerNull>(SI->getOperand(1)))
1657 SI->setOperand(1, Constant::getNullValue(NewTy));
1658
1659 if (isa<ConstantPointerNull>(SI->getOperand(2)))
1660 SI->setOperand(2, Constant::getNullValue(NewTy));
1661 } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1662 for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1663 if (isa<ConstantPointerNull>(
1664 Phi->getIncomingValue(I)))
1665 Phi->setIncomingValue(I, Constant::getNullValue(NewTy));
1666 }
1667 }
1668
1669 continue;
1670 }
1671
1672 IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1673 Builder.SetInsertPoint(Intr);
1674 switch (Intr->getIntrinsicID()) {
1675 case Intrinsic::lifetime_start:
1676 case Intrinsic::lifetime_end:
1677 // These intrinsics are for address space 0 only
1678 Intr->eraseFromParent();
1679 continue;
1680 case Intrinsic::memcpy:
1681 case Intrinsic::memmove:
1682 // These have 2 pointer operands. In case if second pointer also needs
1683 // to be replaced we defer processing of these intrinsics until all
1684 // other values are processed.
1685 DeferredIntrs.push_back(Intr);
1686 continue;
1687 case Intrinsic::memset: {
1688 MemSetInst *MemSet = cast<MemSetInst>(Intr);
1689 Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
1690 MemSet->getLength(), MemSet->getDestAlign(),
1691 MemSet->isVolatile());
1692 Intr->eraseFromParent();
1693 continue;
1694 }
1695 case Intrinsic::invariant_start:
1696 case Intrinsic::invariant_end:
1697 case Intrinsic::launder_invariant_group:
1698 case Intrinsic::strip_invariant_group: {
1699 SmallVector<Value *> Args;
1700 if (Intr->getIntrinsicID() == Intrinsic::invariant_start) {
1701 Args.emplace_back(Intr->getArgOperand(0));
1702 } else if (Intr->getIntrinsicID() == Intrinsic::invariant_end) {
1703 Args.emplace_back(Intr->getArgOperand(0));
1704 Args.emplace_back(Intr->getArgOperand(1));
1705 }
1706 Args.emplace_back(Offset);
1707 Function *F = Intrinsic::getOrInsertDeclaration(
1708 Intr->getModule(), Intr->getIntrinsicID(), Offset->getType());
1709 CallInst *NewIntr =
1710 CallInst::Create(F, Args, Intr->getName(), Intr->getIterator());
1711 Intr->mutateType(NewIntr->getType());
1712 Intr->replaceAllUsesWith(NewIntr);
1713 Intr->eraseFromParent();
1714 continue;
1715 }
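// llvm.objectsize is mangled on both its result and pointer types, so emit a
// fresh call against the promoted pointer type.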
1716 case Intrinsic::objectsize: {
1717 Value *Src = Intr->getOperand(0);
1718
1719 CallInst *NewCall = Builder.CreateIntrinsic(
1720 Intrinsic::objectsize,
1721 {Intr->getType(), NewPtrTy},
1722 {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1723 Intr->replaceAllUsesWith(NewCall);
1724 Intr->eraseFromParent();
1725 continue;
1726 }
1727 default:
1728 Intr->print(errs());
1729 llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1730 }
1731 }
1732
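// All other worklist values now carry their final LDS pointer types, so the
// deferred memcpy/memmove calls can be rebuilt, carrying over any
// dereferenceable parameter bytes.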
1733 for (IntrinsicInst *Intr : DeferredIntrs) {
1734 Builder.SetInsertPoint(Intr);
1735 Intrinsic::ID ID = Intr->getIntrinsicID();
1736 assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1737
1738 MemTransferInst *MI = cast<MemTransferInst>(Intr);
1739 auto *B = Builder.CreateMemTransferInst(
1740 ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
1741 MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
1742
1743 for (unsigned I = 0; I != 2; ++I) {
1744 if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1745 B->addDereferenceableParamAttr(I, Bytes);
1746 }
1747 }
1748
1749 Intr->eraseFromParent();
1750 }
1751
1752 return true;
1753}