LLVM  14.0.0git
AMDGPUPromoteAlloca.cpp
Go to the documentation of this file.
1 //===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass eliminates allocas by either converting them into vectors or
10 // by migrating them to local address space.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
19 #include "llvm/IR/IRBuilder.h"
20 #include "llvm/IR/IntrinsicsAMDGPU.h"
21 #include "llvm/IR/IntrinsicsR600.h"
22 #include "llvm/Pass.h"
24 
25 #define DEBUG_TYPE "amdgpu-promote-alloca"
26 
27 using namespace llvm;
28 
29 namespace {
30 
31 static cl::opt<bool> DisablePromoteAllocaToVector(
32  "disable-promote-alloca-to-vector",
33  cl::desc("Disable promote alloca to vector"),
34  cl::init(false));
35 
36 static cl::opt<bool> DisablePromoteAllocaToLDS(
37  "disable-promote-alloca-to-lds",
38  cl::desc("Disable promote alloca to LDS"),
39  cl::init(false));
40 
41 static cl::opt<unsigned> PromoteAllocaToVectorLimit(
42  "amdgpu-promote-alloca-to-vector-limit",
43  cl::desc("Maximum byte size to consider promote alloca to vector"),
44  cl::init(0));
45 
46 // FIXME: This can create globals so should be a module pass.
47 class AMDGPUPromoteAlloca : public FunctionPass {
48 public:
49  static char ID;
50 
51  AMDGPUPromoteAlloca() : FunctionPass(ID) {}
52 
53  bool runOnFunction(Function &F) override;
54 
55  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
56 
57  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
58 
59  void getAnalysisUsage(AnalysisUsage &AU) const override {
60  AU.setPreservesCFG();
62  }
63 };
64 
65 class AMDGPUPromoteAllocaImpl {
66 private:
67  const TargetMachine &TM;
68  Module *Mod = nullptr;
69  const DataLayout *DL = nullptr;
70 
71  // FIXME: This should be per-kernel.
72  uint32_t LocalMemLimit = 0;
73  uint32_t CurrentLocalMemUsage = 0;
74  unsigned MaxVGPRs;
75 
76  bool IsAMDGCN = false;
77  bool IsAMDHSA = false;
78 
79  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
80  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);
81 
82  /// BaseAlloca is the alloca root the search started from.
83  /// Val may be that alloca or a recursive user of it.
84  bool collectUsesWithPtrTypes(Value *BaseAlloca,
85  Value *Val,
86  std::vector<Value*> &WorkList) const;
87 
88  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
89  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
90  /// Returns true if both operands are derived from the same alloca. Val should
91  /// be the same value as one of the input operands of UseInst.
92  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
93  Instruction *UseInst,
94  int OpIdx0, int OpIdx1) const;
95 
96  /// Check whether we have enough local memory for promotion.
97  bool hasSufficientLocalMem(const Function &F);
98 
99  bool handleAlloca(AllocaInst &I, bool SufficientLDS);
100 
101 public:
102  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
103  bool run(Function &F);
104 };
105 
106 class AMDGPUPromoteAllocaToVector : public FunctionPass {
107 public:
108  static char ID;
109 
110  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
111 
112  bool runOnFunction(Function &F) override;
113 
114  StringRef getPassName() const override {
115  return "AMDGPU Promote Alloca to vector";
116  }
117 
118  void getAnalysisUsage(AnalysisUsage &AU) const override {
119  AU.setPreservesCFG();
121  }
122 };
123 
124 } // end anonymous namespace
125 
126 char AMDGPUPromoteAlloca::ID = 0;
128 
129 INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
130  "AMDGPU promote alloca to vector or LDS", false, false)
131 // Move LDS uses from functions to kernels before promote alloca for accurate
132 // estimation of LDS available
133 INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
134 INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
135  "AMDGPU promote alloca to vector or LDS", false, false)
136 
137 INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
138  "AMDGPU promote alloca to vector", false, false)
139 
140 char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
141 char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
142 
143 bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
144  if (skipFunction(F))
145  return false;
146 
147  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
148  return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
149  }
150  return false;
151 }
152 
155  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
156  if (Changed) {
158  PA.preserveSet<CFGAnalyses>();
159  return PA;
160  }
161  return PreservedAnalyses::all();
162 }
163 
164 bool AMDGPUPromoteAllocaImpl::run(Function &F) {
165  Mod = F.getParent();
166  DL = &Mod->getDataLayout();
167 
168  const Triple &TT = TM.getTargetTriple();
169  IsAMDGCN = TT.getArch() == Triple::amdgcn;
170  IsAMDHSA = TT.getOS() == Triple::AMDHSA;
171 
173  if (!ST.isPromoteAllocaEnabled())
174  return false;
175 
176  if (IsAMDGCN) {
177  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
178  MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
179  } else {
180  MaxVGPRs = 128;
181  }
182 
183  bool SufficientLDS = hasSufficientLocalMem(F);
184  bool Changed = false;
185  BasicBlock &EntryBB = *F.begin();
186 
188  for (Instruction &I : EntryBB) {
189  if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
190  Allocas.push_back(AI);
191  }
192 
193  for (AllocaInst *AI : Allocas) {
194  if (handleAlloca(*AI, SufficientLDS))
195  Changed = true;
196  }
197 
198  return Changed;
199 }
200 
201 std::pair<Value *, Value *>
202 AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
203  Function &F = *Builder.GetInsertBlock()->getParent();
205 
206  if (!IsAMDHSA) {
207  Function *LocalSizeYFn
208  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
209  Function *LocalSizeZFn
210  = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
211 
212  CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
213  CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
214 
215  ST.makeLIDRangeMetadata(LocalSizeY);
216  ST.makeLIDRangeMetadata(LocalSizeZ);
217 
218  return std::make_pair(LocalSizeY, LocalSizeZ);
219  }
220 
221  // We must read the size out of the dispatch pointer.
222  assert(IsAMDGCN);
223 
224  // We are indexing into this struct, and want to extract the workgroup_size_*
225  // fields.
226  //
227  // typedef struct hsa_kernel_dispatch_packet_s {
228  // uint16_t header;
229  // uint16_t setup;
230  // uint16_t workgroup_size_x ;
231  // uint16_t workgroup_size_y;
232  // uint16_t workgroup_size_z;
233  // uint16_t reserved0;
234  // uint32_t grid_size_x ;
235  // uint32_t grid_size_y ;
236  // uint32_t grid_size_z;
237  //
238  // uint32_t private_segment_size;
239  // uint32_t group_segment_size;
240  // uint64_t kernel_object;
241  //
242  // #ifdef HSA_LARGE_MODEL
243  // void *kernarg_address;
244  // #elif defined HSA_LITTLE_ENDIAN
245  // void *kernarg_address;
246  // uint32_t reserved1;
247  // #else
248  // uint32_t reserved1;
249  // void *kernarg_address;
250  // #endif
251  // uint64_t reserved2;
252  // hsa_signal_t completion_signal; // uint64_t wrapper
253  // } hsa_kernel_dispatch_packet_t
254  //
255  Function *DispatchPtrFn
256  = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
257 
258  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
259  DispatchPtr->addRetAttr(Attribute::NoAlias);
260  DispatchPtr->addRetAttr(Attribute::NonNull);
261  F.removeFnAttr("amdgpu-no-dispatch-ptr");
262 
263  // Size of the dispatch packet struct.
264  DispatchPtr->addDereferenceableRetAttr(64);
265 
266  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
267  Value *CastDispatchPtr = Builder.CreateBitCast(
268  DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
269 
270  // We could do a single 64-bit load here, but it's likely that the basic
271  // 32-bit and extract sequence is already present, and it is probably easier
272  // to CSE this. The loads should be mergable later anyway.
273  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
274  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
275 
276  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
277  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
278 
279  MDNode *MD = MDNode::get(Mod->getContext(), None);
280  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
281  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
282  ST.makeLIDRangeMetadata(LoadZU);
283 
284  // Extract y component. Upper half of LoadZU should be zero already.
285  Value *Y = Builder.CreateLShr(LoadXY, 16);
286 
287  return std::make_pair(Y, LoadZU);
288 }
289 
290 Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
291  unsigned N) {
292  Function *F = Builder.GetInsertBlock()->getParent();
295  StringRef AttrName;
296 
297  switch (N) {
298  case 0:
299  IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
300  : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
301  AttrName = "amdgpu-no-workitem-id-x";
302  break;
303  case 1:
304  IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
305  : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
306  AttrName = "amdgpu-no-workitem-id-y";
307  break;
308 
309  case 2:
310  IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
311  : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
312  AttrName = "amdgpu-no-workitem-id-z";
313  break;
314  default:
315  llvm_unreachable("invalid dimension");
316  }
317 
318  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
319  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
320  ST.makeLIDRangeMetadata(CI);
321  F->removeFnAttr(AttrName);
322 
323  return CI;
324 }
325 
327  return FixedVectorType::get(ArrayTy->getElementType(),
328  ArrayTy->getNumElements());
329 }
330 
331 static Value *stripBitcasts(Value *V) {
332  while (Instruction *I = dyn_cast<Instruction>(V)) {
333  if (I->getOpcode() != Instruction::BitCast)
334  break;
335  V = I->getOperand(0);
336  }
337  return V;
338 }
339 
340 static Value *
342  const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
343  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
344  if (!GEP)
345  return nullptr;
346 
347  auto I = GEPIdx.find(GEP);
348  return I == GEPIdx.end() ? nullptr : I->second;
349 }
350 
352  // FIXME we only support simple cases
353  if (GEP->getNumOperands() != 3)
354  return nullptr;
355 
356  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
357  if (!I0 || !I0->isZero())
358  return nullptr;
359 
360  return GEP->getOperand(2);
361 }
362 
363 // Not an instruction handled below to turn into a vector.
364 //
365 // TODO: Check isTriviallyVectorizable for calls and handle other
366 // instructions.
367 static bool canVectorizeInst(Instruction *Inst, User *User,
368  const DataLayout &DL) {
369  switch (Inst->getOpcode()) {
370  case Instruction::Load: {
371  // Currently only handle the case where the Pointer Operand is a GEP.
372  // Also we could not vectorize volatile or atomic loads.
373  LoadInst *LI = cast<LoadInst>(Inst);
374  if (isa<AllocaInst>(User) &&
375  LI->getPointerOperandType() == User->getType() &&
376  isa<VectorType>(LI->getType()))
377  return true;
378 
379  Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
380  if (!PtrInst)
381  return false;
382 
383  return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
384  PtrInst->getOpcode() == Instruction::BitCast) &&
385  LI->isSimple();
386  }
387  case Instruction::BitCast:
388  return true;
389  case Instruction::Store: {
390  // Must be the stored pointer operand, not a stored value, plus
391  // since it should be canonical form, the User should be a GEP.
392  // Also we could not vectorize volatile or atomic stores.
393  StoreInst *SI = cast<StoreInst>(Inst);
394  if (isa<AllocaInst>(User) &&
395  SI->getPointerOperandType() == User->getType() &&
396  isa<VectorType>(SI->getValueOperand()->getType()))
397  return true;
398 
399  Instruction *UserInst = dyn_cast<Instruction>(User);
400  if (!UserInst)
401  return false;
402 
403  return (SI->getPointerOperand() == User) &&
404  (UserInst->getOpcode() == Instruction::GetElementPtr ||
405  UserInst->getOpcode() == Instruction::BitCast) &&
406  SI->isSimple();
407  }
408  default:
409  return false;
410  }
411 }
412 
413 static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
414  unsigned MaxVGPRs) {
415 
416  if (DisablePromoteAllocaToVector) {
417  LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
418  return false;
419  }
420 
421  Type *AllocaTy = Alloca->getAllocatedType();
422  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
423  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
424  if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
425  ArrayTy->getNumElements() > 0)
426  VectorTy = arrayTypeToVecType(ArrayTy);
427  }
428 
429  // Use up to 1/4 of available register budget for vectorization.
430  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
431  : (MaxVGPRs * 32);
432 
433  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
434  LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
435  << MaxVGPRs << " registers available\n");
436  return false;
437  }
438 
439  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
440 
441  // FIXME: There is no reason why we can't support larger arrays, we
442  // are just being conservative for now.
443  // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
444  // could also be promoted but we don't currently handle this case
445  if (!VectorTy || VectorTy->getNumElements() > 16 ||
446  VectorTy->getNumElements() < 2) {
447  LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
448  return false;
449  }
450 
451  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
452  std::vector<Value *> WorkList;
454  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
455  Type *VecEltTy = VectorTy->getElementType();
456  while (!Users.empty()) {
457  User *AllocaUser = Users.pop_back_val();
458  User *UseUser = UseUsers.pop_back_val();
459  Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
460 
461  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
462  if (!GEP) {
463  if (!canVectorizeInst(Inst, UseUser, DL))
464  return false;
465 
466  if (Inst->getOpcode() == Instruction::BitCast) {
467  Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
468  Type *ToTy = Inst->getType()->getPointerElementType();
469  if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
470  DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
471  continue;
472 
473  for (User *CastUser : Inst->users()) {
474  if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
475  continue;
476  Users.push_back(CastUser);
477  UseUsers.push_back(Inst);
478  }
479 
480  continue;
481  }
482 
483  WorkList.push_back(AllocaUser);
484  continue;
485  }
486 
488 
489  // If we can't compute a vector index from this GEP, then we can't
490  // promote this alloca to vector.
491  if (!Index) {
492  LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
493  << '\n');
494  return false;
495  }
496 
497  GEPVectorIdx[GEP] = Index;
498  Users.append(GEP->user_begin(), GEP->user_end());
499  UseUsers.append(GEP->getNumUses(), GEP);
500  }
501 
502  LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
503  << *VectorTy << '\n');
504 
505  for (Value *V : WorkList) {
506  Instruction *Inst = cast<Instruction>(V);
507  IRBuilder<> Builder(Inst);
508  switch (Inst->getOpcode()) {
509  case Instruction::Load: {
510  if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
511  break;
512 
513  Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
514  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
515  if (!Index)
516  break;
517 
518  Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
519  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
520  Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
521  Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
522  if (Inst->getType() != VecEltTy)
523  ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
524  Inst->replaceAllUsesWith(ExtractElement);
525  Inst->eraseFromParent();
526  break;
527  }
528  case Instruction::Store: {
529  StoreInst *SI = cast<StoreInst>(Inst);
530  if (SI->getValueOperand()->getType() == AllocaTy ||
531  SI->getValueOperand()->getType()->isVectorTy())
532  break;
533 
534  Value *Ptr = SI->getPointerOperand();
535  Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
536  if (!Index)
537  break;
538 
539  Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
540  Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
541  Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
542  Value *Elt = SI->getValueOperand();
543  if (Elt->getType() != VecEltTy)
544  Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
545  Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
546  Builder.CreateStore(NewVecValue, BitCast);
547  Inst->eraseFromParent();
548  break;
549  }
550 
551  default:
552  llvm_unreachable("Inconsistency in instructions promotable to vector");
553  }
554  }
555  return true;
556 }
557 
558 static bool isCallPromotable(CallInst *CI) {
559  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
560  if (!II)
561  return false;
562 
563  switch (II->getIntrinsicID()) {
564  case Intrinsic::memcpy:
565  case Intrinsic::memmove:
566  case Intrinsic::memset:
567  case Intrinsic::lifetime_start:
568  case Intrinsic::lifetime_end:
569  case Intrinsic::invariant_start:
570  case Intrinsic::invariant_end:
571  case Intrinsic::launder_invariant_group:
572  case Intrinsic::strip_invariant_group:
573  case Intrinsic::objectsize:
574  return true;
575  default:
576  return false;
577  }
578 }
579 
580 bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
581  Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
582  int OpIdx1) const {
583  // Figure out which operand is the one we might not be promoting.
584  Value *OtherOp = Inst->getOperand(OpIdx0);
585  if (Val == OtherOp)
586  OtherOp = Inst->getOperand(OpIdx1);
587 
588  if (isa<ConstantPointerNull>(OtherOp))
589  return true;
590 
591  Value *OtherObj = getUnderlyingObject(OtherOp);
592  if (!isa<AllocaInst>(OtherObj))
593  return false;
594 
595  // TODO: We should be able to replace undefs with the right pointer type.
596 
597  // TODO: If we know the other base object is another promotable
598  // alloca, not necessarily this alloca, we can do this. The
599  // important part is both must have the same address space at
600  // the end.
601  if (OtherObj != BaseAlloca) {
602  LLVM_DEBUG(
603  dbgs() << "Found a binary instruction with another alloca object\n");
604  return false;
605  }
606 
607  return true;
608 }
609 
610 bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
611  Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {
612 
613  for (User *User : Val->users()) {
614  if (is_contained(WorkList, User))
615  continue;
616 
617  if (CallInst *CI = dyn_cast<CallInst>(User)) {
618  if (!isCallPromotable(CI))
619  return false;
620 
621  WorkList.push_back(User);
622  continue;
623  }
624 
625  Instruction *UseInst = cast<Instruction>(User);
626  if (UseInst->getOpcode() == Instruction::PtrToInt)
627  return false;
628 
629  if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
630  if (LI->isVolatile())
631  return false;
632 
633  continue;
634  }
635 
636  if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
637  if (SI->isVolatile())
638  return false;
639 
640  // Reject if the stored value is not the pointer operand.
641  if (SI->getPointerOperand() != Val)
642  return false;
643  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
644  if (RMW->isVolatile())
645  return false;
646  } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
647  if (CAS->isVolatile())
648  return false;
649  }
650 
651  // Only promote a select if we know that the other select operand
652  // is from another pointer that will also be promoted.
653  if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
654  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
655  return false;
656 
657  // May need to rewrite constant operands.
658  WorkList.push_back(ICmp);
659  }
660 
661  if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
662  // Give up if the pointer may be captured.
663  if (PointerMayBeCaptured(UseInst, true, true))
664  return false;
665  // Don't collect the users of this.
666  WorkList.push_back(User);
667  continue;
668  }
669 
670  // Do not promote vector/aggregate type instructions. It is hard to track
671  // their users.
672  if (isa<InsertValueInst>(User) || isa<InsertElementInst>(User))
673  return false;
674 
675  if (!User->getType()->isPointerTy())
676  continue;
677 
678  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
679  // Be conservative if an address could be computed outside the bounds of
680  // the alloca.
681  if (!GEP->isInBounds())
682  return false;
683  }
684 
685  // Only promote a select if we know that the other select operand is from
686  // another pointer that will also be promoted.
687  if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
688  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
689  return false;
690  }
691 
692  // Repeat for phis.
693  if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
694  // TODO: Handle more complex cases. We should be able to replace loops
695  // over arrays.
696  switch (Phi->getNumIncomingValues()) {
697  case 1:
698  break;
699  case 2:
700  if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
701  return false;
702  break;
703  default:
704  return false;
705  }
706  }
707 
708  WorkList.push_back(User);
709  if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
710  return false;
711  }
712 
713  return true;
714 }
715 
716 bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
717 
718  FunctionType *FTy = F.getFunctionType();
720 
721  // If the function has any arguments in the local address space, then it's
722  // possible these arguments require the entire local memory space, so
723  // we cannot use local memory in the pass.
724  for (Type *ParamTy : FTy->params()) {
725  PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
726  if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
727  LocalMemLimit = 0;
728  LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
729  "local memory disabled.\n");
730  return false;
731  }
732  }
733 
734  LocalMemLimit = ST.getLocalMemorySize();
735  if (LocalMemLimit == 0)
736  return false;
737 
739  SmallPtrSet<const Constant *, 8> VisitedConstants;
741 
742  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
743  for (const User *U : Val->users()) {
744  if (const Instruction *Use = dyn_cast<Instruction>(U)) {
745  if (Use->getParent()->getParent() == &F)
746  return true;
747  } else {
748  const Constant *C = cast<Constant>(U);
749  if (VisitedConstants.insert(C).second)
750  Stack.push_back(C);
751  }
752  }
753 
754  return false;
755  };
756 
757  for (GlobalVariable &GV : Mod->globals()) {
759  continue;
760 
761  if (visitUsers(&GV, &GV)) {
762  UsedLDS.insert(&GV);
763  Stack.clear();
764  continue;
765  }
766 
767  // For any ConstantExpr uses, we need to recursively search the users until
768  // we see a function.
769  while (!Stack.empty()) {
770  const Constant *C = Stack.pop_back_val();
771  if (visitUsers(&GV, C)) {
772  UsedLDS.insert(&GV);
773  Stack.clear();
774  break;
775  }
776  }
777  }
778 
779  const DataLayout &DL = Mod->getDataLayout();
780  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
781  AllocatedSizes.reserve(UsedLDS.size());
782 
783  for (const GlobalVariable *GV : UsedLDS) {
784  Align Alignment =
785  DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
786  uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
787  AllocatedSizes.emplace_back(AllocSize, Alignment);
788  }
789 
790  // Sort to try to estimate the worst case alignment padding
791  //
792  // FIXME: We should really do something to fix the addresses to a more optimal
793  // value instead
794  llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS,
795  std::pair<uint64_t, Align> RHS) {
796  return LHS.second < RHS.second;
797  });
798 
799  // Check how much local memory is being used by global objects
800  CurrentLocalMemUsage = 0;
801 
802  // FIXME: Try to account for padding here. The real padding and address is
803  // currently determined from the inverse order of uses in the function when
804  // legalizing, which could also potentially change. We try to estimate the
805  // worst case here, but we probably should fix the addresses earlier.
806  for (auto Alloc : AllocatedSizes) {
807  CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
808  CurrentLocalMemUsage += Alloc.first;
809  }
810 
811  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
812  F);
813 
814  // Restrict local memory usage so that we don't drastically reduce occupancy,
815  // unless it is already significantly reduced.
816 
817  // TODO: Have some sort of hint or other heuristics to guess occupancy based
818  // on other factors..
819  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
820  if (OccupancyHint == 0)
821  OccupancyHint = 7;
822 
823  // Clamp to max value.
824  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
825 
826  // Check the hint but ignore it if it's obviously wrong from the existing LDS
827  // usage.
828  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
829 
830 
831  // Round up to the next tier of usage.
832  unsigned MaxSizeWithWaveCount
833  = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
834 
835  // Program is possibly broken by using more local mem than available.
836  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
837  return false;
838 
839  LocalMemLimit = MaxSizeWithWaveCount;
840 
841  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
842  << " bytes of LDS\n"
843  << " Rounding size to " << MaxSizeWithWaveCount
844  << " with a maximum occupancy of " << MaxOccupancy << '\n'
845  << " and " << (LocalMemLimit - CurrentLocalMemUsage)
846  << " available for promotion\n");
847 
848  return true;
849 }
850 
851 // FIXME: Should try to pick the most likely to be profitable allocas first.
852 bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
853  // Array allocations are probably not worth handling, since an allocation of
854  // the array type is the canonical form.
855  if (!I.isStaticAlloca() || I.isArrayAllocation())
856  return false;
857 
858  const DataLayout &DL = Mod->getDataLayout();
860 
861  // First try to replace the alloca with a vector
862  Type *AllocaTy = I.getAllocatedType();
863 
864  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
865 
866  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
867  return true; // Promoted to vector.
868 
869  if (DisablePromoteAllocaToLDS)
870  return false;
871 
872  const Function &ContainingFunction = *I.getParent()->getParent();
873  CallingConv::ID CC = ContainingFunction.getCallingConv();
874 
875  // Don't promote the alloca to LDS for shader calling conventions as the work
876  // item ID intrinsics are not supported for these calling conventions.
877  // Furthermore not all LDS is available for some of the stages.
878  switch (CC) {
881  break;
882  default:
883  LLVM_DEBUG(
884  dbgs()
885  << " promote alloca to LDS not supported with calling convention.\n");
886  return false;
887  }
888 
889  // Not likely to have sufficient local memory for promotion.
890  if (!SufficientLDS)
891  return false;
892 
893  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
894  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
895 
896  Align Alignment =
897  DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
898 
899  // FIXME: This computed padding is likely wrong since it depends on inverse
900  // usage order.
901  //
902  // FIXME: It is also possible that if we're allowed to use all of the memory
903  // could could end up using more than the maximum due to alignment padding.
904 
905  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
906  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
907  NewSize += AllocSize;
908 
909  if (NewSize > LocalMemLimit) {
910  LLVM_DEBUG(dbgs() << " " << AllocSize
911  << " bytes of local memory not available to promote\n");
912  return false;
913  }
914 
915  CurrentLocalMemUsage = NewSize;
916 
917  std::vector<Value*> WorkList;
918 
919  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
920  LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
921  return false;
922  }
923 
924  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");
925 
926  Function *F = I.getParent()->getParent();
927 
928  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
929  GlobalVariable *GV = new GlobalVariable(
930  *Mod, GVTy, false, GlobalValue::InternalLinkage,
931  UndefValue::get(GVTy),
932  Twine(F->getName()) + Twine('.') + I.getName(),
933  nullptr,
937  GV->setAlignment(MaybeAlign(I.getAlignment()));
938 
939  Value *TCntY, *TCntZ;
940 
941  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
942  Value *TIdX = getWorkitemID(Builder, 0);
943  Value *TIdY = getWorkitemID(Builder, 1);
944  Value *TIdZ = getWorkitemID(Builder, 2);
945 
946  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
947  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
948  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
949  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
950  TID = Builder.CreateAdd(TID, TIdZ);
951 
952  Value *Indices[] = {
954  TID
955  };
956 
957  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
958  I.mutateType(Offset->getType());
959  I.replaceAllUsesWith(Offset);
960  I.eraseFromParent();
961 
962  SmallVector<IntrinsicInst *> DeferredIntrs;
963 
964  for (Value *V : WorkList) {
965  CallInst *Call = dyn_cast<CallInst>(V);
966  if (!Call) {
967  if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
968  Value *Src0 = CI->getOperand(0);
970  cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);
971 
972  if (isa<ConstantPointerNull>(CI->getOperand(0)))
973  CI->setOperand(0, ConstantPointerNull::get(NewTy));
974 
975  if (isa<ConstantPointerNull>(CI->getOperand(1)))
976  CI->setOperand(1, ConstantPointerNull::get(NewTy));
977 
978  continue;
979  }
980 
981  // The operand's value should be corrected on its own and we don't want to
982  // touch the users.
983  if (isa<AddrSpaceCastInst>(V))
984  continue;
985 
987  cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);
988 
989  // FIXME: It doesn't really make sense to try to do this for all
990  // instructions.
991  V->mutateType(NewTy);
992 
993  // Adjust the types of any constant operands.
994  if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
995  if (isa<ConstantPointerNull>(SI->getOperand(1)))
996  SI->setOperand(1, ConstantPointerNull::get(NewTy));
997 
998  if (isa<ConstantPointerNull>(SI->getOperand(2)))
999  SI->setOperand(2, ConstantPointerNull::get(NewTy));
1000  } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
1001  for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
1002  if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
1003  Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
1004  }
1005  }
1006 
1007  continue;
1008  }
1009 
1010  IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
1011  Builder.SetInsertPoint(Intr);
1012  switch (Intr->getIntrinsicID()) {
1013  case Intrinsic::lifetime_start:
1014  case Intrinsic::lifetime_end:
1015  // These intrinsics are for address space 0 only
1016  Intr->eraseFromParent();
1017  continue;
1018  case Intrinsic::memcpy:
1019  case Intrinsic::memmove:
1020  // These have 2 pointer operands. In case if second pointer also needs
1021  // to be replaced we defer processing of these intrinsics until all
1022  // other values are processed.
1023  DeferredIntrs.push_back(Intr);
1024  continue;
1025  case Intrinsic::memset: {
1026  MemSetInst *MemSet = cast<MemSetInst>(Intr);
1027  Builder.CreateMemSet(
1028  MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
1029  MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
1030  Intr->eraseFromParent();
1031  continue;
1032  }
1033  case Intrinsic::invariant_start:
1034  case Intrinsic::invariant_end:
1035  case Intrinsic::launder_invariant_group:
1036  case Intrinsic::strip_invariant_group:
1037  Intr->eraseFromParent();
1038  // FIXME: I think the invariant marker should still theoretically apply,
1039  // but the intrinsics need to be changed to accept pointers with any
1040  // address space.
1041  continue;
1042  case Intrinsic::objectsize: {
1043  Value *Src = Intr->getOperand(0);
1044  Function *ObjectSize = Intrinsic::getDeclaration(
1045  Mod, Intrinsic::objectsize,
1046  {Intr->getType(),
1048  cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});
1049 
1050  CallInst *NewCall = Builder.CreateCall(
1051  ObjectSize,
1052  {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
1053  Intr->replaceAllUsesWith(NewCall);
1054  Intr->eraseFromParent();
1055  continue;
1056  }
1057  default:
1058  Intr->print(errs());
1059  llvm_unreachable("Don't know how to promote alloca intrinsic use.");
1060  }
1061  }
1062 
1063  for (IntrinsicInst *Intr : DeferredIntrs) {
1064  Builder.SetInsertPoint(Intr);
1065  Intrinsic::ID ID = Intr->getIntrinsicID();
1066  assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
1067 
1068  MemTransferInst *MI = cast<MemTransferInst>(Intr);
1069  auto *B =
1070  Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
1071  MI->getRawSource(), MI->getSourceAlign(),
1072  MI->getLength(), MI->isVolatile());
1073 
1074  for (unsigned I = 0; I != 2; ++I) {
1075  if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
1076  B->addDereferenceableParamAttr(I, Bytes);
1077  }
1078  }
1079 
1080  Intr->eraseFromParent();
1081  }
1082 
1083  return true;
1084 }
1085 
1086 bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
1087  // Array allocations are probably not worth handling, since an allocation of
1088  // the array type is the canonical form.
1089  if (!I.isStaticAlloca() || I.isArrayAllocation())
1090  return false;
1091 
1092  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
1093 
1094  Module *Mod = I.getParent()->getParent()->getParent();
1095  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
1096 }
1097 
1099  if (DisablePromoteAllocaToVector)
1100  return false;
1101 
1103  if (!ST.isPromoteAllocaEnabled())
1104  return false;
1105 
1106  unsigned MaxVGPRs;
1107  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
1108  const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
1109  MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
1110  } else {
1111  MaxVGPRs = 128;
1112  }
1113 
1114  bool Changed = false;
1115  BasicBlock &EntryBB = *F.begin();
1116 
1118  for (Instruction &I : EntryBB) {
1119  if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
1120  Allocas.push_back(AI);
1121  }
1122 
1123  for (AllocaInst *AI : Allocas) {
1124  if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
1125  Changed = true;
1126  }
1127 
1128  return Changed;
1129 }
1130 
1132  if (skipFunction(F))
1133  return false;
1134  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
1135  return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
1136  }
1137  return false;
1138 }
1139 
1142  bool Changed = promoteAllocasToVector(F, TM);
1143  if (Changed) {
1144  PreservedAnalyses PA;
1145  PA.preserveSet<CFGAnalyses>();
1146  return PA;
1147  }
1148  return PreservedAnalyses::all();
1149 }
1150 
1152  return new AMDGPUPromoteAlloca();
1153 }
1154 
1156  return new AMDGPUPromoteAllocaToVector();
1157 }
llvm::PreservedAnalyses
A set of analyses that are preserved following a run of a transformation pass.
Definition: PassManager.h:155
llvm::alignTo
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition: Alignment.h:148
llvm::MemIntrinsicBase::getDestAlignment
unsigned getDestAlignment() const
FIXME: Remove this function once transition to Align is over.
Definition: IntrinsicInst.h:660
MI
IRTranslator LLVM IR MI
Definition: IRTranslator.cpp:103
llvm
---------------------— PointerInfo ------------------------------------—
Definition: AllocatorList.h:23
llvm::AMDGPUPromoteAllocaToVectorPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: AMDGPUPromoteAlloca.cpp:1141
tryPromoteAllocaToVector
static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL, unsigned MaxVGPRs)
Definition: AMDGPUPromoteAlloca.cpp:413
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:112
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1379
llvm::Type::isPointerTy
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:228
llvm::MemTransferInst
This class wraps the llvm.memcpy/memmove intrinsics.
Definition: IntrinsicInst.h:917
llvm::Function
Definition: Function.h:61
Pass.h
INITIALIZE_PASS_BEGIN
INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE, "AMDGPU promote alloca to vector or LDS", false, false) INITIALIZE_PASS_END(AMDGPUPromoteAlloca
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:52
llvm::PointerType::get
static PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Definition: Type.cpp:691
llvm::GlobalValue::NotThreadLocal
@ NotThreadLocal
Definition: GlobalValue.h:179
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::GlobalObject::getAlign
MaybeAlign getAlign() const
Returns the alignment of the given variable or function.
Definition: GlobalObject.h:80
llvm::Triple::amdgcn
@ amdgcn
Definition: Triple.h:72
CaptureTracking.h
llvm::IRBuilder<>
llvm::GlobalVariable
Definition: GlobalVariable.h:40
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:687
ValueTracking.h
llvm::Triple
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:45
llvm::LoadInst::getPointerOperandType
Type * getPointerOperandType() const
Definition: Instructions.h:270
to
Should compile to
Definition: README.txt:449
GEPToVectorIndex
static Value * GEPToVectorIndex(GetElementPtrInst *GEP)
Definition: AMDGPUPromoteAlloca.cpp:351
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
llvm::GlobalValue::UnnamedAddr::Global
@ Global
Offset
uint64_t Offset
Definition: ELFObjHandler.cpp:81
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:31
llvm::errs
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
Definition: raw_ostream.cpp:892
llvm::Intrinsic::not_intrinsic
@ not_intrinsic
Definition: Intrinsics.h:45
llvm::GlobalValue::setUnnamedAddr
void setUnnamedAddr(UnnamedAddr Val)
Definition: GlobalValue.h:212
llvm::SmallVectorImpl::pop_back_val
LLVM_NODISCARD T pop_back_val()
Definition: SmallVector.h:635
llvm::ArrayType
Class to represent array types.
Definition: DerivedTypes.h:357
llvm::LoadInst::getPointerOperand
Value * getPointerOperand()
Definition: Instructions.h:267
llvm::FixedVectorType
Class to represent fixed width SIMD vectors.
Definition: DerivedTypes.h:525
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:203
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:101
llvm::MDNode::get
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1208
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::MemSetBase::getValue
Value * getValue() const
Definition: IntrinsicInst.h:766
llvm::CallingConv::AMDGPU_KERNEL
@ AMDGPU_KERNEL
Calling convention for AMDGPU code object kernels.
Definition: CallingConv.h:216
llvm::Instruction::setMetadata
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1336
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:79
llvm::Instruction::getOpcode
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:160
promoteAllocasToVector
bool promoteAllocasToVector(Function &F, TargetMachine &TM)
Definition: AMDGPUPromoteAlloca.cpp:1098
DEBUG_TYPE
#define DEBUG_TYPE
Definition: AMDGPUPromoteAlloca.cpp:25
llvm::AMDGPUSubtarget::get
static const AMDGPUSubtarget & get(const MachineFunction &MF)
Definition: AMDGPUSubtarget.cpp:1108
TargetMachine.h
llvm::ArrayType::getNumElements
uint64_t getNumElements() const
Definition: DerivedTypes.h:369
llvm::Module::globals
iterator_range< global_iterator > globals()
Definition: Module.h:611
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:113
GCNSubtarget.h
E
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
llvm::SmallVectorImpl::append
void append(in_iter in_start, in_iter in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:648
llvm::User
Definition: User.h:44
Intr
unsigned Intr
Definition: AMDGPUBaseInfo.cpp:1987
C
(vector float) vec_cmpeq(*A, *B) C
Definition: README_ALTIVEC.txt:86
isCallPromotable
static bool isCallPromotable(CallInst *CI)
Definition: AMDGPUPromoteAlloca.cpp:558
Y
static GCMetadataPrinterRegistry::Add< OcamlGCMetadataPrinter > Y("ocaml", "ocaml 3.10-compatible collector")
llvm::AnalysisUsage
Represent the analysis usage information of a pass.
Definition: PassAnalysisSupport.h:47
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:237
false
Definition: StackSlotColoring.cpp:142
llvm::MaybeAlign
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:109
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::Instruction
Definition: Instruction.h:45
llvm::UndefValue::get
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1771
llvm::FunctionType::params
ArrayRef< Type * > params() const
Definition: DerivedTypes.h:130
llvm::getUnderlyingObject
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4377
Align
uint64_t Align
Definition: ELFObjHandler.cpp:83
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:648
llvm::GlobalValue::InternalLinkage
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:55
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::MCID::Call
@ Call
Definition: MCInstrDesc.h:153
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::None
const NoneType None
Definition: None.h:23
llvm::CallingConv::ID
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
llvm::MemSetInst
This class wraps the llvm.memset intrinsic.
Definition: IntrinsicInst.h:905
INITIALIZE_PASS_END
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition: PassSupport.h:58
llvm::Triple::AMDHSA
@ AMDHSA
Definition: Triple.h:190
llvm::VectorType::isValidElementType
static bool isValidElementType(Type *ElemTy)
Return true if the specified type is valid as a element type.
Definition: Type.cpp:639
llvm::CallBase::addDereferenceableRetAttr
void addDereferenceableRetAttr(uint64_t Bytes)
adds the dereferenceable attribute to the list of attributes.
Definition: InstrTypes.h:1584
llvm::cl::opt< bool >
llvm::StoreInst
An instruction for storing to memory.
Definition: Instructions.h:304
llvm::AMDGPUSubtarget
Definition: AMDGPUSubtarget.h:29
llvm::Constant
This is an important base class in LLVM.
Definition: Constant.h:41
llvm::Instruction::eraseFromParent
SymbolTableList< Instruction >::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:78
llvm::ICmpInst
This instruction compares its operands according to the predicate given to the constructor.
Definition: Instructions.h:1203
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
uint64_t
llvm::Function::getCallingConv
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition: Function.h:239
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
canVectorizeInst
static bool canVectorizeInst(Instruction *Inst, User *User, const DataLayout &DL)
Definition: AMDGPUPromoteAlloca.cpp:367
INITIALIZE_PASS_DEPENDENCY
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
llvm::ConstantPointerNull::get
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1757
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::GetElementPtrInst
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:928
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:632
llvm::is_contained
bool is_contained(R &&Range, const E &Element)
Wrapper function around std::find to detect if an element exists in a container.
Definition: STLExtras.h:1616
llvm::CallBase::addRetAttr
void addRetAttr(Attribute::AttrKind Kind)
Adds the attribute to the return value.
Definition: InstrTypes.h:1510
TargetPassConfig.h
calculateVectorIndex
static Value * calculateVectorIndex(Value *Ptr, const std::map< GetElementPtrInst *, Value * > &GEPIdx)
Definition: AMDGPUPromoteAlloca.cpp:341
llvm::CallingConv::SPIR_KERNEL
@ SPIR_KERNEL
SPIR_KERNEL - Calling convention for SPIR kernel functions.
Definition: CallingConv.h:152
llvm::pdb::PDB_MemoryType::Stack
@ Stack
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:354
IRBuilder.h
assert
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:79
memcpy
<%struct.s * > cast struct s *S to sbyte *< sbyte * > sbyte uint cast struct s *agg result to sbyte *< sbyte * > sbyte uint cast struct s *memtmp to sbyte *< sbyte * > sbyte uint ret void llc ends up issuing two memcpy or custom lower memcpy(of small size) to be ldmia/stmia. I think option 2 is better but the current register allocator cannot allocate a chunk of registers at a time. A feasible temporary solution is to use specific physical registers at the lowering time for small(<
SI
StandardInstrumentations SI(Debug, VerifyEach)
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::SelectInst
This class represents the LLVM 'select' instruction.
Definition: Instructions.h:1738
llvm::ArrayType::get
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:602
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MDNode
Metadata node.
Definition: Metadata.h:906
Builder
assume Assume Builder
Definition: AssumeBundleBuilder.cpp:650
llvm::User::setOperand
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
llvm::PointerMayBeCaptured
bool PointerMayBeCaptured(const Value *V, bool ReturnCaptures, bool StoreCaptures, unsigned MaxUsesToExplore=0)
PointerMayBeCaptured - Return true if this pointer value may be captured by the enclosing function (w...
Definition: CaptureTracking.cpp:155
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:357
llvm::AnalysisUsage::setPreservesCFG
void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition: Pass.cpp:253
llvm::StringRef
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:58
AMDGPU.h
llvm::LoadInst::isSimple
bool isSimple() const
Definition: Instructions.h:259
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:256
llvm::CFGAnalyses
Represents analyses that only rely on functions' control flow.
Definition: PassManager.h:116
llvm::ConstantInt::isZero
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition: Constants.h:194
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:353
llvm::Value::replaceAllUsesWith
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:532
uint32_t
DL
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Definition: AArch64SLSHardening.cpp:76
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:175
llvm::AMDGPUPromoteAllocaPass::run
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
Definition: AMDGPUPromoteAlloca.cpp:153
llvm::isAssumeLikeIntrinsic
bool isAssumeLikeIntrinsic(const Instruction *I)
Return true if it is an intrinsic that cannot be speculated but also cannot trap.
Definition: ValueTracking.cpp:509
llvm::AtomicRMWInst
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:726
llvm::SmallPtrSetImplBase::size
size_type size() const
Definition: SmallPtrSet.h:92
runOnFunction
static bool runOnFunction(Function &F, bool PostInlining)
Definition: EntryExitInstrumenter.cpp:69
llvm::createAMDGPUPromoteAllocaToVector
FunctionPass * createAMDGPUPromoteAllocaToVector()
Definition: AMDGPUPromoteAlloca.cpp:1155
llvm::Twine
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:83
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:355
llvm::Constant::getNullValue
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:348
llvm::MemIntrinsicBase::getLength
Value * getLength() const
Definition: IntrinsicInst.h:643
or
compiles or
Definition: README.txt:606
llvm::PreservedAnalyses::all
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition: PassManager.h:161
stripBitcasts
static Value * stripBitcasts(Value *V)
Definition: AMDGPUPromoteAlloca.cpp:331
llvm::Module::getContext
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:261
llvm::AMDGPUPromoteAllocaID
char & AMDGPUPromoteAllocaID
llvm::sort
void sort(IteratorTy Start, IteratorTy End)
Definition: STLExtras.h:1492
llvm::Type::getPointerTo
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
Definition: Type.cpp:738
llvm::createAMDGPUPromoteAlloca
FunctionPass * createAMDGPUPromoteAlloca()
Definition: AMDGPUPromoteAlloca.cpp:1151
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:45
llvm::GlobalValue::getAddressSpace
unsigned getAddressSpace() const
Definition: Globals.cpp:112
llvm::PreservedAnalyses::preserveSet
void preserveSet()
Mark an analysis set as preserved.
Definition: PassManager.h:191
llvm::MemIntrinsic::isVolatile
bool isVolatile() const
Definition: IntrinsicInst.h:883
llvm::PointerType::getWithSamePointeeType
static PointerType * getWithSamePointeeType(PointerType *PT, unsigned AddressSpace)
This constructs a pointer type with the same pointee type as input PointerType (or opaque pointer is ...
Definition: DerivedTypes.h:666
llvm::Type::getPointerElementType
Type * getPointerElementType() const
Definition: Type.h:380
N
#define N
llvm::AMDGPUPromoteAllocaToVectorID
char & AMDGPUPromoteAllocaToVectorID
Users
iv Induction Variable Users
Definition: IVUsers.cpp:52
llvm::PHINode
Definition: Instructions.h:2633
llvm::MemIntrinsicBase::getRawDest
Value * getRawDest() const
Definition: IntrinsicInst.h:637
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:401
llvm::Pass::getAnalysisUsage
virtual void getAnalysisUsage(AnalysisUsage &) const
getAnalysisUsage - This function should be overriden by passes that need analysis information to do t...
Definition: Pass.cpp:93
llvm::AnalysisManager
A container for analyses that lazily runs them and caches their results.
Definition: InstructionSimplify.h:44
llvm::GlobalValue::getValueType
Type * getValueType() const
Definition: GlobalValue.h:273
TM
const char LLVMTargetMachineRef TM
Definition: PassBuilderBindings.cpp:47
llvm::FunctionPass
FunctionPass class - This class is used to implement most global optimizations.
Definition: Pass.h:298
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1475
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:172
llvm::Type::isAggregateType
bool isAggregateType() const
Return true if the type is an aggregate type.
Definition: Type.h:267
llvm::AllocaInst
an instruction to allocate memory on the stack
Definition: Instructions.h:62
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:414
Mod
Module * Mod
Definition: PassBuilderBindings.cpp:54
llvm::GlobalObject::setAlignment
void setAlignment(MaybeAlign Align)
Definition: Globals.cpp:117
arrayTypeToVecType
static FixedVectorType * arrayTypeToVecType(ArrayType *ArrayTy)
Definition: AMDGPUPromoteAlloca.cpp:326
handlePromoteAllocaToVector
bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs)
Definition: AMDGPUPromoteAlloca.cpp:1086
llvm::SmallVectorImpl::reserve
void reserve(size_type N)
Definition: SmallVector.h:624
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::AtomicCmpXchgInst
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:521
llvm::Value::users
iterator_range< user_iterator > users()
Definition: Value.h:422
llvm::ArrayType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:370
llvm::FunctionType
Class to represent function types.
Definition: DerivedTypes.h:103
llvm::Use
A Use represents the edge between a Value definition and its users.
Definition: Use.h:44
llvm::SmallVectorImpl::emplace_back
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:908
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
INITIALIZE_PASS
TargetPassConfig.
Definition: TargetPassConfig.cpp:359