//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//
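
// Rough illustration (IR names are illustrative): a private array such as
//
//   %stack = alloca [4 x i32], align 4, addrspace(5)
//
// either becomes a <4 x i32> whose element accesses are rewritten to
// extractelement/insertelement, or is replaced with a per-workitem slice of
// an LDS global of type [WorkGroupSize x [4 x i32]] in addrspace(3).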

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

static cl::opt<bool> DisablePromoteAllocaToVector(
    "disable-promote-alloca-to-vector",
    cl::desc("Disable promote alloca to vector"),
    cl::init(false));

static cl::opt<bool> DisablePromoteAllocaToLDS(
    "disable-promote-alloca-to-lds",
    cl::desc("Disable promote alloca to LDS"),
    cl::init(false));

static cl::opt<unsigned> PromoteAllocaToVectorLimit(
    "amdgpu-promote-alloca-to-vector-limit",
    cl::desc("Maximum byte size to consider promote alloca to vector"),
    cl::init(0));
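
// For example, passing -amdgpu-promote-alloca-to-vector-limit=512 (value
// chosen for illustration) replaces the MaxVGPRs-derived budget in
// tryPromoteAllocaToVector with a fixed 512-byte cap.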

// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAlloca() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

class AMDGPUPromoteAllocaImpl {
private:
  const TargetMachine &TM;
  Module *Mod = nullptr;
  const DataLayout *DL = nullptr;

  // FIXME: This should be per-kernel.
  uint32_t LocalMemLimit = 0;
  uint32_t CurrentLocalMemUsage = 0;
  unsigned MaxVGPRs;

  bool IsAMDGCN = false;
  bool IsAMDHSA = false;

  std::pair<Value *, Value *> getLocalSizeYZ(IRBuilder<> &Builder);
  Value *getWorkitemID(IRBuilder<> &Builder, unsigned N);

  /// BaseAlloca is the alloca root the search started from.
  /// Val may be that alloca or a recursive user of it.
  bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
                               std::vector<Value *> &WorkList) const;

  /// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
  /// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
  /// Returns true if both operands are derived from the same alloca. Val
  /// should be the same value as one of the input operands of UseInst.
  bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
                                       Instruction *UseInst,
                                       int OpIdx0, int OpIdx1) const;

  /// Check whether we have enough local memory for promotion.
  bool hasSufficientLocalMem(const Function &F);

  bool handleAlloca(AllocaInst &I, bool SufficientLDS);

public:
  AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
  bool run(Function &F);
};

class AMDGPUPromoteAllocaToVector : public FunctionPass {
public:
  static char ID;

  AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "AMDGPU Promote Alloca to vector";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    FunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;
char AMDGPUPromoteAllocaToVector::ID = 0;

INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
                      "AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
                    "AMDGPU promote alloca to vector or LDS", false, false)

INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
                "AMDGPU promote alloca to vector", false, false)

char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;

bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
    return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
  }
  return false;
}

PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

bool AMDGPUPromoteAllocaImpl::run(Function &F) {
  Mod = F.getParent();
  DL = &Mod->getDataLayout();

  const Triple &TT = TM.getTargetTriple();
  IsAMDGCN = TT.getArch() == Triple::amdgcn;
  IsAMDHSA = TT.getOS() == Triple::AMDHSA;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  if (IsAMDGCN) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
  } else {
    MaxVGPRs = 128;
  }

  bool SufficientLDS = hasSufficientLocalMem(F);
  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handleAlloca(*AI, SufficientLDS))
      Changed = true;
  }

  return Changed;
}

std::pair<Value *, Value *>
AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
  const Function &F = *Builder.GetInsertBlock()->getParent();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  if (!IsAMDHSA) {
    Function *LocalSizeYFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
    Function *LocalSizeZFn
      = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);

    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});

    ST.makeLIDRangeMetadata(LocalSizeY);
    ST.makeLIDRangeMetadata(LocalSizeZ);

    return std::make_pair(LocalSizeY, LocalSizeZ);
  }

  // We must read the size out of the dispatch pointer.
  assert(IsAMDGCN);

  // We are indexing into this struct, and want to extract the workgroup_size_*
  // fields.
  //
  //   typedef struct hsa_kernel_dispatch_packet_s {
  //     uint16_t header;
  //     uint16_t setup;
  //     uint16_t workgroup_size_x;
  //     uint16_t workgroup_size_y;
  //     uint16_t workgroup_size_z;
  //     uint16_t reserved0;
  //     uint32_t grid_size_x;
  //     uint32_t grid_size_y;
  //     uint32_t grid_size_z;
  //
  //     uint32_t private_segment_size;
  //     uint32_t group_segment_size;
  //     uint64_t kernel_object;
  //
  // #ifdef HSA_LARGE_MODEL
  //     void *kernarg_address;
  // #elif defined HSA_LITTLE_ENDIAN
  //     void *kernarg_address;
  //     uint32_t reserved1;
  // #else
  //     uint32_t reserved1;
  //     void *kernarg_address;
  // #endif
  //     uint64_t reserved2;
  //     hsa_signal_t completion_signal; // uint64_t wrapper
  //   } hsa_kernel_dispatch_packet_t
  //
  Function *DispatchPtrFn
    = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);

  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);

  // Size of the dispatch packet struct.
  DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);

  Type *I32Ty = Type::getInt32Ty(Mod->getContext());
  Value *CastDispatchPtr = Builder.CreateBitCast(
      DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));

  // We could do a single 64-bit load here, but it's likely that the basic
  // 32-bit and extract sequence is already present, and it is probably easier
  // to CSE this. The loads should be mergeable later anyway.
  Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
  LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));

  Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
  LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));

  MDNode *MD = MDNode::get(Mod->getContext(), None);
  LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
  LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
  ST.makeLIDRangeMetadata(LoadZU);

  // Extract y component. Upper half of LoadZU should be zero already.
  Value *Y = Builder.CreateLShr(LoadXY, 16);

  return std::make_pair(Y, LoadZU);
}

Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
                                              unsigned N) {
  const AMDGPUSubtarget &ST =
      AMDGPUSubtarget::get(TM, *Builder.GetInsertBlock()->getParent());
  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

  switch (N) {
  case 0:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
    break;
  case 1:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
    break;

  case 2:
    IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
                      : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
    break;
  default:
    llvm_unreachable("invalid dimension");
  }

  Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
  CallInst *CI = Builder.CreateCall(WorkitemIdFn);
  ST.makeLIDRangeMetadata(CI);

  return CI;
}

static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
  return FixedVectorType::get(ArrayTy->getElementType(),
                              ArrayTy->getNumElements());
}
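
// For example, [8 x float] becomes <8 x float>.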

static Value *stripBitcasts(Value *V) {
  while (Instruction *I = dyn_cast<Instruction>(V)) {
    if (I->getOpcode() != Instruction::BitCast)
      break;
    V = I->getOperand(0);
  }
  return V;
}

static Value *
calculateVectorIndex(Value *Ptr,
                     const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
  if (!GEP)
    return nullptr;

  auto I = GEPIdx.find(GEP);
  return I == GEPIdx.end() ? nullptr : I->second;
}

static Value *GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME: We only support simple cases.
  if (GEP->getNumOperands() != 3)
    return nullptr;

  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
  if (!I0 || !I0->isZero())
    return nullptr;

  return GEP->getOperand(2);
}
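
// Illustrative shape of a GEP this accepts (the returned index is %i):
//
//   %p = getelementptr inbounds [4 x i32],
//        [4 x i32] addrspace(5)* %alloca, i32 0, i32 %i
//
// Anything with a nonzero first index or extra indices yields nullptr, and
// the caller then refuses to vectorize the alloca.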

// Not an instruction handled below to turn into a vector.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst, User *User,
                             const DataLayout &DL) {
  switch (Inst->getOpcode()) {
  case Instruction::Load: {
    // Currently only handle the case where the pointer operand is a GEP.
    // Volatile and atomic loads cannot be vectorized.
    LoadInst *LI = cast<LoadInst>(Inst);
    if (isa<AllocaInst>(User) &&
        LI->getPointerOperandType() == User->getType() &&
        isa<VectorType>(LI->getType()))
      return true;

    Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
    if (!PtrInst)
      return false;

    return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
            PtrInst->getOpcode() == Instruction::BitCast) &&
           LI->isSimple();
  }
  case Instruction::BitCast:
    return true;
  case Instruction::Store: {
    // Must be the stored pointer operand, not a stored value, plus
    // since it should be canonical form, the User should be a GEP.
    // Volatile and atomic stores cannot be vectorized.
    StoreInst *SI = cast<StoreInst>(Inst);
    if (isa<AllocaInst>(User) &&
        SI->getPointerOperandType() == User->getType() &&
        isa<VectorType>(SI->getValueOperand()->getType()))
      return true;

    Instruction *UserInst = dyn_cast<Instruction>(User);
    if (!UserInst)
      return false;

    return (SI->getPointerOperand() == User) &&
           (UserInst->getOpcode() == Instruction::GetElementPtr ||
            UserInst->getOpcode() == Instruction::BitCast) &&
           SI->isSimple();
  }
  default:
    return false;
  }
}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
                                     unsigned MaxVGPRs) {

  if (DisablePromoteAllocaToVector) {
    LLVM_DEBUG(dbgs() << "  Promotion of alloca to vector is disabled\n");
    return false;
  }

  Type *AllocaTy = Alloca->getAllocatedType();
  auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
  if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
    if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
        ArrayTy->getNumElements() > 0)
      VectorTy = arrayTypeToVecType(ArrayTy);
  }

  // Use up to 1/4 of available register budget for vectorization.
  unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
                                              : (MaxVGPRs * 32);
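
  // Worked example (not from the source): with MaxVGPRs = 256 the budget is
  // 256 * 32 = 8192 bits, so the check below rejects any alloca larger than
  // 8192 / 4 = 2048 bits (256 bytes), i.e. one byte per available VGPR.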

  if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
    LLVM_DEBUG(dbgs() << "  Alloca too big for vectorization with "
                      << MaxVGPRs << " registers available\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  // FIXME: We also reject allocas of the form [ 2 x [ 2 x i32 ]] or
  // equivalent. Potentially these could also be promoted, but we don't
  // currently handle this case.
  if (!VectorTy || VectorTy->getNumElements() > 16 ||
      VectorTy->getNumElements() < 2) {
    LLVM_DEBUG(dbgs() << "  Cannot convert type to vector\n");
    return false;
  }

  std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
  std::vector<Value *> WorkList;
  SmallVector<User *, 8> Users(Alloca->users());
  SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
  Type *VecEltTy = VectorTy->getElementType();
  while (!Users.empty()) {
    User *AllocaUser = Users.pop_back_val();
    User *UseUser = UseUsers.pop_back_val();
    Instruction *Inst = dyn_cast<Instruction>(AllocaUser);

    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(Inst, UseUser, DL))
        return false;

      if (Inst->getOpcode() == Instruction::BitCast) {
        Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
        Type *ToTy = Inst->getType()->getPointerElementType();
        if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
            DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
          continue;

        for (User *CastUser : Inst->users()) {
          if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
            continue;
          Users.push_back(CastUser);
          UseUsers.push_back(Inst);
        }

        continue;
      }

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      LLVM_DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP
                        << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    Users.append(GEP->user_begin(), GEP->user_end());
    UseUsers.append(GEP->getNumUses(), GEP);
  }

  LLVM_DEBUG(dbgs() << "  Converting alloca to vector " << *AllocaTy << " -> "
                    << *VectorTy << '\n');

  for (Value *V : WorkList) {
    Instruction *Inst = cast<Instruction>(V);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
        break;

      Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      if (!Index)
        break;

      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      if (Inst->getType() != VecEltTy)
        ExtractElement =
            Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      StoreInst *SI = cast<StoreInst>(Inst);
      if (SI->getValueOperand()->getType() == AllocaTy ||
          SI->getValueOperand()->getType()->isVectorTy())
        break;

      Value *Ptr = SI->getPointerOperand();
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      if (!Index)
        break;

      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
      Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
      Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
      Value *Elt = SI->getValueOperand();
      if (Elt->getType() != VecEltTy)
        Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }

    default:
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}
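
// Illustrative before/after for the rewrite above (IR names are made up):
//
//   %gep = getelementptr inbounds [4 x i32],
//          [4 x i32] addrspace(5)* %alloca, i32 0, i32 %idx
//   %val = load i32, i32 addrspace(5)* %gep
//
// roughly becomes
//
//   %bc  = bitcast [4 x i32] addrspace(5)* %alloca
//          to <4 x i32> addrspace(5)*
//   %vec = load <4 x i32>, <4 x i32> addrspace(5)* %bc
//   %val = extractelement <4 x i32> %vec, i32 %idx
//
// leaving whole-vector loads/stores that later passes can promote to
// registers.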

static bool isCallPromotable(CallInst *CI) {
  IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
  if (!II)
    return false;

  switch (II->getIntrinsicID()) {
  case Intrinsic::memcpy:
  case Intrinsic::memmove:
  case Intrinsic::memset:
  case Intrinsic::lifetime_start:
  case Intrinsic::lifetime_end:
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::launder_invariant_group:
  case Intrinsic::strip_invariant_group:
  case Intrinsic::objectsize:
    return true;
  default:
    return false;
  }
}

bool AMDGPUPromoteAllocaImpl::binaryOpIsDerivedFromSameAlloca(
    Value *BaseAlloca, Value *Val, Instruction *Inst, int OpIdx0,
    int OpIdx1) const {
  // Figure out which operand is the one we might not be promoting.
  Value *OtherOp = Inst->getOperand(OpIdx0);
  if (Val == OtherOp)
    OtherOp = Inst->getOperand(OpIdx1);

  if (isa<ConstantPointerNull>(OtherOp))
    return true;

  Value *OtherObj = getUnderlyingObject(OtherOp);
  if (!isa<AllocaInst>(OtherObj))
    return false;

  // TODO: We should be able to replace undefs with the right pointer type.

  // TODO: If we know the other base object is another promotable
  // alloca, not necessarily this alloca, we can do this. The
  // important part is both must have the same address space at
  // the end.
  if (OtherObj != BaseAlloca) {
    LLVM_DEBUG(
        dbgs() << "Found a binary instruction with another alloca object\n");
    return false;
  }

  return true;
}
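
// For example, an icmp of two pointers into the same alloca, or of one such
// pointer against null, is fine (illustrative IR):
//
//   %cmp = icmp eq i32 addrspace(5)* %gep.a, null
//
// (the null operand is later rewritten to the new address space by
// handleAlloca), whereas comparing against a pointer derived from a
// different object makes the caller give up on the promotion.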

bool AMDGPUPromoteAllocaImpl::collectUsesWithPtrTypes(
    Value *BaseAlloca, Value *Val, std::vector<Value *> &WorkList) const {

  for (User *User : Val->users()) {
    if (is_contained(WorkList, User))
      continue;

    if (CallInst *CI = dyn_cast<CallInst>(User)) {
      if (!isCallPromotable(CI))
        return false;

      WorkList.push_back(User);
      continue;
    }

    Instruction *UseInst = cast<Instruction>(User);
    if (UseInst->getOpcode() == Instruction::PtrToInt)
      return false;

    if (LoadInst *LI = dyn_cast<LoadInst>(UseInst)) {
      if (LI->isVolatile())
        return false;

      continue;
    }

    if (StoreInst *SI = dyn_cast<StoreInst>(UseInst)) {
      if (SI->isVolatile())
        return false;

      // Reject if the stored value is not the pointer operand.
      if (SI->getPointerOperand() != Val)
        return false;
    } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UseInst)) {
      if (RMW->isVolatile())
        return false;
    } else if (AtomicCmpXchgInst *CAS = dyn_cast<AtomicCmpXchgInst>(UseInst)) {
      if (CAS->isVolatile())
        return false;
    }

    // Only promote an icmp if we know that the other operand is derived
    // from a pointer that will also be promoted.
    if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, ICmp, 0, 1))
        return false;

      // May need to rewrite constant operands.
      WorkList.push_back(ICmp);
    }

    if (UseInst->getOpcode() == Instruction::AddrSpaceCast) {
      // Give up if the pointer may be captured.
      if (PointerMayBeCaptured(UseInst, true, true))
        return false;
      // Don't collect the users of this.
      WorkList.push_back(User);
      continue;
    }

    if (!User->getType()->isPointerTy())
      continue;

    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UseInst)) {
      // Be conservative if an address could be computed outside the bounds of
      // the alloca.
      if (!GEP->isInBounds())
        return false;
    }

    // Only promote a select if we know that the other select operand is from
    // another pointer that will also be promoted.
    if (SelectInst *SI = dyn_cast<SelectInst>(UseInst)) {
      if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, SI, 1, 2))
        return false;
    }

    // Repeat for phis.
    if (PHINode *Phi = dyn_cast<PHINode>(UseInst)) {
      // TODO: Handle more complex cases. We should be able to replace loops
      // over arrays.
      switch (Phi->getNumIncomingValues()) {
      case 1:
        break;
      case 2:
        if (!binaryOpIsDerivedFromSameAlloca(BaseAlloca, Val, Phi, 0, 1))
          return false;
        break;
      default:
        return false;
      }
    }

    WorkList.push_back(User);
    if (!collectUsesWithPtrTypes(BaseAlloca, User, WorkList))
      return false;
  }

  return true;
}
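
// An escaping use defeats this walk; e.g. (illustrative)
//
//   %i = ptrtoint i32 addrspace(5)* %gep to i64
//
// returns false above, because the integer could be cast back to a pointer
// that the rewrite in handleAlloca would never see.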

bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {

  FunctionType *FTy = F.getFunctionType();
  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);

  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (Type *ParamTy : FTy->params()) {
    PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemLimit = 0;
      LLVM_DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                           "local memory disabled.\n");
      return false;
    }
  }

  LocalMemLimit = ST.getLocalMemorySize();
  if (LocalMemLimit == 0)
    return false;

  SmallVector<const Constant *, 16> Stack;
  SmallPtrSet<const Constant *, 8> VisitedConstants;
  SmallPtrSet<const GlobalVariable *, 8> UsedLDS;

  auto visitUsers = [&](const GlobalVariable *GV, const Constant *Val) -> bool {
    for (const User *U : Val->users()) {
      if (const Instruction *Use = dyn_cast<Instruction>(U)) {
        if (Use->getParent()->getParent() == &F)
          return true;
      } else {
        const Constant *C = cast<Constant>(U);
        if (VisitedConstants.insert(C).second)
          Stack.push_back(C);
      }
    }

    return false;
  };

  for (GlobalVariable &GV : Mod->globals()) {
    if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
      continue;

    if (visitUsers(&GV, &GV)) {
      UsedLDS.insert(&GV);
      Stack.clear();
      continue;
    }

    // For any ConstantExpr uses, we need to recursively search the users until
    // we see a function.
    while (!Stack.empty()) {
      const Constant *C = Stack.pop_back_val();
      if (visitUsers(&GV, C)) {
        UsedLDS.insert(&GV);
        Stack.clear();
        break;
      }
    }
  }

  const DataLayout &DL = Mod->getDataLayout();
  SmallVector<std::pair<uint64_t, Align>, 16> AllocatedSizes;
  AllocatedSizes.reserve(UsedLDS.size());

  for (const GlobalVariable *GV : UsedLDS) {
    Align Alignment =
        DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
    uint64_t AllocSize = DL.getTypeAllocSize(GV->getValueType());
    AllocatedSizes.emplace_back(AllocSize, Alignment);
  }

  // Sort to try to estimate the worst case alignment padding.
  //
  // FIXME: We should really do something to fix the addresses to a more
  // optimal value instead.
  llvm::sort(AllocatedSizes, [](std::pair<uint64_t, Align> LHS,
                                std::pair<uint64_t, Align> RHS) {
    return LHS.second < RHS.second;
  });

  // Check how much local memory is being used by global objects.
  CurrentLocalMemUsage = 0;

  // FIXME: Try to account for padding here. The real padding and address is
  // currently determined from the inverse order of uses in the function when
  // legalizing, which could also potentially change. We try to estimate the
  // worst case here, but we probably should fix the addresses earlier.
  for (auto Alloc : AllocatedSizes) {
    CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alloc.second);
    CurrentLocalMemUsage += Alloc.first;
  }
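
  // Worked example with hypothetical globals: sizes/alignments {2, align 2}
  // and {12, align 4} sort by alignment, and the loop computes
  // alignTo(0, 2) + 2 = 2, then alignTo(2, 4) + 12 = 16 bytes of usage.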

  unsigned MaxOccupancy =
      ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);

  // Restrict local memory usage so that we don't drastically reduce occupancy,
  // unless it is already significantly reduced.

  // TODO: Have some sort of hint or other heuristics to guess occupancy based
  // on other factors.
  unsigned OccupancyHint = ST.getWavesPerEU(F).second;
  if (OccupancyHint == 0)
    OccupancyHint = 7;

  // Clamp to max value.
  OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());

  // Check the hint but ignore it if it's obviously wrong from the existing LDS
  // usage.
  MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);

  // Round up to the next tier of usage.
  unsigned MaxSizeWithWaveCount
    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);

  // Program is possibly broken by using more local mem than available.
  if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
    return false;

  LocalMemLimit = MaxSizeWithWaveCount;

  LLVM_DEBUG(dbgs() << F.getName() << " uses " << CurrentLocalMemUsage
                    << " bytes of LDS\n"
                    << "  Rounding size to " << MaxSizeWithWaveCount
                    << " with a maximum occupancy of " << MaxOccupancy << '\n'
                    << " and " << (LocalMemLimit - CurrentLocalMemUsage)
                    << " available for promotion\n");

  return true;
}

// FIXME: Should try to pick the most likely to be profitable allocas first.
bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  const DataLayout &DL = Mod->getDataLayout();
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector.
  Type *AllocaTy = I.getAllocatedType();

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
    return true; // Promoted to vector.

  if (DisablePromoteAllocaToLDS)
    return false;

  const Function &ContainingFunction = *I.getParent()->getParent();
  CallingConv::ID CC = ContainingFunction.getCallingConv();

  // Don't promote the alloca to LDS for shader calling conventions as the work
  // item ID intrinsics are not supported for these calling conventions.
  // Furthermore not all LDS is available for some of the stages.
  switch (CC) {
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    break;
  default:
    LLVM_DEBUG(
        dbgs()
        << " promote alloca to LDS not supported with calling convention.\n");
    return false;
  }

  // Not likely to have sufficient local memory for promotion.
  if (!SufficientLDS)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, ContainingFunction);
  unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;

  Align Alignment =
      DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());

  // FIXME: This computed padding is likely wrong since it depends on inverse
  // usage order.
  //
  // FIXME: It is also possible that if we're allowed to use all of the memory
  // we could end up using more than the maximum due to alignment padding.

  uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
  uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
  NewSize += AllocSize;

  if (NewSize > LocalMemLimit) {
    LLVM_DEBUG(dbgs() << "  " << AllocSize
                      << " bytes of local memory not available to promote\n");
    return false;
  }

  CurrentLocalMemUsage = NewSize;

  std::vector<Value *> WorkList;

  if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
    LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
    return false;
  }

  LLVM_DEBUG(dbgs() << "Promoting alloca to local memory\n");

  Function *F = I.getParent()->getParent();

  Type *GVTy = ArrayType::get(I.getAllocatedType(), WorkGroupSize);
  GlobalVariable *GV = new GlobalVariable(
      *Mod, GVTy, false, GlobalValue::InternalLinkage,
      UndefValue::get(GVTy),
      Twine(F->getName()) + Twine('.') + I.getName(),
      nullptr,
      GlobalVariable::NotThreadLocal,
      AMDGPUAS::LOCAL_ADDRESS);
  GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
  GV->setAlignment(MaybeAlign(I.getAlignment()));

  Value *TCntY, *TCntZ;

  std::tie(TCntY, TCntZ) = getLocalSizeYZ(Builder);
  Value *TIdX = getWorkitemID(Builder, 0);
  Value *TIdY = getWorkitemID(Builder, 1);
  Value *TIdZ = getWorkitemID(Builder, 2);

  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ, "", true, true);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ, "", true, true);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);
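
  // In closed form, the linear workitem id built above is
  //   TID = TIdX * (TCntY * TCntZ) + TIdY * TCntZ + TIdZ
  // so every workitem in the workgroup gets a distinct slice of the array.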

  Value *Indices[] = {
    Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
    TID
  };

  Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();
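
  // Illustrative result (assuming a flat workgroup size of 256 and an
  // original "alloca [4 x i32], addrspace(5)" named %stack in kernel @k):
  //
  //   @k.stack = internal unnamed_addr addrspace(3)
  //              global [256 x [4 x i32]] undef, align 4
  //   %stack = getelementptr inbounds [256 x [4 x i32]],
  //            [256 x [4 x i32]] addrspace(3)* @k.stack, i32 0, i32 %TID
  //
  // The loop below then fixes up users whose types still refer to the
  // private address space.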

  SmallVector<IntrinsicInst *> DeferredIntrs;

  for (Value *V : WorkList) {
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
        Value *Src0 = CI->getOperand(0);
        Type *EltTy = Src0->getType()->getPointerElementType();
        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

        if (isa<ConstantPointerNull>(CI->getOperand(0)))
          CI->setOperand(0, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(CI->getOperand(1)))
          CI->setOperand(1, ConstantPointerNull::get(NewTy));

        continue;
      }

      // The operand's value should be corrected on its own and we don't want
      // to touch the users.
      if (isa<AddrSpaceCastInst>(V))
        continue;

      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);

      // FIXME: It doesn't really make sense to try to do this for all
      // instructions.
      V->mutateType(NewTy);

      // Adjust the types of any constant operands.
      if (SelectInst *SI = dyn_cast<SelectInst>(V)) {
        if (isa<ConstantPointerNull>(SI->getOperand(1)))
          SI->setOperand(1, ConstantPointerNull::get(NewTy));

        if (isa<ConstantPointerNull>(SI->getOperand(2)))
          SI->setOperand(2, ConstantPointerNull::get(NewTy));
      } else if (PHINode *Phi = dyn_cast<PHINode>(V)) {
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) {
          if (isa<ConstantPointerNull>(Phi->getIncomingValue(I)))
            Phi->setIncomingValue(I, ConstantPointerNull::get(NewTy));
        }
      }

      continue;
    }

    IntrinsicInst *Intr = cast<IntrinsicInst>(Call);
    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only.
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy:
    case Intrinsic::memmove:
      // These have 2 pointer operands. In case the second pointer also needs
      // to be replaced, we defer processing of these intrinsics until all
      // other values are processed.
      DeferredIntrs.push_back(Intr);
      continue;
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(
          MemSet->getRawDest(), MemSet->getValue(), MemSet->getLength(),
          MaybeAlign(MemSet->getDestAlignment()), MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::invariant_start:
    case Intrinsic::invariant_end:
    case Intrinsic::launder_invariant_group:
    case Intrinsic::strip_invariant_group:
      Intr->eraseFromParent();
      // FIXME: I think the invariant marker should still theoretically apply,
      // but the intrinsics need to be changed to accept pointers with any
      // address space.
      continue;
    case Intrinsic::objectsize: {
      Value *Src = Intr->getOperand(0);
      Type *SrcTy = Src->getType()->getPointerElementType();
      Function *ObjectSize = Intrinsic::getDeclaration(
          Mod, Intrinsic::objectsize,
          {Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS)});

      CallInst *NewCall = Builder.CreateCall(
          ObjectSize,
          {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
      Intr->replaceAllUsesWith(NewCall);
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->print(errs());
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }

  for (IntrinsicInst *Intr : DeferredIntrs) {
    Builder.SetInsertPoint(Intr);
    Intrinsic::ID ID = Intr->getIntrinsicID();
    assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);

    MemTransferInst *MI = cast<MemTransferInst>(Intr);
    auto *B =
        Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
                                      MI->getRawSource(), MI->getSourceAlign(),
                                      MI->getLength(), MI->isVolatile());

    for (unsigned I = 1; I != 3; ++I) {
      if (uint64_t Bytes = Intr->getDereferenceableBytes(I)) {
        B->addDereferenceableAttr(I, Bytes);
      }
    }

    Intr->eraseFromParent();
  }

  return true;
}

bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
  // Array allocations are probably not worth handling, since an allocation of
  // the array type is the canonical form.
  if (!I.isStaticAlloca() || I.isArrayAllocation())
    return false;

  LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');

  Module *Mod = I.getParent()->getParent()->getParent();
  return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
}

bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
  if (DisablePromoteAllocaToVector)
    return false;

  const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
  if (!ST.isPromoteAllocaEnabled())
    return false;

  unsigned MaxVGPRs;
  if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
  } else {
    MaxVGPRs = 128;
  }

  bool Changed = false;
  BasicBlock &EntryBB = *F.begin();

  SmallVector<AllocaInst *, 16> Allocas;
  for (Instruction &I : EntryBB) {
    if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  }

  for (AllocaInst *AI : Allocas) {
    if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
      Changed = true;
  }

  return Changed;
}

bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;
  if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
    return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
  }
  return false;
}

PreservedAnalyses
AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
  bool Changed = promoteAllocasToVector(F, TM);
  if (Changed) {
    PreservedAnalyses PA;
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
  return PreservedAnalyses::all();
}

FunctionPass *llvm::createAMDGPUPromoteAlloca() {
  return new AMDGPUPromoteAlloca();
}

FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
  return new AMDGPUPromoteAllocaToVector();
}