#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "amdgpu-attributor"
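
// Number of assumed callees below which an indirect call site may be
// specialized to its possible targets.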
static cl::opt<unsigned> IndirectCallSpecializationThreshold(
    "amdgpu-indirect-call-specialization-threshold",
    cl::desc(
        "A threshold controls whether an indirect call will be specialized"),
    cl::init(3));
enum ImplicitArgumentPositions {
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
#include "AMDGPUAttributes.def"
  LAST_ARG_POS
};

enum ImplicitArgumentMask {
  NOT_IMPLICIT_INPUT = 0,
#define AMDGPU_ATTRIBUTE(Name, Str) Name = 1 << Name##_POS,
#include "AMDGPUAttributes.def"
  ALL_ARGUMENT_MASK = (1 << LAST_ARG_POS) - 1
};

#define AMDGPU_ATTRIBUTE(Name, Str) {Name, Str},
static constexpr std::pair<ImplicitArgumentMask, StringLiteral>
    ImplicitAttrs[] = {
#include "AMDGPUAttributes.def"
};
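
// Map an intrinsic to the implicit kernel inputs it requires, taking the
// subtarget's capabilities and the code object version into account.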
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
                    bool HasApertureRegs, bool SupportsGetDoorBellID,
                    unsigned CodeObjectVersion) {
  switch (ID) {
  case Intrinsic::amdgcn_workitem_id_x:
    NonKernelOnly = true;
    return WORKITEM_ID_X;
  case Intrinsic::amdgcn_workgroup_id_x:
    NonKernelOnly = true;
    return WORKGROUP_ID_X;
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::r600_read_tidig_y:
    return WORKITEM_ID_Y;
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::r600_read_tidig_z:
    return WORKITEM_ID_Z;
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::r600_read_tgid_y:
    return WORKGROUP_ID_Y;
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::r600_read_tgid_z:
    return WORKGROUP_ID_Z;
  case Intrinsic::amdgcn_cluster_id_x:
    return CLUSTER_ID_X;
  case Intrinsic::amdgcn_cluster_id_y:
    return CLUSTER_ID_Y;
  case Intrinsic::amdgcn_cluster_id_z:
    return CLUSTER_ID_Z;
  case Intrinsic::amdgcn_lds_kernel_id:
    return LDS_KERNEL_ID;
  case Intrinsic::amdgcn_dispatch_ptr:
    return DISPATCH_PTR;
  case Intrinsic::amdgcn_dispatch_id:
    return DISPATCH_ID;
  case Intrinsic::amdgcn_implicitarg_ptr:
    return IMPLICIT_ARG_PTR;
  case Intrinsic::amdgcn_queue_ptr:
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    if (HasApertureRegs)
      return NOT_IMPLICIT_INPUT;
    // Under COV5 the aperture bases come from the implicit kernel arguments;
    // earlier versions read them through the queue pointer.
    return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR
                                                    : QUEUE_PTR;
  case Intrinsic::trap:
  case Intrinsic::debugtrap:
  case Intrinsic::ubsantrap:
    if (SupportsGetDoorBellID) // The trap handler can query the doorbell ID.
      return NOT_IMPLICIT_INPUT;
    NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
    return QUEUE_PTR;
  default:
    return NOT_IMPLICIT_INPUT;
  }
}
/// Returns true if sanitizer attributes are present on a function.
static bool hasSanitizerAttributes(const Function &F) {
  return F.hasFnAttribute(Attribute::SanitizeAddress) ||
         F.hasFnAttribute(Attribute::SanitizeThread) ||
         F.hasFnAttribute(Attribute::SanitizeMemory) ||
         F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
         F.hasFnAttribute(Attribute::SanitizeMemTag);
}
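
// Per-module cache of target queries shared by all AMDGPU abstract
// attributes: subtarget properties, the HSA code object version, and a
// memoized classification of constants (LDS references, address-space casts).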
class AMDGPUInformationCache : public InformationCache {
public:
  AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
                         BumpPtrAllocator &Allocator,
                         SetVector<Function *> *CGSCC, TargetMachine &TM)
      : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
        CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}

  TargetMachine &TM;

  enum ConstantStatus : uint8_t {
    NONE = 0,
    DS_GLOBAL = 1 << 0,
    ADDR_SPACE_CAST_PRIVATE_TO_FLAT = 1 << 1,
    ADDR_SPACE_CAST_LOCAL_TO_FLAT = 1 << 2,
    ADDR_SPACE_CAST_BOTH_TO_FLAT =
        ADDR_SPACE_CAST_PRIVATE_TO_FLAT | ADDR_SPACE_CAST_LOCAL_TO_FLAT
  };
  bool hasApertureRegs(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.hasApertureRegs();
  }

  bool supportsGetDoorbellID(Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.supportsGetDoorbellID();
  }
  std::optional<std::pair<unsigned, unsigned>>
  getFlatWorkGroupSizeAttr(const Function &F) const {
    auto R = AMDGPU::getIntegerPairAttribute(F, "amdgpu-flat-work-group-size");
    if (!R)
      return std::nullopt;
    return std::make_pair(R->first, *(R->second));
  }
  std::pair<unsigned, unsigned>
  getDefaultFlatWorkGroupSize(const Function &F) const {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getDefaultFlatWorkGroupSize(F.getCallingConv());
  }
  std::pair<unsigned, unsigned>
  getMaximumFlatWorkGroupRange(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
  }
  SmallVector<unsigned> getMaxNumWorkGroups(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxNumWorkGroups(F);
  }
  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
  std::pair<unsigned, unsigned>
  getWavesPerEU(const Function &F,
                std::pair<unsigned, unsigned> FlatWorkGroupSize) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
  }
  std::optional<std::pair<unsigned, unsigned>>
  getWavesPerEUAttr(const Function &F) {
    auto Val = AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu",
                                               /*OnlyFirstRequired=*/true);
    if (!Val)
      return std::nullopt;
    if (!Val->second) {
      const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
      Val->second = ST.getMaxWavesPerEU();
    }
    return std::make_pair(Val->first, *(Val->second));
  }

  unsigned getMaxWavesPerEU(const Function &F) {
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
    return ST.getMaxWavesPerEU();
  }
  unsigned getMaxAddrSpace() const override {
    return AMDGPUAS::MAX_AMDGPU_ADDRESS;
  }
  static uint8_t visitConstExpr(const ConstantExpr *CE) {
    uint8_t Status = NONE;

    if (CE->getOpcode() == Instruction::AddrSpaceCast) {
      unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
      if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS)
        Status |= ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
      else if (SrcAS == AMDGPUAS::LOCAL_ADDRESS)
        Status |= ADDR_SPACE_CAST_LOCAL_TO_FLAT;
    }

    return Status;
  }
  /// Returns the minimum amount of LDS space used by a workgroup running
  /// function \p F.
  static unsigned getLDSSize(const Function &F) {
    return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
                                           {0, UINT32_MAX}, true)
        .first;
  }
  /// Get the constant access bitmap for \p C.
  uint8_t getConstantAccess(const Constant *C,
                            SmallPtrSetImpl<const Constant *> &Visited) {
    auto It = ConstantStatus.find(C);
    if (It != ConstantStatus.end())
      return It->second;

    uint8_t Result = 0;
    if (isDSAddress(C))
      Result = DS_GLOBAL;

    if (const auto *CE = dyn_cast<ConstantExpr>(C))
      Result |= visitConstExpr(CE);

    for (const Use &U : C->operands()) {
      const auto *OpC = dyn_cast<Constant>(U);
      if (!OpC || !Visited.insert(OpC).second)
        continue;

      Result |= getConstantAccess(OpC, Visited);
    }

    ConstantStatus[C] = Result;
    return Result;
  }
  /// Returns true if \p Fn needs the queue pointer because of \p C.
  bool needsQueuePtr(const Constant *C, Function &Fn) {
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(Fn.getCallingConv());
    bool HasAperture = hasApertureRegs(Fn);

    // If aperture registers are present, entry functions can lower the
    // address-space casts without the queue pointer.
    if (!IsNonEntryFunc && HasAperture)
      return false;

    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);

    // Accessing an LDS global from a non-entry function requires the queue
    // pointer.
    if (IsNonEntryFunc && (Access & DS_GLOBAL))
      return true;

    return !HasAperture && (Access & ADDR_SPACE_CAST_BOTH_TO_FLAT);
  }
  bool checkConstForAddrSpaceCastFromPrivate(const Constant *C) {
    SmallPtrSet<const Constant *, 8> Visited;
    uint8_t Access = getConstantAccess(C, Visited);
    return Access & ADDR_SPACE_CAST_PRIVATE_TO_FLAT;
  }
private:
  /// Used to determine if a Constant needs the queue pointer.
  DenseMap<const Constant *, uint8_t> ConstantStatus;
  const unsigned CodeObjectVersion;
};
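
// One bit per implicit kernel input. A bit that is still set when the state
// is manifested means the input is provably unused and the corresponding
// "amdgpu-no-*" string attribute can be added; removing an assumed bit
// records that the input may be needed.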
struct AAAMDAttributes
    : public StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                          AbstractAttribute> {
  using Base = StateWrapper<BitIntegerState<uint32_t, ALL_ARGUMENT_MASK, 0>,
                            AbstractAttribute>;

  AAAMDAttributes(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDAttributes &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAAMDAttributes"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDAttributes.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAAMDAttributes::ID = 0;
struct AAUniformWorkGroupSize
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAUniformWorkGroupSize(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAUniformWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  /// See AbstractAttribute::getName().
  StringRef getName() const override { return "AAUniformWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr().
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAUniformWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};
const char AAUniformWorkGroupSize::ID = 0;
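
// Function-position implementation: seeds the state from the kernel's
// "uniform-work-group-size" attribute and clamps it with the state of every
// caller.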
struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
  AAUniformWorkGroupSizeFunction(const IRPosition &IRP, Attributor &A)
      : AAUniformWorkGroupSize(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    CallingConv::ID CC = F->getCallingConv();

    if (CC != CallingConv::AMDGPU_KERNEL)
      return;

    bool InitialValue = false;
    if (F->hasFnAttribute("uniform-work-group-size"))
      InitialValue =
          F->getFnAttribute("uniform-work-group-size").getValueAsString() ==
          "true";

    if (InitialValue)
      indicateOptimisticFixpoint();
    else
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << "\n");

      const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change = Change | clampStateAndIndicateChange(this->getState(),
                                                    CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }

  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
                                      getAssumed() ? "true" : "false"));
    return A.manifestAttrs(getIRPosition(), AttrList,
                           /*ForceReplace=*/true);
  }

  bool isValidState() const override {
    // This state is always valid, even when the assumed value is false.
    return true;
  }

  const std::string getAsStr(Attributor *) const override {
    return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
};
AAUniformWorkGroupSize &
AAUniformWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAUniformWorkGroupSizeFunction(IRP, A);
  llvm_unreachable(
      "AAUniformWorkGroupSize is only valid for function position");
}
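
// Function-position implementation of AAAMDAttributes: determines which
// implicit kernel inputs a function, and everything it may call, can require,
// then manifests the corresponding "amdgpu-no-*" attributes.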
struct AAAMDAttributesFunction : public AAAMDAttributes {
  AAAMDAttributesFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDAttributes(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();

    // Sanitizer instrumentation added later in the pipeline needs the
    // hostcall buffer, implicitarg_ptr and flat scratch, so never claim they
    // are unused.
    bool HasSanitizerAttrs = hasSanitizerAttributes(*F);
    if (HasSanitizerAttrs) {
      removeAssumedBits(IMPLICIT_ARG_PTR);
      removeAssumedBits(HOSTCALL_PTR);
      removeAssumedBits(FLAT_SCRATCH_INIT);
    }

    for (auto Attr : ImplicitAttrs) {
      if (HasSanitizerAttrs &&
          (Attr.first == IMPLICIT_ARG_PTR || Attr.first == HOSTCALL_PTR ||
           Attr.first == FLAT_SCRATCH_INIT))
        continue;

      if (F->hasFnAttribute(Attr.second))
        addKnownBits(Attr.first);
    }

    if (F->isDeclaration())
      return;

    // Ignore functions with graphics calling conventions; these are not
    // allowed to have kernel arguments.
    if (AMDGPU::isGraphics(F->getCallingConv())) {
      indicatePessimisticFixpoint();
      return;
    }
  }
  ChangeStatus updateImpl(Attributor &A) override {
    Function *F = getAssociatedFunction();
    // The current assumed state used to determine a change.
    auto OrigAssumed = getAssumed();

    // Check for intrinsics and propagate attributes from callees.
    const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
        *this, this->getIRPosition(), DepClassTy::REQUIRED);
    if (!AAEdges || !AAEdges->isValidState() ||
        AAEdges->hasNonAsmUnknownCallee())
      return indicatePessimisticFixpoint();

    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    bool NeedsImplicit = false;
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
    bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
    unsigned COV = InfoCache.getCodeObjectVersion();

    for (Function *Callee : AAEdges->getOptimisticEdges()) {
      Intrinsic::ID IID = Callee->getIntrinsicID();
      if (IID == Intrinsic::not_intrinsic) {
        const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
            *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
        if (!AAAMD || !AAAMD->isValidState())
          return indicatePessimisticFixpoint();
        *this &= *AAAMD;
        continue;
      }

      bool NonKernelOnly = false;
      ImplicitArgumentMask AttrMask =
          intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
                              HasApertureRegs, SupportsGetDoorbellID, COV);

      // An intrinsic that may transfer control back into module code has to
      // be treated like an unknown callee.
      if (!Callee->hasFnAttribute(Attribute::NoCallback))
        return indicatePessimisticFixpoint();

      if ((IsNonEntryFunc || !NonKernelOnly))
        removeAssumedBits(AttrMask);
    }
    // Need implicitarg_ptr to acquire the inputs reported by the intrinsics
    // above.
    if (NeedsImplicit)
      removeAssumedBits(IMPLICIT_ARG_PTR);

    if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
      // Under COV5 the aperture bases are read from the implicit kernel
      // arguments instead of the queue pointer.
      if (COV >= 5)
        removeAssumedBits(IMPLICIT_ARG_PTR);
      else
        removeAssumedBits(QUEUE_PTR);
    }

    if (funcRetrievesMultigridSyncArg(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) &&
             "multigrid_sync_arg needs implicitarg_ptr");
      removeAssumedBits(MULTIGRID_SYNC_ARG);
    }

    if (funcRetrievesHostcallPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
      removeAssumedBits(HOSTCALL_PTR);
    }

    if (funcRetrievesHeapPtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
      removeAssumedBits(HEAP_PTR);
    }

    if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
      assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
      removeAssumedBits(QUEUE_PTR);
    }

    if (isAssumed(LDS_KERNEL_ID) && funcRetrievesLDSKernelId(A)) {
      removeAssumedBits(LDS_KERNEL_ID);
    }

    if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
      removeAssumedBits(DEFAULT_QUEUE);

    if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
      removeAssumedBits(COMPLETION_ACTION);

    if (isAssumed(FLAT_SCRATCH_INIT) && needFlatScratchInit(A))
      removeAssumedBits(FLAT_SCRATCH_INIT);

    return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
                                       : ChangeStatus::UNCHANGED;
  }
  ChangeStatus manifest(Attributor &A) override {
    SmallVector<Attribute, 8> AttrList;
    LLVMContext &Ctx = getAssociatedFunction()->getContext();

    for (auto Attr : ImplicitAttrs) {
      if (isKnown(Attr.first))
        AttrList.push_back(Attribute::get(Ctx, Attr.second));
    }

    return A.manifestAttrs(getIRPosition(), AttrList,
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << "AMDInfo[";
    for (auto Attr : ImplicitAttrs)
      if (isAssumed(Attr.first))
        OS << ' ' << Attr.second;
    OS << " ]";
    return OS.str();
  }

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
  bool checkForQueuePtr(Attributor &A) {
    Function *F = getAssociatedFunction();
    bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());

    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool NeedsQueuePtr = false;

    auto CheckAddrSpaceCasts = [&](Instruction &I) {
      unsigned SrcAS = static_cast<AddrSpaceCastInst &>(I).getSrcAddressSpace();
      if (castRequiresQueuePtr(SrcAS)) {
        NeedsQueuePtr = true;
        return false;
      }
      return true;
    };

    bool HasApertureRegs = InfoCache.hasApertureRegs(*F);

    // The queue pointer is not needed for the casts themselves if aperture
    // registers are available.
    if (!HasApertureRegs) {
      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CheckAddrSpaceCasts, *this,
                                {Instruction::AddrSpaceCast},
                                UsedAssumedInformation);

      if (NeedsQueuePtr)
        return true;
    }

    if (!IsNonEntryFunc && HasApertureRegs)
      return false;

    for (BasicBlock &BB : *F) {
      for (Instruction &I : BB) {
        for (const Use &U : I.operands()) {
          if (const auto *C = dyn_cast<Constant>(U)) {
            if (InfoCache.needsQueuePtr(C, *F))
              return true;
          }
        }
      }
    }

    return false;
  }
  bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
    auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
    AA::RangeTy Range(Pos, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }

  bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
    if (COV < 5)
      return false;
    AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
    return funcRetrievesImplicitKernelArg(A, Range);
  }
  bool funcRetrievesImplicitKernelArg(Attributor &A, AA::RangeTy Range) {
    // The implicit argument is only retrieved if some use of the
    // implicitarg_ptr intrinsic actually touches bytes inside Range; if every
    // interfering access is droppable, the argument is unused.
    auto DoesNotLeadToKernelArgLoc = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
        return true;

      const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
          *this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
      if (!PointerInfoAA || !PointerInfoAA->getState().isValidState())
        return false;

      return PointerInfoAA->forallInterferingAccesses(
          Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
            return Acc.getRemoteInst()->isDroppable();
          });
    };

    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotLeadToKernelArgLoc, *this,
                                              UsedAssumedInformation);
  }

  bool funcRetrievesLDSKernelId(Attributor &A) {
    auto DoesNotRetrieve = [&](Instruction &I) {
      auto &Call = cast<CallBase>(I);
      return Call.getIntrinsicID() != Intrinsic::amdgcn_lds_kernel_id;
    };
    bool UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(DoesNotRetrieve, *this,
                                              UsedAssumedInformation);
  }
  bool needFlatScratchInit(Attributor &A) {
    assert(isAssumed(FLAT_SCRATCH_INIT)); // only called if the bit is assumed

    // Flat scratch init is needed if there is a cast from the private address
    // space to flat.
    auto AddrSpaceCastNotFromPrivate = [](Instruction &I) {
      return cast<AddrSpaceCastInst>(I).getSrcAddressSpace() !=
             AMDGPUAS::PRIVATE_ADDRESS;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllInstructions(AddrSpaceCastNotFromPrivate, *this,
                                   {Instruction::AddrSpaceCast},
                                   UsedAssumedInformation))
      return true;

    // Check for private-to-flat casts hidden inside constant expressions.
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    Function *F = getAssociatedFunction();
    for (Instruction &I : instructions(F)) {
      for (const Use &U : I.operands()) {
        if (const auto *C = dyn_cast<Constant>(U)) {
          if (InfoCache.checkConstForAddrSpaceCastFromPrivate(C))
            return true;
        }
      }
    }

    // Finally, check call-like instructions; real callees were already folded
    // into this state in updateImpl().
    auto CheckForNoFlatScratchInit = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Function *Callee = CB.getCalledFunction();
      if (!Callee)
        return true;

      return Callee->getIntrinsicID() !=
             Intrinsic::amdgcn_addrspacecast_nonnull;
    };

    UsedAssumedInformation = false;
    return !A.checkForAllCallLikeInstructions(CheckForNoFlatScratchInit, *this,
                                              UsedAssumedInformation);
  }
};
AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDAttributesFunction(IRP, A);
  llvm_unreachable("AAAMDAttributes is only valid for function position");
}
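
// Common base for the integer-range attributes ("amdgpu-flat-work-group-size"
// and "amdgpu-waves-per-eu"): the assumed range is clamped by every caller and
// only manifested when it differs from the subtarget default.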
/// Base class to derive different size ranges.
struct AAAMDSizeRangeAttribute
    : public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
  using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;

  StringRef AttrName;

  AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
                          StringRef AttrName)
      : Base(IRP, 32), AttrName(AttrName) {}

  /// See AbstractAttribute::trackStatistics()
  void trackStatistics() const override {}
  template <class AttributeImpl>
  ChangeStatus updateImplImpl(Attributor &A) {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[" << getName() << "] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AttributeImpl>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |= clampStateAndIndicateChange(this->getState(),
                                            CallerInfo->getState());

      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }
  /// Emit the attribute only if the clamped range differs from \p Default.
  ChangeStatus
  emitAttributeIfNotDefaultAfterClamp(Attributor &A,
                                      std::pair<unsigned, unsigned> Default) {
    auto [Min, Max] = Default;
    unsigned Lower = getAssumed().getLower().getZExtValue();
    unsigned Upper = getAssumed().getUpper().getZExtValue();

    if (Lower < Min)
      Lower = Min;
    if (Upper > Max + 1)
      Upper = Max + 1;

    if ((Lower == Min && Upper == Max + 1) || (Upper < Lower))
      return ChangeStatus::UNCHANGED;

    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<10> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << Lower << ',' << Upper - 1;
    return A.manifestAttrs(getIRPosition(),
                           {Attribute::get(Ctx, AttrName, OS.str())},
                           /*ForceReplace=*/true);
  }

  const std::string getAsStr(Attributor *) const override {
    std::string Str;
    raw_string_ostream OS(Str);
    OS << getName() << '[';
    OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
    OS << ']';
    return OS.str();
  }
};
struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
  AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    bool HasAttr = false;
    auto Range = InfoCache.getDefaultFlatWorkGroupSize(*F);
    auto MaxRange = InfoCache.getMaximumFlatWorkGroupRange(*F);

    if (auto Attr = InfoCache.getFlatWorkGroupSizeAttr(*F)) {
      // Only consider an existing attribute that is not the maximum range,
      // because the front end always emits the attribute, sometimes with the
      // maximum range.
      if (*Attr != MaxRange) {
        Range = *Attr;
        HasAttr = true;
      }
    }

    // Don't clamp the state directly with the maximum range; that would fix
    // it to the worst possible state.
    if (Range == MaxRange)
      return;

    auto [Min, Max] = Range;
    ConstantRange CR(APInt(32, Min), APInt(32, Max + 1));
    IntegerRangeState IRS(CR);
    clampStateAndIndicateChange(this->getState(), IRS);

    if (HasAttr || AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicateOptimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
  }
  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
                                                   Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, InfoCache.getMaximumFlatWorkGroupRange(*F));
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDFlatWorkGroupSize"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDFlatWorkGroupSize.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDFlatWorkGroupSize::ID = 0;
AAAMDFlatWorkGroupSize &
AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
                                          Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDFlatWorkGroupSize(IRP, A);
  llvm_unreachable(
      "AAAMDFlatWorkGroupSize is only valid for function position");
}
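
// Product state of three decreasing integers, one per grid dimension, used to
// track "amdgpu-max-num-workgroups".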
struct TupleDecIntegerRangeState : public AbstractState {
  DecIntegerState<uint32_t> X, Y, Z;

  bool isValidState() const override {
    return X.isValidState() && Y.isValidState() && Z.isValidState();
  }

  bool isAtFixpoint() const override {
    return X.isAtFixpoint() && Y.isAtFixpoint() && Z.isAtFixpoint();
  }

  ChangeStatus indicateOptimisticFixpoint() override {
    return X.indicateOptimisticFixpoint() | Y.indicateOptimisticFixpoint() |
           Z.indicateOptimisticFixpoint();
  }

  ChangeStatus indicatePessimisticFixpoint() override {
    return X.indicatePessimisticFixpoint() | Y.indicatePessimisticFixpoint() |
           Z.indicatePessimisticFixpoint();
  }

  TupleDecIntegerRangeState
  operator^=(const TupleDecIntegerRangeState &Other) {
    X ^= Other.X;
    Y ^= Other.Y;
    Z ^= Other.Z;
    return *this;
  }

  TupleDecIntegerRangeState &getAssumed() { return *this; }
  const TupleDecIntegerRangeState &getAssumed() const { return *this; }
};
using AAAMDMaxNumWorkgroupsState =
    StateWrapper<TupleDecIntegerRangeState, AbstractAttribute, uint32_t>;
struct AAAMDMaxNumWorkgroups
    : public StateWrapper<TupleDecIntegerRangeState, AbstractAttribute> {
  using Base = StateWrapper<TupleDecIntegerRangeState, AbstractAttribute>;

  AAAMDMaxNumWorkgroups(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    SmallVector<unsigned> MaxNumWorkgroups = InfoCache.getMaxNumWorkGroups(*F);

    X.takeKnownMinimum(MaxNumWorkgroups[0]);
    Y.takeKnownMinimum(MaxNumWorkgroups[1]);
    Z.takeKnownMinimum(MaxNumWorkgroups[2]);

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }

  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDMaxNumWorkgroups] Call " << Caller->getName()
                        << "->" << getAssociatedFunction()->getName() << '\n');

      const auto *CallerInfo = A.getAAFor<AAAMDMaxNumWorkgroups>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerInfo || !CallerInfo->isValidState())
        return false;

      Change |= clampStateAndIndicateChange(this->getState(),
                                            CallerInfo->getState());
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }
  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDMaxNumWorkgroups &createForPosition(const IRPosition &IRP,
                                                  Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    LLVMContext &Ctx = F->getContext();
    SmallString<32> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed();

    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(Ctx, "amdgpu-max-num-workgroups", OS.str())},
        /*ForceReplace=*/true);
  }

  StringRef getName() const override { return "AAAMDMaxNumWorkgroups"; }

  const std::string getAsStr(Attributor *) const override {
    std::string Buffer = "AAAMDMaxNumWorkgroupsState[";
    raw_string_ostream OS(Buffer);
    OS << X.getAssumed() << ',' << Y.getAssumed() << ',' << Z.getAssumed()
       << ']';
    return OS.str();
  }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDMaxNumWorkgroups.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  void trackStatistics() const override {}

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDMaxNumWorkgroups::ID = 0;
AAAMDMaxNumWorkgroups &
AAAMDMaxNumWorkgroups::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDMaxNumWorkgroups(IRP, A);
  llvm_unreachable(
      "AAAMDMaxNumWorkgroups is only valid for function position");
}
/// Propagate the "amdgpu-waves-per-eu" range; the result is clamped against
/// the subtarget maximum when it is manifested.
struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
  AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
      : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());

    // An explicit attribute that is not the full default range fixes the
    // state immediately.
    if (auto Attr = InfoCache.getWavesPerEUAttr(*F)) {
      std::pair<unsigned, unsigned> MaxWavesPerEURange{
          1U, InfoCache.getMaxWavesPerEU(*F)};
      if (*Attr != MaxWavesPerEURange) {
        auto [Min, Max] = *Attr;
        ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
        IntegerRangeState RangeState(Range);
        this->getState() = RangeState;
        indicateOptimisticFixpoint();
        return;
      }
    }

    if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
      indicatePessimisticFixpoint();
  }
  ChangeStatus updateImpl(Attributor &A) override {
    ChangeStatus Change = ChangeStatus::UNCHANGED;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      Function *Caller = CS.getInstruction()->getFunction();
      Function *Func = getAssociatedFunction();
      LLVM_DEBUG(dbgs() << "[AAAMDWavesPerEU] Call " << Caller->getName()
                        << "->" << Func->getName() << '\n');

      const auto *CallerAA = A.getAAFor<AAAMDWavesPerEU>(
          *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      ConstantRange Assumed = getAssumed();
      unsigned Min = std::max(Assumed.getLower().getZExtValue(),
                              CallerAA->getAssumed().getLower().getZExtValue());
      unsigned Max = std::max(Assumed.getUpper().getZExtValue(),
                              CallerAA->getAssumed().getUpper().getZExtValue());
      ConstantRange Range(APInt(32, Min), APInt(32, Max));
      IntegerRangeState RangeState(Range);
      getState() = RangeState;
      Change |= getState() == Assumed ? ChangeStatus::UNCHANGED
                                      : ChangeStatus::CHANGED;
      return true;
    };

    bool AllCallSitesKnown = true;
    if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
      return indicatePessimisticFixpoint();

    return Change;
  }
  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
                                            Attributor &A);

  ChangeStatus manifest(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
    return emitAttributeIfNotDefaultAfterClamp(
        A, {1U, InfoCache.getMaxWavesPerEU(*F)});
  }

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDWavesPerEU"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDWavesPerEU.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDWavesPerEU::ID = 0;
AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
                                                    Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
  llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}
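
// Estimate how many AGPRs an inline asm blob needs by walking its
// constraints: "a" constraints allocate virtual AGPR tuples, while explicit
// {a...} operands pin physical registers.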
static unsigned inlineAsmGetNumRequiredAGPRs(const InlineAsm *IA,
                                             const CallBase &Call) {
  const DataLayout &DL = Call.getDataLayout();
  unsigned ArgNo = 0;
  unsigned ResNo = 0;
  unsigned AGPRDefCount = 0;
  unsigned AGPRUseCount = 0;
  unsigned MaxPhysReg = 0;

  for (const InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
    Type *Ty = nullptr;
    switch (CI.Type) {
    case InlineAsm::isOutput: {
      Ty = Call.getType();
      if (auto *STy = dyn_cast<StructType>(Ty))
        Ty = STy->getElementType(ResNo);
      ++ResNo;
      break;
    }
    case InlineAsm::isInput:
      Ty = Call.getArgOperand(ArgNo++)->getType();
      break;
    case InlineAsm::isClobber:
      break;
    }

    for (StringRef Code : CI.Codes) {
      unsigned RegCount = 0;
      if (Code.starts_with("a")) {
        // Virtual AGPR tuple; the number of registers follows from the type.
        if (!Ty)
          continue;
        RegCount = divideCeil(DL.getTypeSizeInBits(Ty), 32);
      } else {
        // Physical register reference, e.g. "{a[0:3]}".
        auto [Kind, RegIdx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Code);
        if (Kind != 'a')
          continue;
        MaxPhysReg = std::max(MaxPhysReg, std::min(RegIdx + NumRegs, 256u));
        continue;
      }

      if (CI.Type == InlineAsm::isOutput) {
        AGPRDefCount = alignTo(AGPRDefCount, RegCount);
        AGPRDefCount += RegCount;
        if (CI.isEarlyClobber) {
          AGPRUseCount = alignTo(AGPRUseCount, RegCount);
          AGPRUseCount += RegCount;
        }
      } else {
        AGPRUseCount = alignTo(AGPRUseCount, RegCount);
        AGPRUseCount += RegCount;
      }
    }
  }

  unsigned MaxVirtReg = std::max(AGPRUseCount, AGPRDefCount);
  return std::min(MaxVirtReg + MaxPhysReg, 256u);
}
struct AAAMDGPUMinAGPRAlloc
    : public StateWrapper<DecIntegerState<>, AbstractAttribute> {
  using Base = StateWrapper<DecIntegerState<>, AbstractAttribute>;
  AAAMDGPUMinAGPRAlloc(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  static AAAMDGPUMinAGPRAlloc &createForPosition(const IRPosition &IRP,
                                                 Attributor &A) {
    if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
      return *new (A.Allocator) AAAMDGPUMinAGPRAlloc(IRP, A);
    llvm_unreachable(
        "AAAMDGPUMinAGPRAlloc is only valid for function position");
  }
  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    auto [MinNumAGPR, MaxNumAGPR] =
        AMDGPU::getIntegerPairAttribute(*F, "amdgpu-agpr-alloc", {~0u, ~0u},
                                        /*OnlyFirstRequired=*/true);

    // If no AGPRs were explicitly requested, there is nothing to refine.
    if (MinNumAGPR == 0)
      indicateOptimisticFixpoint();
  }

  const std::string getAsStr(Attributor *A) const override {
    std::string Str = "amdgpu-agpr-alloc=";
    raw_string_ostream OS(Str);
    OS << getAssumed();
    return OS.str();
  }

  void trackStatistics() const override {}
  ChangeStatus updateImpl(Attributor &A) override {
    DecIntegerState<> Maximum;
    // Check how many AGPRs the calls in this function may require.
    auto CheckForMinAGPRAllocs = [&](Instruction &I) {
      const auto &CB = cast<CallBase>(I);
      const Value *CalleeOp = CB.getCalledOperand();
      if (const auto *IA = dyn_cast<InlineAsm>(CalleeOp)) {
        unsigned NumRegs = inlineAsmGetNumRequiredAGPRs(IA, CB);
        Maximum.takeAssumedMaximum(NumRegs);
        return true;
      }

      switch (CB.getIntrinsicID()) {
      case Intrinsic::not_intrinsic:
        break;
      case Intrinsic::write_register:
      case Intrinsic::read_register:
      case Intrinsic::read_volatile_register: {
        // Named physical register accesses may refer to AGPRs directly.
        const MDString *RegName = cast<MDString>(
            cast<MDNode>(
                cast<MetadataAsValue>(CB.getArgOperand(0))->getMetadata())
                ->getOperand(0));
        auto [Kind, RegIdx, NumRegs] =
            AMDGPU::parseAsmPhysRegName(RegName->getString());
        if (Kind == 'a')
          Maximum.takeAssumedMaximum(std::min(RegIdx + NumRegs, 256u));
        return true;
      }
      case Intrinsic::trap:
      case Intrinsic::debugtrap:
      case Intrinsic::ubsantrap:
        // A trap lowered to a "trap-func-name" call may clobber AGPRs.
        return CB.hasFnAttr(Attribute::NoCallback) ||
               !CB.hasFnAttr("trap-func-name");
      default:
        // Assume other intrinsics need no AGPRs unless they can call back.
        return CB.hasFnAttr(Attribute::NoCallback);
      }

      // A real call: take the maximum over all possible callees.
      auto *CBEdges = A.getAAFor<AACallEdges>(
          *this, IRPosition::callsite_function(CB), DepClassTy::REQUIRED);
      if (!CBEdges || CBEdges->hasUnknownCallee()) {
        Maximum.indicatePessimisticFixpoint();
        return true;
      }

      for (const Function *PossibleCallee : CBEdges->getOptimisticEdges()) {
        const auto *CalleeInfo = A.getAAFor<AAAMDGPUMinAGPRAlloc>(
            *this, IRPosition::function(*PossibleCallee), DepClassTy::REQUIRED);
        if (!CalleeInfo || !CalleeInfo->isValidState()) {
          Maximum.indicatePessimisticFixpoint();
          return true;
        }
        Maximum.takeAssumedMaximum(CalleeInfo->getAssumed());
      }

      return true;
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallLikeInstructions(CheckForMinAGPRAllocs, *this,
                                           UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return clampStateAndIndicateChange(getState(), Maximum);
  }
  ChangeStatus manifest(Attributor &A) override {
    LLVMContext &Ctx = getAssociatedFunction()->getContext();
    SmallString<4> Buffer;
    raw_svector_ostream OS(Buffer);
    OS << getAssumed();

    return A.manifestAttrs(
        getIRPosition(), {Attribute::get(Ctx, "amdgpu-agpr-alloc", OS.str())});
  }

  StringRef getName() const override { return "AAAMDGPUMinAGPRAlloc"; }

  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUMinAGPRAlloc.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUMinAGPRAlloc::ID = 0;
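
// Tracks whether a function has known cluster dimensions so that the
// "amdgpu-cluster-dims" attribute can be propagated from kernels into the
// functions they call.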
struct AAAMDGPUClusterDims
    : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAAMDGPUClusterDims(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  /// Create an abstract attribute view for the position \p IRP.
  static AAAMDGPUClusterDims &createForPosition(const IRPosition &IRP,
                                                Attributor &A);

  /// See AbstractAttribute::getName()
  StringRef getName() const override { return "AAAMDGPUClusterDims"; }

  /// See AbstractAttribute::getIdAddr()
  const char *getIdAddr() const override { return &ID; }

  /// This function should return true if the type of the \p AA is
  /// AAAMDGPUClusterDims.
  static bool classof(const AbstractAttribute *AA) {
    return (AA->getIdAddr() == &ID);
  }

  virtual const AMDGPU::ClusterDimsAttr &getClusterDims() const = 0;

  /// Unique ID (due to the unique address)
  static const char ID;
};

const char AAAMDGPUClusterDims::ID = 0;
struct AAAMDGPUClusterDimsFunction : public AAAMDGPUClusterDims {
  AAAMDGPUClusterDimsFunction(const IRPosition &IRP, Attributor &A)
      : AAAMDGPUClusterDims(IRP, A) {}

  void initialize(Attributor &A) override {
    Function *F = getAssociatedFunction();
    assert(F && "empty associated function");

    Attr = AMDGPU::ClusterDimsAttr::get(*F);

    // For kernels the attribute (or its absence) is already final; other
    // functions have their state inferred from their callers.
    if (AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
      if (Attr.isUnknown())
        indicatePessimisticFixpoint();
      else
        indicateOptimisticFixpoint();
    }
  }

  const std::string getAsStr(Attributor *A) const override {
    return "AMDGPUClusterDims[" + Attr.to_string() + "]";
  }

  void trackStatistics() const override {}

  ChangeStatus updateImpl(Attributor &A) override {
    auto OldState = Attr;

    auto CheckCallSite = [&](AbstractCallSite CS) {
      const auto *CallerAA = A.getAAFor<AAAMDGPUClusterDims>(
          *this, IRPosition::function(*CS.getInstruction()->getFunction()),
          DepClassTy::REQUIRED);
      if (!CallerAA || !CallerAA->isValidState())
        return false;

      return merge(CallerAA->getClusterDims());
    };

    bool UsedAssumedInformation = false;
    if (!A.checkForAllCallSites(CheckCallSite, *this,
                                /*RequireAllCallSites=*/true,
                                UsedAssumedInformation))
      return indicatePessimisticFixpoint();

    return OldState == Attr ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
  }

  ChangeStatus manifest(Attributor &A) override {
    if (Attr.isUnknown())
      return ChangeStatus::UNCHANGED;
    return A.manifestAttrs(
        getIRPosition(),
        {Attribute::get(getAssociatedFunction()->getContext(), AttrName,
                        Attr.to_string())});
  }

  const AMDGPU::ClusterDimsAttr &getClusterDims() const override {
    return Attr;
  }

private:
  /// Merge a caller's cluster-dims state into this one. Returns false if the
  /// merged state is no longer usable.
  bool merge(const AMDGPU::ClusterDimsAttr &Other) {
    if (Other.isUnknown())
      return false;
    return true;
  }

  AMDGPU::ClusterDimsAttr Attr;

  static constexpr char AttrName[] = "amdgpu-cluster-dims";
};
AAAMDGPUClusterDims &
AAAMDGPUClusterDims::createForPosition(const IRPosition &IRP, Attributor &A) {
  if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
    return *new (A.Allocator) AAAMDGPUClusterDimsFunction(IRP, A);
  llvm_unreachable("AAAMDGPUClusterDims is only valid for function position");
}
// Seed the attributor with the AMDGPU abstract attributes for every function
// in the module and run it to a fixpoint.
static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM,
                    AMDGPUAttributorOptions Options,
                    ThinOrFullLTOPhase LTOPhase) {
  SetVector<Function *> Functions;
  for (Function &F : M) {
    if (!F.isIntrinsic())
      Functions.insert(&F);
  }

  CallGraphUpdater CGUpdater;
  BumpPtrAllocator Allocator;
  AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
  DenseSet<const char *> Allowed(
      {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
       &AAAMDFlatWorkGroupSize::ID,
       &AAAMDMaxNumWorkgroups::ID, &AAAMDWavesPerEU::ID,
       &AAAMDGPUMinAGPRAlloc::ID, &AAAMDGPUClusterDims::ID,
       &AACallEdges::ID, &AAPointerInfo::ID, &AAAddressSpace::ID});

  AttributorConfig AC(CGUpdater);
  AC.IsClosedWorldModule = Options.IsClosedWorld;
  AC.Allowed = &Allowed;
  AC.IsModulePass = true;
  AC.DefaultInitializeLiveInternals = false;
  AC.IndirectCalleeSpecializationCallback =
      [](Attributor &A, const AbstractAttribute &AA, CallBase &CB,
         Function &Callee, unsigned NumAssumedCallees) {
        return !AMDGPU::isEntryFunctionCC(Callee.getCallingConv()) &&
               (NumAssumedCallees <= IndirectCallSpecializationThreshold);
      };
  AC.IPOAmendableCB = [](const Function &F) {
    return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
  };

  Attributor A(Functions, InfoCache, AC);

  LLVM_DEBUG({
    StringRef LTOPhaseStr = to_string(LTOPhase);
    dbgs() << "[AMDGPUAttributor] Running at phase " << LTOPhaseStr << '\n'
           << "[AMDGPUAttributor] Module " << M.getName() << " is "
           << (AC.IsClosedWorldModule ? "" : "not ")
           << "assumed to be a closed world.\n";
  });

  for (auto *F : Functions) {
    A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(*F));
    A.getOrCreateAAFor<AAAMDMaxNumWorkgroups>(IRPosition::function(*F));

    CallingConv::ID CC = F->getCallingConv();
    const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(*F);
    if (!AMDGPU::isGraphics(CC)) {
      A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(*F));
      A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(*F));
    }

    if (!F->isDeclaration() && ST.hasClusters())
      A.getOrCreateAAFor<AAAMDGPUClusterDims>(IRPosition::function(*F));

    if (ST.hasGFX90AInsts())
      A.getOrCreateAAFor<AAAMDGPUMinAGPRAlloc>(IRPosition::function(*F));

    for (auto &I : instructions(F)) {
      Value *Ptr = nullptr;
      if (auto *LI = dyn_cast<LoadInst>(&I))
        Ptr = LI->getPointerOperand();
      else if (auto *SI = dyn_cast<StoreInst>(&I))
        Ptr = SI->getPointerOperand();
      else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
        Ptr = RMW->getPointerOperand();
      else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
        Ptr = CmpX->getPointerOperand();

      if (Ptr) {
        A.getOrCreateAAFor<AAAddressSpace>(IRPosition::value(*Ptr));
      } else if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
        if (II->getIntrinsicID() == Intrinsic::amdgcn_make_buffer_rsrc)
          A.getOrCreateAAFor<AAAddressSpace>(
              IRPosition::value(*II->getArgOperand(0)));
      }
    }
  }

  return A.run() == ChangeStatus::CHANGED;
}