#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

#define DEBUG_TYPE "openmp-opt"
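// Command-line options that gate the individual OpenMP optimizations
// implemented in this file.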
70 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
74 "openmp-opt-enable-merging",
80 cl::desc(
"Disable function internalization."),
91 "openmp-hide-memory-transfer-latency",
92 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
97 "openmp-opt-disable-deglobalization",
98 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
102 "openmp-opt-disable-spmdization",
103 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
107 "openmp-opt-disable-folding",
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
122 "openmp-opt-print-module-after",
123 cl::desc(
"Print the current module after OpenMP optimizations."),
127 "openmp-opt-print-module-before",
128 cl::desc(
"Print the current module before OpenMP optimizations."),
132 "openmp-opt-inline-device",
143 cl::desc(
"Maximal number of attributor iterations."),
148 cl::desc(
"Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits<unsigned>::max()));
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
156 "Number of OpenMP runtime functions identified");
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
176 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
179 "Number of OpenMP parallel regions merged");
181 "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX) \
  constexpr unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_IDX

#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX) \
  constexpr unsigned MEMBER##Idx = IDX;

#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX

#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
  RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
    return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
  }

#undef KERNEL_ENVIRONMENT_GETTER

#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
  ConstantInt *get##MEMBER##FromKernelEnvironment( \
      ConstantStruct *KernelEnvC) { \
    ConstantStruct *ConfigC = \
        getConfigurationFromKernelEnvironment(KernelEnvC); \
    return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
  }

#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER

constexpr int InitKernelEnvironmentArgNo = 0;
struct AAHeapToShared;
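/// OMPInformationCache caches the OpenMP runtime functions (RFIs) and
/// internal control variables (ICVs) found in the module and owns the
/// OpenMPIRBuilder used to inspect and create runtime calls.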
  OMPInformationCache(Module &M, AnalysisGetter &AG,
        OpenMPPostLink(OpenMPPostLink) {

    const Triple T(OMPBuilder.M.getTargetTriple());
    switch (T.getArch()) {
      assert(OMPBuilder.Config.IsTargetDevice &&
             "OpenMP AMDGPU/NVPTX is only prepared to deal with device code.");
      OMPBuilder.Config.IsGPU = true;
      OMPBuilder.Config.IsGPU = false;

    OMPBuilder.initialize();
    initializeRuntimeFunctions(M);
    initializeInternalControlVars();
  struct InternalControlVarInfo {
    StringRef EnvVarName;
    ConstantInt *InitValue;
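  /// Bookkeeping for a single OpenMP runtime function: its declaration,
  /// argument and return types, and the uses collected per function.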
  struct RuntimeFunctionInfo {

    using UseVector = SmallVector<Use *, 16>;

    void clearUsesMap() { UsesMap.clear(); }

    operator bool() const { return Declaration; }

    UseVector &getOrCreateUseVector(Function *F) {
      std::shared_ptr<UseVector> &UV = UsesMap[F];
        UV = std::make_shared<UseVector>();

    const UseVector *getUseVector(Function &F) const {
      auto I = UsesMap.find(&F);
      if (I != UsesMap.end())
        return I->second.get();

    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    size_t getNumArgs() const { return ArgumentTypes.size(); }

    void foreachUse(SmallVectorImpl<Function *> &SCC,
                    function_ref<bool(Use &, Function &)> CB) {
      for (Function *F : SCC)

    void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
      SmallVector<unsigned, 8> ToBeDeleted;
      UseVector &UV = getOrCreateUseVector(F);
      while (!ToBeDeleted.empty()) {

    DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;

    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }

  OpenMPIRBuilder OMPBuilder;

              RuntimeFunction::OMPRTL___last>

  DenseMap<Function *, RuntimeFunction> RuntimeFunctionIDMap;

              InternalControlVar::ICV___last>

  void initializeInternalControlVars() {
#define ICV_RT_SET(_Name, RTL) \
    auto &ICV = ICVs[_Name]; \

#define ICV_RT_GET(Name, RTL) \
    auto &ICV = ICVs[Name]; \

#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
    auto &ICV = ICVs[Enum]; \
    ICV.InitKind = Init; \
    ICV.EnvVarName = _EnvVarName; \
    switch (ICV.InitKind) { \
    case ICV_IMPLEMENTATION_DEFINED: \
      ICV.InitValue = nullptr; \
      ICV.InitValue = ConstantInt::get( \
          Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
      ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \

#include "llvm/Frontend/OpenMP/OMPKinds.def"
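  /// Return true if the declaration \p F matches the expected runtime
  /// function signature (return type and argument types).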
  static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,

    if (F->getReturnType() != RTFRetType)

    if (F->arg_size() != RTFArgTypes.size())

    auto *RTFTyIt = RTFArgTypes.begin();
    for (Argument &Arg : F->args()) {
      if (Arg.getType() != *RTFTyIt)
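  /// Collect all uses of the declaration of \p RFI, bucketed by the function
  /// that contains them; optionally update the use statistics.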
  unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
    unsigned NumUses = 0;
    if (!RFI.Declaration)

    OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);

      NumOpenMPRuntimeFunctionsIdentified += 1;
      NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();

    for (Use &U : RFI.Declaration->uses()) {
        if (!CGSCC || CGSCC->empty() || CGSCC->contains(UserI->getFunction())) {
          RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
        RFI.getOrCreateUseVector(nullptr).push_back(&U);

    auto &RFI = RFIs[RTF];
    collectUses(RFI, false);

  void recollectUses() {
    for (int Idx = 0; Idx < RFIs.size(); ++Idx)

  void setCallingConvention(FunctionCallee Callee, CallInst *CI) {

    RuntimeFunctionInfo &RFI = RFIs[Fn];
    if (!RFI.Declaration || RFI.Declaration->isDeclaration())
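  /// Identify the OpenMP runtime functions declared in \p M by matching them
  /// against the declarations generated from OMPKinds.def and record them in
  /// the RFIs map.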
  void initializeRuntimeFunctions(Module &M) {

#define OMP_TYPE(VarName, ...) \
  Type *VarName = OMPBuilder.VarName; \

#define OMP_ARRAY_TYPE(VarName, ...) \
  ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
  PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
  (void)VarName##PtrTy;

#define OMP_FUNCTION_TYPE(VarName, ...) \
  FunctionType *VarName = OMPBuilder.VarName; \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \

#define OMP_STRUCT_TYPE(VarName, ...) \
  StructType *VarName = OMPBuilder.VarName; \
  PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \

#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
    SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
    Function *F = M.getFunction(_Name); \
    RTLFunctions.insert(F); \
    if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
      RuntimeFunctionIDMap[F] = _Enum; \
      auto &RFI = RFIs[_Enum]; \
      RFI.IsVarArg = _IsVarArg; \
      RFI.ReturnType = OMPBuilder._ReturnType; \
      RFI.ArgumentTypes = std::move(ArgsTypes); \
      RFI.Declaration = F; \
      unsigned NumUses = collectUses(RFI); \
      dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
      if (RFI.Declaration) \
        dbgs() << TAG << "-> got " << NumUses << " uses in " \
               << RFI.getNumFunctionsWithUses() \
               << " different functions.\n"; \

#include "llvm/Frontend/OpenMP/OMPKinds.def"

    for (Function &F : M) {
      for (StringRef Prefix : {"__kmpc", "_ZN4ompx", "omp_"})
        if (F.hasFnAttribute(Attribute::NoInline) &&
            F.getName().starts_with(Prefix) &&
            !F.hasFnAttribute(Attribute::OptimizeNone))
          F.removeFnAttr(Attribute::NoInline);

  DenseSet<const Function *> RTLFunctions;

  bool OpenMPPostLink = false;
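/// A BooleanState paired with a set; inserting into the set can optionally
/// invalidate (pessimize) the boolean state, controlled by InsertInvalidates.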
template <typename Ty, bool InsertInvalidates = true>

  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
  bool insert(const Ty &Elem) {
    if (InsertInvalidates)
      BooleanState::indicatePessimisticFixpoint();
    return Set.insert(Elem);

  const Ty &operator[](int Idx) const { return Set[Idx]; }
  bool operator==(const BooleanStateWithSetVector &RHS) const {
    return BooleanState::operator==(RHS) && Set == RHS.Set;

  bool operator!=(const BooleanStateWithSetVector &RHS) const {
    return !(*this == RHS);

  bool empty() const { return Set.empty(); }
  size_t size() const { return Set.size(); }

  BooleanStateWithSetVector &operator^=(const BooleanStateWithSetVector &RHS) {
    BooleanState::operator^=(RHS);
    Set.insert_range(RHS.Set);

  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
  typename decltype(Set)::const_iterator begin() const { return Set.begin(); }
  typename decltype(Set)::const_iterator end() const { return Set.end(); }

template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
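/// KernelInfoState tracks, per kernel, the parallel regions it may reach,
/// whether it is compatible with SPMD execution, and the kernel
/// initialization/deinitialization calls and kernel environment describing it.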
  bool IsAtFixpoint = false;

  BooleanStateWithPtrSetVector<CallBase, false> ReachedKnownParallelRegions;

  BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;

  BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;

  CallBase *KernelInitCB = nullptr;

  ConstantStruct *KernelEnvC = nullptr;

  CallBase *KernelDeinitCB = nullptr;

  bool IsKernelEntry = false;

  BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;

  BooleanStateWithSetVector<uint8_t> ParallelLevels;

  bool NestedParallelism = false;

  KernelInfoState() = default;
  KernelInfoState(bool BestState) {
      indicatePessimisticFixpoint();

  bool isValidState() const override { return true; }

  bool isAtFixpoint() const override { return IsAtFixpoint; }

    ParallelLevels.indicatePessimisticFixpoint();
    ReachingKernelEntries.indicatePessimisticFixpoint();
    SPMDCompatibilityTracker.indicatePessimisticFixpoint();
    ReachedKnownParallelRegions.indicatePessimisticFixpoint();
    ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
    NestedParallelism = true;
    return ChangeStatus::CHANGED;

    ParallelLevels.indicateOptimisticFixpoint();
    ReachingKernelEntries.indicateOptimisticFixpoint();
    SPMDCompatibilityTracker.indicateOptimisticFixpoint();
    ReachedKnownParallelRegions.indicateOptimisticFixpoint();
    ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
    return ChangeStatus::UNCHANGED;

  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }

    if (SPMDCompatibilityTracker != RHS.SPMDCompatibilityTracker)
    if (ReachedKnownParallelRegions != RHS.ReachedKnownParallelRegions)
    if (ReachedUnknownParallelRegions != RHS.ReachedUnknownParallelRegions)
    if (ReachingKernelEntries != RHS.ReachingKernelEntries)
    if (ParallelLevels != RHS.ParallelLevels)
    if (NestedParallelism != RHS.NestedParallelism)

  bool mayContainParallelRegion() {
    return !ReachedKnownParallelRegions.empty() ||
           !ReachedUnknownParallelRegions.empty();

  static KernelInfoState getBestState() { return KernelInfoState(true); }

  static KernelInfoState getBestState(KernelInfoState &KIS) {
    return getBestState();

  static KernelInfoState getWorstState() { return KernelInfoState(false); }

  KernelInfoState operator^=(const KernelInfoState &KIS) {
    if (KIS.KernelInitCB) {
      if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
      KernelInitCB = KIS.KernelInitCB;
    if (KIS.KernelDeinitCB) {
      if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
      KernelDeinitCB = KIS.KernelDeinitCB;
    if (KIS.KernelEnvC) {
      if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
      KernelEnvC = KIS.KernelEnvC;
    SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
    ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
    ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
    NestedParallelism |= KIS.NestedParallelism;

  KernelInfoState operator&=(const KernelInfoState &KIS) {
    return (*this ^= KIS);
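/// OffloadArray models the values stored into one of the offload arrays
/// (base pointers, pointers, sizes) passed to __tgt_target_data_begin_mapper.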
  AllocaInst *Array = nullptr;

  SmallVector<Value *, 8> StoredValues;

  SmallVector<StoreInst *, 8> LastAccesses;

  OffloadArray() = default;

  bool initialize(AllocaInst &Array, Instruction &Before) {
    if (!getValues(Array, Before))

    this->Array = &Array;

  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;

  bool getValues(AllocaInst &Array, Instruction &Before) {
    const DataLayout &DL = Array.getDataLayout();
    std::optional<TypeSize> ArraySize = Array.getAllocationSize(DL);
    if (!ArraySize || !ArraySize->isFixed())

    const uint64_t NumValues = ArraySize->getFixedValue() / PointerSize;
    StoredValues.assign(NumValues, nullptr);
    LastAccesses.assign(NumValues, nullptr);

    for (Instruction &I : *BB) {
          LastAccesses[Idx] = S;

    const unsigned NumValues = StoredValues.size();
    for (unsigned I = 0; I < NumValues; ++I) {
      if (!StoredValues[I] || !LastAccesses[I])
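/// OpenMPOpt drives the OpenMP-specific optimizations; it is instantiated per
/// SCC (or per module) and applies the transformations implemented below.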
  using OptimizationRemarkGetter =
      function_ref<OptimizationRemarkEmitter &(Function *)>;

  OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
            OptimizationRemarkGetter OREGetter,
            OMPInformationCache &OMPInfoCache, Attributor &A)
      : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
        OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}

  bool remarksEnabled() {
    auto &Ctx = M.getContext();
    return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);

  bool run(bool IsModulePass) {

      Changed |= runAttributor(IsModulePass);

      OMPInfoCache.recollectUses();

      Changed |= rewriteDeviceCodeStateMachine();

      if (remarksEnabled())
        analysisGlobalization();

      Changed |= runAttributor(IsModulePass);

      OMPInfoCache.recollectUses();

      Changed |= deleteParallelRegions();

      Changed |= hideMemTransfersLatency();
      Changed |= deduplicateRuntimeCalls();

      if (mergeParallelRegions()) {
        deduplicateRuntimeCalls();

      if (OMPInfoCache.OpenMPPostLink)
        Changed |= removeRuntimeSymbols();

  void printICVs() const {

    for (Function *F : SCC) {
      for (auto ICV : ICVs) {
        auto ICVInfo = OMPInfoCache.ICVs[ICV];
        auto Remark = [&](OptimizationRemarkAnalysis ORA) {
          return ORA << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
                     << (ICVInfo.InitValue
                             ? toString(ICVInfo.InitValue->getValue(), 10, true)
                             : "IMPLEMENTATION_DEFINED");

  void printKernels() const {
    for (Function *F : SCC) {

      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "OpenMP GPU kernel "
                   << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";

  static CallInst *getCallIfRegularCall(
      Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {

  static CallInst *getCallIfRegularCall(
      Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
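  /// Try to merge adjacent parallel regions (__kmpc_fork_call sites) in a
  /// basic block; code in between the merged regions is wrapped in a
  /// master/barrier sequence.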
  bool mergeParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;
    const unsigned CallbackFirstArgOperand = 3;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

    OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
        OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],

    LoopInfo *LI = nullptr;
    DominatorTree *DT = nullptr;

    SmallDenseMap<BasicBlock *, SmallPtrSet<Instruction *, 4>> BB2PRMap;

    BasicBlock *StartBB = nullptr, *EndBB = nullptr;
    auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
      BasicBlock *CGStartBB = CodeGenIP.getBlock();
      SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
      assert(StartBB != nullptr && "StartBB should not be null");
      assert(EndBB != nullptr && "EndBB should not be null");
      EndBB->getTerminator()->setSuccessor(0, CGEndBB);

    auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
                      Value &, Value &Inner,
                      Value *&ReplacementValue) -> InsertPointTy {
      ReplacementValue = &Inner;

    auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };

    auto CreateSequentialRegion = [&](Function *OuterFn,

      BasicBlock *ParentBB = SeqStartI->getParent();
      SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
      SplitBlock(ParentBB, SeqStartI, DT, LI, nullptr, "seq.par.merged");
             "Expected a different CFG");

      auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
        BasicBlock *CGStartBB = CodeGenIP.getBlock();
        SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
        assert(SeqStartBB != nullptr && "SeqStartBB should not be null");
        assert(SeqEndBB != nullptr && "SeqEndBB should not be null");

      auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };
      for (Instruction &I : *SeqStartBB) {
        SmallPtrSet<Instruction *, 4> OutsideUsers;
        for (User *Usr : I.users()) {
            OutsideUsers.insert(&UsrI);

        if (OutsideUsers.empty())

        const DataLayout &DL = M.getDataLayout();
        AllocaInst *AllocaI = new AllocaInst(
            I.getType(), DL.getAllocaAddrSpace(), nullptr,

        new StoreInst(&I, AllocaI, SeqStartBB->getTerminator()->getIterator());

        for (Instruction *UsrI : OutsideUsers) {
          LoadInst *LoadI = new LoadInst(I.getType(), AllocaI,
                                         I.getName() + ".seq.output.load",

      OpenMPIRBuilder::LocationDescription Loc(
          InsertPointTy(ParentBB, ParentBB->end()), DL);
          OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB));
          OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel));

    auto Merge = [&](const SmallVectorImpl<CallInst *> &MergableCIs,

      assert(MergableCIs.size() > 1 && "Assumed multiple mergable CIs");

      auto Remark = [&](OptimizationRemark OR) {
        OR << "Parallel region merged with parallel region"
           << (MergableCIs.size() > 2 ? "s" : "") << " at ";
          if (CI != MergableCIs.back())

      Function *OriginalFn = BB->getParent();
                        << " parallel regions in " << OriginalFn->getName()

      EndBB = SplitBlock(BB, MergableCIs.back()->getNextNode(), DT, LI);
          SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
      assert(BB->getUniqueSuccessor() == StartBB && "Expected a different CFG");
      const DebugLoc DL = BB->getTerminator()->getDebugLoc();

      for (auto *It = MergableCIs.begin(), *End = MergableCIs.end() - 1;

        CreateSequentialRegion(OriginalFn, BB, ForkCI->getNextNode(),

      OpenMPIRBuilder::LocationDescription Loc(InsertPointTy(BB, BB->end()),
      IRBuilder<>::InsertPoint AllocaIP(

          cantFail(OMPInfoCache.OMPBuilder.createParallel(
              Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, nullptr, nullptr,
              OMP_PROC_BIND_default, false));

      OMPInfoCache.OMPBuilder.finalize(OriginalFn);

      SmallVector<Value *, 8> Args;
      for (auto *CI : MergableCIs) {
        FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;

        for (unsigned U = CallbackFirstArgOperand, E = CI->arg_size(); U < E;
              U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);

        if (CI != MergableCIs.back()) {
          cantFail(OMPInfoCache.OMPBuilder.createBarrier(

      assert(OutlinedFn != OriginalFn && "Outlining failed");
      CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
      CGUpdater.reanalyzeFunction(*OriginalFn);
      NumOpenMPParallelRegionsMerged += MergableCIs.size();
      CallInst *CI = getCallIfRegularCall(U, &RFI);

    RFI.foreachUse(SCC, DetectPRsCB);

    for (auto &It : BB2PRMap) {
      auto &CIs = It.getSecond();

      auto IsMergable = [&](Instruction &I, bool IsBeforeMergableRegion) {
        if (I.isTerminator())

        if (IsBeforeMergableRegion) {
          if (!CalledFunction)
          for (const auto &RFI : UnmergableCallsInfo) {
            if (CalledFunction == RFI.Declaration)

      for (auto It = BB->begin(), End = BB->end(); It != End;) {
        if (CIs.count(&I)) {
        if (IsMergable(I, MergableCIs.empty()))
          for (; It != End; ++It) {
            if (CIs.count(&SkipI)) {
                                << " due to " << I << "\n");

        if (MergableCIs.size() > 1) {
          MergableCIsVector.push_back(MergableCIs);
                            << " parallel regions in block " << BB->getName()
        MergableCIs.clear();

      if (!MergableCIsVector.empty()) {
        for (auto &MergableCIs : MergableCIsVector)
          Merge(MergableCIs, BB);
        MergableCIsVector.clear();

      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
      OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
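  /// Delete __kmpc_fork_call calls whose outlined parallel region provably
  /// has no side effects (it only reads memory and will return).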
  bool deleteParallelRegions() {
    const unsigned CallbackCalleeOperand = 2;

    OMPInformationCache::RuntimeFunctionInfo &RFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];

    if (!RFI.Declaration)

      CallInst *CI = getCallIfRegularCall(U);
      if (!Fn->onlyReadsMemory())
      if (!Fn->hasFnAttribute(Attribute::WillReturn))

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "Removing parallel region with no side-effects.";

      ++NumOpenMPParallelRegionsDeleted;

    RFI.foreachUse(SCC, DeleteCallCB);
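  /// Deduplicate side-effect-free OpenMP runtime calls (ICV getters and
  /// global thread ID queries) within each function of the SCC.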
  bool deduplicateRuntimeCalls() {

        OMPRTL_omp_get_num_threads,
        OMPRTL_omp_in_parallel,
        OMPRTL_omp_get_cancellation,
        OMPRTL_omp_get_supported_active_levels,
        OMPRTL_omp_get_level,
        OMPRTL_omp_get_ancestor_thread_num,
        OMPRTL_omp_get_team_size,
        OMPRTL_omp_get_active_level,
        OMPRTL_omp_in_final,
        OMPRTL_omp_get_proc_bind,
        OMPRTL_omp_get_num_places,
        OMPRTL_omp_get_num_procs,
        OMPRTL_omp_get_place_num,
        OMPRTL_omp_get_partition_num_places,
        OMPRTL_omp_get_partition_place_nums};

    SmallSetVector<Value *, 16> GTIdArgs;
    collectGlobalThreadIdArguments(GTIdArgs);
                      << " global thread ID arguments\n");

    for (Function *F : SCC) {
      for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);

      Value *GTIdArg = nullptr;
      for (Argument &Arg : F->args())
        if (GTIdArgs.count(&Arg)) {

        Changed |= deduplicateRuntimeCalls(
            *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);

  bool removeRuntimeSymbols() {
    if (GlobalVariable *GV = M.getNamedGlobal("__llvm_rpc_client")) {
      if (GV->hasNUsesOrMore(1))

      GV->eraseFromParent();
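  /// Split __tgt_target_data_begin_mapper calls into an issue/wait pair so
  /// the host-to-device data transfer can overlap with other work.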
  bool hideMemTransfersLatency() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];

      auto *RTCall = getCallIfRegularCall(U, &RFI);

      OffloadArray OffloadArrays[3];
      if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))

      LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));

      bool WasSplit = false;
      Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
      if (WaitMovementPoint)
        WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);

    if (OMPInfoCache.runtimeFnsAvailable(
            {OMPRTL___tgt_target_data_begin_mapper_issue,
             OMPRTL___tgt_target_data_begin_mapper_wait}))
      RFI.foreachUse(SCC, SplitMemTransfers);

  void analysisGlobalization() {
    auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];

    auto CheckGlobalization = [&](Use &U, Function &Decl) {
      if (CallInst *CI = getCallIfRegularCall(U, &RFI)) {
        auto Remark = [&](OptimizationRemarkMissed ORM) {
              << "Found thread data sharing on the GPU. "
              << "Expect degraded performance due to data globalization.";

    RFI.foreachUse(SCC, CheckGlobalization);
  bool getValuesInOffloadArrays(CallInst &RuntimeCall,

    assert(OAs.size() == 3 && "Need space for three offload arrays!");

    Value *BasePtrsArg =

    if (!OAs[0].initialize(*BasePtrsArray, RuntimeCall))

    if (!OAs[1].initialize(*PtrsArray, RuntimeCall))

    if (!OAs[2].initialize(*SizesArray, RuntimeCall))

    assert(OAs.size() == 3 && "There are three offload arrays to debug!");

    std::string ValuesStr;
    raw_string_ostream Printer(ValuesStr);
    std::string Separator = " --- ";

    for (auto *BP : OAs[0].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_baseptrs: " << ValuesStr << "\n");

    for (auto *P : OAs[1].StoredValues) {

    for (auto *S : OAs[2].StoredValues) {
    LLVM_DEBUG(dbgs() << "\t\toffload_sizes: " << ValuesStr << "\n");
  Instruction *canBeMovedDownwards(CallInst &RuntimeCall) {

    bool IsWorthIt = false;

    return RuntimeCall.getParent()->getTerminator();

  bool splitTargetDataBeginRTC(CallInst &RuntimeCall,
                               Instruction &WaitMovementPoint) {

    auto &IRBuilder = OMPInfoCache.OMPBuilder;
    IRBuilder.Builder.SetInsertPoint(&Entry,
                                     Entry.getFirstNonPHIOrDbgOrAlloca());
        IRBuilder.AsyncInfo, nullptr, "handle");

    FunctionCallee IssueDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_issue);

    SmallVector<Value *, 16> Args;
    for (auto &Arg : RuntimeCall.args())
      Args.push_back(Arg.get());
    Args.push_back(Handle);

    OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);

    FunctionCallee WaitDecl = IRBuilder.getOrCreateRuntimeFunction(
        M, OMPRTL___tgt_target_data_begin_mapper_wait);

    Value *WaitParams[2] = {
                                        OffloadArray::DeviceIDArgNum),
        WaitDecl, WaitParams, "", WaitMovementPoint.getIterator());
    OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
  static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
                                    bool GlobalOnly, bool &SingleChoice) {
    if (CurrentIdent == NextIdent)
      return CurrentIdent;

      SingleChoice = !CurrentIdent;

  getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
                                 Function &F, bool GlobalOnly) {
    bool SingleChoice = true;
    Value *Ident = nullptr;

      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || &F != &Caller)
                                   true, SingleChoice);

    RFI.foreachUse(SCC, CombineIdentStruct);

    if (!Ident || !SingleChoice) {
      if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
            &F.getEntryBlock(), F.getEntryBlock().begin()));
      uint32_t SrcLocStrSize;
          OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
      Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
  bool deduplicateRuntimeCalls(Function &F,
                               OMPInformationCache::RuntimeFunctionInfo &RFI,
                               Value *ReplVal = nullptr) {
    auto *UV = RFI.getUseVector(F);
    if (!UV || UV->size() + (ReplVal != nullptr) < 2)

        dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
               << (ReplVal ? " with an existing value\n" : "\n") << "\n");

           "Unexpected replacement value!");

    auto CanBeMoved = [this](CallBase &CB) {
      unsigned NumArgs = CB.arg_size();
      if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
      for (unsigned U = 1; U < NumArgs; ++U)

        OMPInfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(F);

      for (Use *U : *UV) {
        if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
          if (!CanBeMoved(*CI))

      assert(IP && "Expected insertion point!");

      Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,

      CallInst *CI = getCallIfRegularCall(U, &RFI);
      if (!CI || CI == ReplVal || &F != &Caller)

      auto Remark = [&](OptimizationRemark OR) {
        return OR << "OpenMP runtime call "
                  << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated.";

      ++NumOpenMPRuntimeCallsDeduplicated;

    RFI.foreachUse(SCC, ReplaceAndDeleteCB);
  void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {

    auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
      if (!F.hasLocalLinkage())

      for (Use &U : F.uses()) {
        if (CallInst *CI = getCallIfRegularCall(U)) {
          if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
              getCallIfRegularCall(
                  *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))

    auto AddUserArgs = [&](Value &GTId) {
      for (Use &U : GTId.uses())
            if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))

    OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
        OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];

    GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
      if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))

    for (unsigned U = 0; U < GTIdArgs.size(); ++U)
      AddUserArgs(*GTIdArgs[U]);
  DenseMap<Function *, std::optional<Kernel>> UniqueKernelMap;

  Kernel getUniqueKernelFor(Function &F);

  Kernel getUniqueKernelFor(Instruction &I) {
    return getUniqueKernelFor(*I.getFunction());

  bool rewriteDeviceCodeStateMachine();

  template <typename RemarkKind, typename RemarkCallBack>
  void emitRemark(Instruction *I, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);
          return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I))
                 << " [" << RemarkName << "]";
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, I)); });

  template <typename RemarkKind, typename RemarkCallBack>
  void emitRemark(Function *F, StringRef RemarkName,
                  RemarkCallBack &&RemarkCB) const {
    auto &ORE = OREGetter(F);
          return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F))
                 << " [" << RemarkName << "]";
          [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, F)); });

  SmallVectorImpl<Function *> &SCC;

  CallGraphUpdater &CGUpdater;

  OptimizationRemarkGetter OREGetter;

  OMPInformationCache &OMPInfoCache;

  bool runAttributor(bool IsModulePass) {

    registerAAs(IsModulePass);

                      << " functions, result: " << Changed << ".\n");

    if (Changed == ChangeStatus::CHANGED)
      OMPInfoCache.invalidateAnalyses();

    return Changed == ChangeStatus::CHANGED;

  void registerAAs(bool IsModulePass);

  static void registerAAsForFunction(Attributor &A, const Function &F);

  if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
      !OMPInfoCache.CGSCC->contains(&F))

  std::optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
      return *CachedKernel;

      return *CachedKernel;

  CachedKernel = nullptr;
  if (!F.hasLocalLinkage()) {

    auto Remark = [&](OptimizationRemarkAnalysis ORA) {
      return ORA << "Potentially unknown OpenMP target region caller.";

  auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
      if (Cmp->isEquality())
        return getUniqueKernelFor(*Cmp);
      if (CB->isCallee(&U))
        return getUniqueKernelFor(*CB);

      OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
          OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
      if (OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI))
        return getUniqueKernelFor(*CB);

  SmallPtrSet<Kernel, 2> PotentialKernels;
  OMPInformationCache::foreachUse(F, [&](const Use &U) {
    PotentialKernels.insert(GetUniqueKernelForUse(U));

  if (PotentialKernels.size() == 1)
    K = *PotentialKernels.begin();

  UniqueKernelMap[&F] = K;
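/// Rewrite the generic-mode state machine of a kernel: when an outlined
/// parallel region is only reachable from a single kernel, its function
/// pointer uses in the state machine are replaced by a unique global ID.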
bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
  OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
      OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];

  if (!KernelParallelRFI)

  for (Function *F : SCC) {

    bool UnknownUse = false;
    bool KernelParallelUse = false;
    unsigned NumDirectCalls = 0;

    OMPInformationCache::foreachUse(*F, [&](Use &U) {
        if (CB->isCallee(&U)) {

        ToBeReplacedStateMachineUses.push_back(&U);

          OpenMPOpt::getCallIfRegularCall(*U.getUser(), &KernelParallelRFI);
      const unsigned int WrapperFunctionArgNo = 6;
      if (!KernelParallelUse && CI &&
        KernelParallelUse = true;
        ToBeReplacedStateMachineUses.push_back(&U);

    if (!KernelParallelUse)

    if (UnknownUse || NumDirectCalls != 1 ||
        ToBeReplacedStateMachineUses.size() > 2) {
      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is used in "
                   << (UnknownUse ? "unknown" : "unexpected")
                   << " ways. Will not attempt to rewrite the state machine.";

      auto Remark = [&](OptimizationRemarkAnalysis ORA) {
        return ORA << "Parallel region is not called from a unique kernel. "
                      "Will not attempt to rewrite the state machine.";

    Type *Int8Ty = Type::getInt8Ty(M.getContext());
    auto *ID = new GlobalVariable(

    for (Use *U : ToBeReplacedStateMachineUses)
                                                 ID, U->get()->getType()));

    ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
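/// Abstract attribute that tracks the values of OpenMP internal control
/// variables (ICVs) and provides replacement values for getter calls.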
struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  bool isAssumedTracked() const { return getAssumed(); }

  bool isKnownTracked() const { return getAssumed(); }

  static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);

                                                  const Instruction *I,
                                                  Attributor &A) const {
    return std::nullopt;

  virtual std::optional<Value *>

  StringRef getName() const override { return "AAICVTracker"; }

  const char *getIdAddr() const override { return &ID; }

  static bool classof(const AbstractAttribute *AA) {

  static const char ID;
struct AAICVTrackerFunction : public AAICVTracker {
  AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunction";

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

            InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

      auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];

      auto &ValuesMap = ICVReplacementValuesMap[ICV];
        CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);

        if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
          HasChanged = ChangeStatus::CHANGED;

        std::optional<Value *> ReplVal = getValueForCall(A, I, ICV);
        if (ReplVal && ValuesMap.insert(std::make_pair(&I, *ReplVal)).second)
          HasChanged = ChangeStatus::CHANGED;

      SetterRFI.foreachUse(TrackValues, F);

      bool UsedAssumedInformation = false;
      A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
                                UsedAssumedInformation,

      if (HasChanged == ChangeStatus::CHANGED)
        ValuesMap.try_emplace(Entry);

  std::optional<Value *> getValueForCall(Attributor &A, const Instruction &I,

    if (!CB || CB->hasFnAttr("no_openmp") ||
        CB->hasFnAttr("no_openmp_routines") ||
        CB->hasFnAttr("no_openmp_constructs"))
      return std::nullopt;

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
    auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
    auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
    Function *CalledFunction = CB->getCalledFunction();

    if (CalledFunction == nullptr)
    if (CalledFunction == GetterRFI.Declaration)
      return std::nullopt;
    if (CalledFunction == SetterRFI.Declaration) {
      if (ICVReplacementValuesMap[ICV].count(&I))
        return ICVReplacementValuesMap[ICV].lookup(&I);

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
    if (ICVTrackingAA->isAssumedTracked()) {
      std::optional<Value *> URV =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

  std::optional<Value *>
    return std::nullopt;

                                             const Instruction *I,
                                             Attributor &A) const override {
    const auto &ValuesMap = ICVReplacementValuesMap[ICV];
    if (ValuesMap.count(I))
      return ValuesMap.lookup(I);

    SmallPtrSet<const Instruction *, 16> Visited;
    std::optional<Value *> ReplVal;

    while (!Worklist.empty()) {
      if (!Visited.insert(CurrInst).second)

      if (ValuesMap.count(CurrInst)) {
        std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
          ReplVal = NewReplVal;
        if (ReplVal != NewReplVal)

        std::optional<Value *> NewReplVal = getValueForCall(A, *CurrInst, ICV);
          ReplVal = NewReplVal;
        if (ReplVal != NewReplVal)

      if (CurrBB == I->getParent() && ReplVal)

        if (const Instruction *Terminator = Pred->getTerminator())
struct AAICVTrackerFunctionReturned : AAICVTracker {
  AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerFunctionReturned";

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

            InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  std::optional<Value *>
    return ICVReplacementValuesMap[ICV];

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> UniqueICVValue;

        std::optional<Value *> NewReplVal =
            ICVTrackingAA->getReplacementValue(ICV, &I, A);

        if (UniqueICVValue && UniqueICVValue != NewReplVal)

        UniqueICVValue = NewReplVal;

      bool UsedAssumedInformation = false;
      if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret},
                                     UsedAssumedInformation,
        UniqueICVValue = nullptr;

      if (UniqueICVValue == ReplVal)

      ReplVal = UniqueICVValue;
      Changed = ChangeStatus::CHANGED;
struct AAICVTrackerCallSite : AAICVTracker {
  AAICVTrackerCallSite(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

    assert(getAnchorScope() && "Expected anchor function");

    auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());

      auto ICVInfo = OMPInfoCache.ICVs[ICV];
      auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
      if (Getter.Declaration == getAssociatedFunction()) {
        AssociatedICV = ICVInfo.Kind;

    indicatePessimisticFixpoint();

    if (!ReplVal || !*ReplVal)
      return ChangeStatus::UNCHANGED;

    A.deleteAfterManifest(*getCtxI());

    return ChangeStatus::CHANGED;

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSite";

  void trackStatistics() const override {}

  std::optional<Value *> ReplVal;

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

    std::optional<Value *> NewReplVal =
        ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(), A);

    if (ReplVal == NewReplVal)
      return ChangeStatus::UNCHANGED;

    ReplVal = NewReplVal;
    return ChangeStatus::CHANGED;

  std::optional<Value *>
struct AAICVTrackerCallSiteReturned : AAICVTracker {
  AAICVTrackerCallSiteReturned(const IRPosition &IRP, Attributor &A)
      : AAICVTracker(IRP, A) {}

  const std::string getAsStr(Attributor *) const override {
    return "ICVTrackerCallSiteReturned";

  void trackStatistics() const override {}

    return ChangeStatus::UNCHANGED;

            InternalControlVar::ICV___last>
      ICVReplacementValuesMap;

  std::optional<Value *>
    return ICVReplacementValuesMap[ICV];

    const auto *ICVTrackingAA = A.getAAFor<AAICVTracker>(
        DepClassTy::REQUIRED);

    if (!ICVTrackingAA->isAssumedTracked())
      return indicatePessimisticFixpoint();

      std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
      std::optional<Value *> NewReplVal =
          ICVTrackingAA->getUniqueReplacementValue(ICV);

      if (ReplVal == NewReplVal)

      ReplVal = NewReplVal;
      Changed = ChangeStatus::CHANGED;

static bool hasFunctionEndAsUniqueSuccessor(const BasicBlock *BB) {
         hasFunctionEndAsUniqueSuccessor(Successor);
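/// Abstract attribute that computes, per basic block and call site, whether
/// execution is restricted to the initial thread and whether it is only
/// reached from (and reaches) aligned barriers; this information is used to
/// eliminate redundant barriers.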
struct AAExecutionDomainFunction : public AAExecutionDomain {
  AAExecutionDomainFunction(const IRPosition &IRP, Attributor &A)
      : AAExecutionDomain(IRP, A) {}

  ~AAExecutionDomainFunction() override { delete RPOT; }

    assert(F && "Expected anchor function");
    RPOT = new ReversePostOrderTraversal<Function *>(F);

  const std::string getAsStr(Attributor *) const override {
    unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
    for (auto &It : BEDMap) {
      InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
      AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
                       It.getSecond().IsReachingAlignedBarrierOnly;
    return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) + "/" +
           std::to_string(AlignedBlocks) + " of " +
           std::to_string(TotalBlocks) +
           " executed by initial thread / aligned";

  void trackStatistics() const override {}

    for (const BasicBlock &BB : *getAnchorScope()) {
      if (!isExecutedByInitialThreadOnly(BB))
      dbgs() << TAG << " Basic block @" << getAnchorScope()->getName() << " "
             << BB.getName() << " is executed by a single thread.\n";

    SmallPtrSet<CallBase *, 16> DeletedBarriers;
    auto HandleAlignedBarrier = [&](CallBase *CB) {
      const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[nullptr];
      if (!ED.IsReachedFromAlignedBarrierOnly ||
          ED.EncounteredNonLocalSideEffect)
      if (!ED.EncounteredAssumes.empty() && !A.isModulePass())

        DeletedBarriers.insert(CB);
        A.deleteAfterManifest(*CB);
        ++NumBarriersEliminated;
        Changed = ChangeStatus::CHANGED;
      } else if (!ED.AlignedBarriers.empty()) {
        Changed = ChangeStatus::CHANGED;
                                             ED.AlignedBarriers.end());
        SmallSetVector<CallBase *, 16> Visited;
        while (!Worklist.empty()) {
          if (!Visited.insert(LastCB))
          if (!hasFunctionEndAsUniqueSuccessor(LastCB->getParent()))
          if (!DeletedBarriers.count(LastCB)) {
            ++NumBarriersEliminated;
            A.deleteAfterManifest(*LastCB);
          const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
          Worklist.append(LastED.AlignedBarriers.begin(),
                          LastED.AlignedBarriers.end());

      if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
        for (auto *AssumeCB : ED.EncounteredAssumes)
          A.deleteAfterManifest(*AssumeCB);

    for (auto *CB : AlignedBarriers)
      HandleAlignedBarrier(CB);

    HandleAlignedBarrier(nullptr);
  bool isNoOpFence(const FenceInst &FI) const override {
    return getState().isValidState() && !NonNoOpFences.count(&FI);

  mergeInPredecessorBarriersAndAssumptions(Attributor &A, ExecutionDomainTy &ED,
                                           const ExecutionDomainTy &PredED);

  bool mergeInPredecessor(Attributor &A, ExecutionDomainTy &ED,
                          const ExecutionDomainTy &PredED,
                          bool InitialEdgeOnly = false);

  bool handleCallees(Attributor &A, ExecutionDomainTy &EntryBBED);

  bool isExecutedByInitialThreadOnly(const BasicBlock &BB) const override {
    if (!isValidState())
    assert(BB.getParent() == getAnchorScope() && "Block is out of scope!");
    return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;

  bool isExecutedInAlignedRegion(Attributor &A,
                                 const Instruction &I) const override {
    assert(I.getFunction() == getAnchorScope() &&
           "Instruction is out of scope!");
    if (!isValidState())

    bool ForwardIsOk = true;
      if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
      const auto &It = CEDMap.find({CB, PRE});
      if (It == CEDMap.end())
      if (!It->getSecond().IsReachingAlignedBarrierOnly)
        ForwardIsOk = false;

    if (!CurI && !BEDMap.lookup(I.getParent()).IsReachingAlignedBarrierOnly)
      ForwardIsOk = false;

      if (CB != &I && AlignedBarriers.contains(const_cast<CallBase *>(CB)))
      const auto &It = CEDMap.find({CB, POST});
      if (It == CEDMap.end())
      if (It->getSecond().IsReachedFromAlignedBarrierOnly)

      return BEDMap.lookup(nullptr).IsReachedFromAlignedBarrierOnly;
    return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;

  ExecutionDomainTy getExecutionDomain(const BasicBlock &BB) const override {
           "No request should be made against an invalid state!");
    return BEDMap.lookup(&BB);

  std::pair<ExecutionDomainTy, ExecutionDomainTy>
  getExecutionDomain(const CallBase &CB) const override {
           "No request should be made against an invalid state!");
    return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};

  ExecutionDomainTy getFunctionExecutionDomain() const override {
           "No request should be made against an invalid state!");
    return InterProceduralED;

  static bool isInitialThreadOnlyEdge(Attributor &A, BranchInst *Edge,
                                      BasicBlock &SuccessorBB) {
    if (!Edge || !Edge->isConditional())
    if (Edge->getSuccessor(0) != &SuccessorBB)

    if (!Cmp || !Cmp->isTrueWhenEqual() || !Cmp->isEquality())

    if (C->isAllOnesValue()) {
      auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
      auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
      CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
      ConstantStruct *KernelEnvC =
      ConstantInt *ExecModeC =
          KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);

      if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
      if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)

  ExecutionDomainTy InterProceduralED;

  DenseMap<const BasicBlock *, ExecutionDomainTy> BEDMap;
  DenseMap<PointerIntPair<const CallBase *, 1, Direction>, ExecutionDomainTy>

  SmallSetVector<CallBase *, 16> AlignedBarriers;

  ReversePostOrderTraversal<Function *> *RPOT = nullptr;

  static bool setAndRecord(bool &R, bool V) {

  SmallPtrSet<const FenceInst *, 8> NonNoOpFences;
void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED) {
  for (auto *EA : PredED.EncounteredAssumes)
    ED.addAssumeInst(A, *EA);

  for (auto *AB : PredED.AlignedBarriers)
    ED.addAlignedBarrier(A, *AB);

bool AAExecutionDomainFunction::mergeInPredecessor(
    Attributor &A, ExecutionDomainTy &ED, const ExecutionDomainTy &PredED,
    bool InitialEdgeOnly) {

      setAndRecord(ED.IsExecutedByInitialThreadOnly,
                   InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
                                       ED.IsExecutedByInitialThreadOnly));

  Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
                          ED.IsReachedFromAlignedBarrierOnly &&
                              PredED.IsReachedFromAlignedBarrierOnly);
  Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
                          ED.EncounteredNonLocalSideEffect |
                              PredED.EncounteredNonLocalSideEffect);

  if (ED.IsReachedFromAlignedBarrierOnly)
    mergeInPredecessorBarriersAndAssumptions(A, ED, PredED);
    ED.clearAssumeInstAndAlignedBarriers();

bool AAExecutionDomainFunction::handleCallees(Attributor &A,
                                              ExecutionDomainTy &EntryBBED) {

  auto PredForCallSite = [&](AbstractCallSite ACS) {
    const auto *EDAA = A.getAAFor<AAExecutionDomain>(
        DepClassTy::OPTIONAL);
    if (!EDAA || !EDAA->getState().isValidState())
        EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));

  ExecutionDomainTy ExitED;
  bool AllCallSitesKnown;
  if (A.checkForAllCallSites(PredForCallSite, *this,
                             AllCallSitesKnown)) {
    for (const auto &[CSInED, CSOutED] : CallSiteEDs) {
      mergeInPredecessor(A, EntryBBED, CSInED);
      ExitED.IsReachingAlignedBarrierOnly &=
          CSOutED.IsReachingAlignedBarrierOnly;

      EntryBBED.IsExecutedByInitialThreadOnly = false;
      EntryBBED.IsReachedFromAlignedBarrierOnly = true;
      EntryBBED.EncounteredNonLocalSideEffect = false;
      ExitED.IsReachingAlignedBarrierOnly = false;

    EntryBBED.IsExecutedByInitialThreadOnly = false;
    EntryBBED.IsReachedFromAlignedBarrierOnly = false;
    EntryBBED.EncounteredNonLocalSideEffect = true;
    ExitED.IsReachingAlignedBarrierOnly = false;

  auto &FnED = BEDMap[nullptr];
  Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
                          FnED.IsReachedFromAlignedBarrierOnly &
                              EntryBBED.IsReachedFromAlignedBarrierOnly);
  Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
                          FnED.IsReachingAlignedBarrierOnly &
                              ExitED.IsReachingAlignedBarrierOnly);
  Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
                          EntryBBED.IsExecutedByInitialThreadOnly);
ChangeStatus AAExecutionDomainFunction::updateImpl(Attributor &A) {

  auto HandleAlignedBarrier = [&](CallBase &CB, ExecutionDomainTy &ED) {
    Changed |= AlignedBarriers.insert(&CB);
    auto &CallInED = CEDMap[{&CB, PRE}];
    Changed |= mergeInPredecessor(A, CallInED, ED);
    CallInED.IsReachingAlignedBarrierOnly = true;
    ED.EncounteredNonLocalSideEffect = false;
    ED.IsReachedFromAlignedBarrierOnly = true;
    ED.clearAssumeInstAndAlignedBarriers();
    ED.addAlignedBarrier(A, CB);
    auto &CallOutED = CEDMap[{&CB, POST}];
    Changed |= mergeInPredecessor(A, CallOutED, ED);

      A.getAAFor<AAIsDead>(*this, getIRPosition(), DepClassTy::OPTIONAL);

  for (auto &RIt : *RPOT) {

    bool IsEntryBB = &BB == &EntryBB;
    bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
    bool IsExplicitlyAligned = IsEntryBB && IsKernel;
    ExecutionDomainTy ED;

      if (LivenessAA && LivenessAA->isAssumedDead(&BB))
        if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
        bool InitialEdgeOnly = isInitialThreadOnlyEdge(
        mergeInPredecessor(A, ED, BEDMap[PredBB], InitialEdgeOnly);

    for (Instruction &I : BB) {
      bool UsedAssumedInformation;
      if (A.isAssumedDead(I, *this, LivenessAA, UsedAssumedInformation,
                          false, DepClassTy::OPTIONAL,

        ED.addAssumeInst(A, *AI);

        if (II->isAssumeLikeIntrinsic())

        if (!ED.EncounteredNonLocalSideEffect) {
          if (ED.IsReachedFromAlignedBarrierOnly)
          case AtomicOrdering::NotAtomic:
          case AtomicOrdering::Unordered:
          case AtomicOrdering::Monotonic:
          case AtomicOrdering::Acquire:
          case AtomicOrdering::Release:
          case AtomicOrdering::AcquireRelease:
          case AtomicOrdering::SequentiallyConsistent:
        NonNoOpFences.insert(FI);

        bool IsAlignedBarrier =

        AlignedBarrierLastInBlock &= IsNoSync;
        IsExplicitlyAligned &= IsNoSync;

        if (IsAlignedBarrier) {
          HandleAlignedBarrier(*CB, ED);
          AlignedBarrierLastInBlock = true;
          IsExplicitlyAligned = true;

          if (!ED.EncounteredNonLocalSideEffect &&
            ED.EncounteredNonLocalSideEffect = true;
            ED.IsReachedFromAlignedBarrierOnly = false;

        auto &CallInED = CEDMap[{CB, PRE}];
        Changed |= mergeInPredecessor(A, CallInED, ED);

        if (!IsNoSync && Callee && !Callee->isDeclaration()) {
          const auto *EDAA = A.getAAFor<AAExecutionDomain>(
          if (EDAA && EDAA->getState().isValidState()) {
            const auto &CalleeED = EDAA->getFunctionExecutionDomain();
            ED.IsReachedFromAlignedBarrierOnly =
                CalleeED.IsReachedFromAlignedBarrierOnly;
            AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
            if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
              ED.EncounteredNonLocalSideEffect |=
                  CalleeED.EncounteredNonLocalSideEffect;
              ED.EncounteredNonLocalSideEffect =
                  CalleeED.EncounteredNonLocalSideEffect;
            if (!CalleeED.IsReachingAlignedBarrierOnly) {
              setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);
            if (CalleeED.IsReachedFromAlignedBarrierOnly)
              mergeInPredecessorBarriersAndAssumptions(A, ED, CalleeED);
            auto &CallOutED = CEDMap[{CB, POST}];
            Changed |= mergeInPredecessor(A, CallOutED, ED);

        ED.IsReachedFromAlignedBarrierOnly = false;
        Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

        AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;

        auto &CallOutED = CEDMap[{CB, POST}];
        Changed |= mergeInPredecessor(A, CallOutED, ED);

      if (!I.mayHaveSideEffects() && !I.mayReadFromMemory())
      const auto *MemAA = A.getAAFor<AAMemoryLocation>(

      if (MemAA && MemAA->getState().isValidState() &&
          MemAA->checkForAllAccessesToMemoryKind(

      auto &InfoCache = A.getInfoCache();
      if (!I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(I))

        if (LI->hasMetadata(LLVMContext::MD_invariant_load))

      if (!ED.EncounteredNonLocalSideEffect &&
        ED.EncounteredNonLocalSideEffect = true;

    bool IsEndAndNotReachingAlignedBarriersOnly = false;
        !BB.getTerminator()->getNumSuccessors()) {

      Changed |= mergeInPredecessor(A, InterProceduralED, ED);

      auto &FnED = BEDMap[nullptr];
      if (IsKernel && !IsExplicitlyAligned)
        FnED.IsReachingAlignedBarrierOnly = false;
      Changed |= mergeInPredecessor(A, FnED, ED);

      if (!FnED.IsReachingAlignedBarrierOnly) {
        IsEndAndNotReachingAlignedBarriersOnly = true;
        SyncInstWorklist.push_back(BB.getTerminator());
        auto &BBED = BEDMap[&BB];
        Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly, false);

    ExecutionDomainTy &StoredED = BEDMap[&BB];
    ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
                                      !IsEndAndNotReachingAlignedBarriersOnly;

    if (ED.IsExecutedByInitialThreadOnly !=
            StoredED.IsExecutedByInitialThreadOnly ||
        ED.IsReachedFromAlignedBarrierOnly !=
            StoredED.IsReachedFromAlignedBarrierOnly ||
        ED.EncounteredNonLocalSideEffect !=
            StoredED.EncounteredNonLocalSideEffect)

    StoredED = std::move(ED);

  SmallSetVector<BasicBlock *, 16> Visited;
  while (!SyncInstWorklist.empty()) {

    bool HitAlignedBarrierOrKnownEnd = false;

        auto &CallOutED = CEDMap[{CB, POST}];
        Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly, false);
        auto &CallInED = CEDMap[{CB, PRE}];
        HitAlignedBarrierOrKnownEnd =
            AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
        if (HitAlignedBarrierOrKnownEnd)
        Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly, false);

    if (HitAlignedBarrierOrKnownEnd)

      if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
      if (!Visited.insert(PredBB))
      auto &PredED = BEDMap[PredBB];
      if (setAndRecord(PredED.IsReachingAlignedBarrierOnly, false)) {
        SyncInstWorklist.push_back(PredBB->getTerminator());
    if (SyncBB != &EntryBB)
      setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly, false);

  return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
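/// Abstract attribute that replaces __kmpc_alloc_shared/__kmpc_free_shared
/// pairs with a static shared-memory buffer when the allocation size is a
/// known constant.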
struct AAHeapToShared : public StateWrapper<BooleanState, AbstractAttribute> {
  using Base = StateWrapper<BooleanState, AbstractAttribute>;
  AAHeapToShared(const IRPosition &IRP, Attributor &A) : Base(IRP) {}

  static AAHeapToShared &createForPosition(const IRPosition &IRP,

  virtual bool isAssumedHeapToShared(CallBase &CB) const = 0;

  virtual bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const = 0;

  StringRef getName() const override { return "AAHeapToShared"; }

  const char *getIdAddr() const override { return &ID; }

  static bool classof(const AbstractAttribute *AA) {

  static const char ID;
3407struct AAHeapToSharedFunction :
public AAHeapToShared {
3408 AAHeapToSharedFunction(
const IRPosition &IRP, Attributor &
A)
3409 : AAHeapToShared(IRP,
A) {}
3411   const std::string getAsStr(Attributor *) const override {
3412     return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3413            " malloc calls eligible.";
3417   void trackStatistics() const override {}
3421   void findPotentialRemovedFreeCalls(Attributor &A) {
3422     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3423     auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3425     PotentialRemovedFreeCalls.clear();
3427     for (CallBase *CB : MallocCalls) {
3429       for (auto *U : CB->users()) {
3431         if (C && C->getCalledFunction() == FreeRFI.Declaration)
3435       if (FreeCalls.size() != 1)
3438       PotentialRemovedFreeCalls.insert(FreeCalls.front());
3444       indicatePessimisticFixpoint();
3448     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3449     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3450     if (!RFI.Declaration)
3454         [](const IRPosition &, const AbstractAttribute *,
3455            bool &) -> std::optional<Value *> { return nullptr; };
3458     for (User *U : RFI.Declaration->users())
3462         MallocCalls.insert(CB);
3467     findPotentialRemovedFreeCalls(A);
3470   bool isAssumedHeapToShared(CallBase &CB) const override {
3471     return isValidState() && MallocCalls.count(&CB);
3474   bool isAssumedHeapToSharedRemovedFree(CallBase &CB) const override {
3475     return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3479     if (MallocCalls.empty())
3480       return ChangeStatus::UNCHANGED;
3482     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3483     auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3487                                            DepClassTy::OPTIONAL);
3490     for (CallBase *CB : MallocCalls) {
3492       if (HS && HS->isAssumedHeapToStack(*CB))
3497       for (auto *U : CB->users()) {
3499         if (C && C->getCalledFunction() == FreeCall.Declaration)
3502       if (FreeCalls.size() != 1)
3509                    << " with shared memory."
3510                    << " Shared memory usage is limited to "
3516                         << " with " << AllocSize->getZExtValue()
3517                         << " bytes of shared memory\n");
3522       Type *Int8Ty = Type::getInt8Ty(M->getContext());
3523       Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3524       auto *SharedMem = new GlobalVariable(
3528           static_cast<unsigned>(AddressSpace::Shared));
3530           SharedMem, PointerType::getUnqual(M->getContext()));
3532       auto Remark = [&](OptimizationRemark OR) {
3533         return OR << "Replaced globalized variable with "
3534                   << ore::NV("SharedMemory", AllocSize->getZExtValue())
3535                   << (AllocSize->isOne() ? " byte " : " bytes ")
3536                   << "of shared memory.";
3538       A.emitRemark<OptimizationRemark>(CB, "OMP111", Remark);
3540       MaybeAlign Alignment = CB->getRetAlign();
3542              "HeapToShared on allocation without alignment attribute");
3543       SharedMem->setAlignment(*Alignment);
3546       A.deleteAfterManifest(*CB);
3547       A.deleteAfterManifest(*FreeCalls.front());
3549       SharedMemoryUsed += AllocSize->getZExtValue();
3550       NumBytesMovedToSharedMemory = SharedMemoryUsed;
3551       Changed = ChangeStatus::CHANGED;
3558     if (MallocCalls.empty())
3559       return indicatePessimisticFixpoint();
3560     auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3561     auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3562     if (!RFI.Declaration)
3563       return ChangeStatus::UNCHANGED;
3567     auto NumMallocCalls = MallocCalls.size();
3570     for (User *U : RFI.Declaration->users()) {
3572         if (CB->getCaller() != F)
3574         if (!MallocCalls.count(CB))
3577           MallocCalls.remove(CB);
3580         const auto *ED = A.getAAFor<AAExecutionDomain>(
3582         if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3583           MallocCalls.remove(CB);
3587     findPotentialRemovedFreeCalls(A);
3589     if (NumMallocCalls != MallocCalls.size())
3590       return ChangeStatus::CHANGED;
3592     return ChangeStatus::UNCHANGED;
3596   SmallSetVector<CallBase *, 4> MallocCalls;
3598   SmallPtrSet<CallBase *, 4> PotentialRemovedFreeCalls;
3600   unsigned SharedMemoryUsed = 0;
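// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// AAHeapToSharedFunction replaces a __kmpc_alloc_shared / __kmpc_free_shared
// pair of constant size with a static buffer in the GPU's shared address
// space.  The standalone helper below mirrors the eligibility checks visible
// above (constant size, exactly one matching free, executed by the initial
// thread only, global shared-memory budget not exceeded).  All names here are
// hypothetical.
#include <cstdint>
#include <optional>

struct AllocationSite {
  std::optional<uint64_t> ConstantSize; // nullopt if the size is dynamic
  unsigned NumFreeCalls;                // matching __kmpc_free_shared uses
  bool ExecutedByInitialThreadOnly;     // from the execution-domain analysis
};

// Returns true and charges the budget if the allocation may be rewritten to a
// shared-memory buffer; mirrors the SharedMemoryUsed accounting above.
bool mayMoveToSharedMemory(const AllocationSite &AS, uint64_t &UsedBudget,
                           uint64_t SharedMemoryLimit) {
  if (!AS.ConstantSize)
    return false;                       // need a fixed-size buffer
  if (AS.NumFreeCalls != 1)
    return false;                       // must see the unique matching free
  if (!AS.ExecutedByInitialThreadOnly)
    return false;                       // otherwise threads would race
  if (UsedBudget + *AS.ConstantSize > SharedMemoryLimit)
    return false;                       // respect -openmp-opt-shared-limit
  UsedBudget += *AS.ConstantSize;       // NumBytesMovedToSharedMemory analog
  return true;
}
// ---------------------------------------------------------------------------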
3603 struct AAKernelInfo : public StateWrapper<KernelInfoState, AbstractAttribute> {
3604   using Base = StateWrapper<KernelInfoState, AbstractAttribute>;
3605   AAKernelInfo(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
3609   static bool requiresCalleeForCallBase() { return false; }
3612   void trackStatistics() const override {}
3615   const std::string getAsStr(Attributor *) const override {
3616     if (!isValidState())
3618     return std::string(SPMDCompatibilityTracker.isAssumed() ? "SPMD"
3620            std::string(SPMDCompatibilityTracker.isAtFixpoint() ? " [FIX]"
3622            std::string(" #PRs: ") +
3623            (ReachedKnownParallelRegions.isValidState()
3624                 ? std::to_string(ReachedKnownParallelRegions.size())
3626            ", #Unknown PRs: " +
3627            (ReachedUnknownParallelRegions.isValidState()
3628                 ? std::to_string(ReachedUnknownParallelRegions.size())
3630            ", #Reaching Kernels: " +
3631            (ReachingKernelEntries.isValidState()
3632                 ? std::to_string(ReachingKernelEntries.size())
3635            (ParallelLevels.isValidState()
3636                 ? std::to_string(ParallelLevels.size())
3638            ", NestedPar: " + (NestedParallelism ? "yes" : "no");
3642   static AAKernelInfo &createForPosition(const IRPosition &IRP, Attributor &A);
3645   StringRef getName() const override { return "AAKernelInfo"; }
3648   const char *getIdAddr() const override { return &ID; }
3651   static bool classof(const AbstractAttribute *AA) {
3655   static const char ID;
3660 struct AAKernelInfoFunction : AAKernelInfo {
3661   AAKernelInfoFunction(const IRPosition &IRP, Attributor &A)
3662       : AAKernelInfo(IRP, A) {}
3664 SmallPtrSet<Instruction *, 4> GuardedInstructions;
3666 SmallPtrSetImpl<Instruction *> &getGuardedInstructions() {
3667 return GuardedInstructions;
3670 void setConfigurationOfKernelEnvironment(ConstantStruct *ConfigC) {
3672 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3673     assert(NewKernelEnvC && "Failed to create new kernel environment");
3677#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3678 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3679 ConstantStruct *ConfigC = \
3680 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3681 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3682 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3683 assert(NewConfigC && "Failed to create new configuration environment"); \
3684 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3695#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
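// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above: the
// setter macro above is an X-macro that stamps out one strongly named setter
// per configuration member.  The self-contained example below shows the same
// pattern on a plain struct; the struct name and member list are made up for
// this sketch.
#include <cstdio>

struct Configuration {
  int UseGenericStateMachine = 1;
  int MayUseNestedParallelism = 1;
  int ExecMode = 0;
};

struct ConfigHolder {
  Configuration Config;

#define CONFIGURATION_SETTER(MEMBER)                                           \
  void set##MEMBER(int NewVal) { Config.MEMBER = NewVal; }

  CONFIGURATION_SETTER(UseGenericStateMachine)
  CONFIGURATION_SETTER(MayUseNestedParallelism)
  CONFIGURATION_SETTER(ExecMode)

#undef CONFIGURATION_SETTER
};

int main() {
  ConfigHolder H;
  // Same shape as the set<Member>OfKernelEnvironment(...) setters above.
  H.setUseGenericStateMachine(0);
  H.setExecMode(2);
  std::printf("%d %d %d\n", H.Config.UseGenericStateMachine,
              H.Config.MayUseNestedParallelism, H.Config.ExecMode);
  return 0;
}
// ---------------------------------------------------------------------------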
3702   auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
3706   OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3707       OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3708   OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3709       OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3713   auto StoreCallBase = [](Use &U,
3714                           OMPInformationCache::RuntimeFunctionInfo &RFI,
3716     CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3718            "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3720            "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3726     StoreCallBase(U, InitRFI, KernelInitCB);
3730   DeinitRFI.foreachUse(
3732     StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3738   if (!KernelInitCB || !KernelDeinitCB)
3742   ReachingKernelEntries.insert(Fn);
3743   IsKernelEntry = true;
3751   KernelConfigurationSimplifyCB =
3753       bool &UsedAssumedInformation) -> std::optional<Constant *> {
3754     if (!isAtFixpoint()) {
3757       UsedAssumedInformation = true;
3763   A.registerGlobalVariableSimplificationCallback(
3764       *KernelEnvGV, KernelConfigurationSimplifyCB);
3767   bool CanChangeToSPMD = OMPInfoCache.runtimeFnsAvailable(
3768       {OMPRTL___kmpc_get_hardware_thread_id_in_block,
3769        OMPRTL___kmpc_barrier_simple_spmd});
3773       KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3778     SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3782     SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3784   setExecModeOfKernelEnvironment(AssumedExecModeC);
3791   setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3794   auto [MinTeams, MaxTeams] =
3797   setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3799   setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3802       KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3803   ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3805   setMayUseNestedParallelismOfKernelEnvironment(
3806       AssumedMayUseNestedParallelismC);
3810       KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3813       ConstantInt::get(UseGenericStateMachineC->getIntegerType(), false);
3814   setUseGenericStateMachineOfKernelEnvironment(
3815       AssumedUseGenericStateMachineC);
3821     if (!OMPInfoCache.RFIs[RFKind].Declaration)
3823     A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3827   auto AddDependence = [](Attributor &A, const AAKernelInfo *KI,
3844     if (SPMDCompatibilityTracker.isValidState())
3845       return AddDependence(A, this, QueryingAA);
3847     if (!ReachedKnownParallelRegions.isValidState())
3848       return AddDependence(A, this, QueryingAA);
3854   RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3855                      CustomStateMachineUseCB);
3856   RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3857   RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3858                      CustomStateMachineUseCB);
3859   RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3860                      CustomStateMachineUseCB);
3861   RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3862                      CustomStateMachineUseCB);
3866     if (SPMDCompatibilityTracker.isAtFixpoint())
3873     if (!SPMDCompatibilityTracker.isValidState())
3874       return AddDependence(A, this, QueryingAA);
3877   RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3886     if (!SPMDCompatibilityTracker.isValidState())
3887       return AddDependence(A, this, QueryingAA);
3888     if (SPMDCompatibilityTracker.empty())
3889       return AddDependence(A, this, QueryingAA);
3890     if (!mayContainParallelRegion())
3891       return AddDependence(A, this, QueryingAA);
3894   RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3898   static std::string sanitizeForGlobalName(std::string S) {
3902       return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
3903                (C >= '0' && C <= '9') || C == '_');
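// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// sanitizeForGlobalName keeps [A-Za-z0-9_] and replaces every other character
// so the result is usable as a global symbol name.  A minimal standalone
// version, assuming '.' as the replacement character (the exact replacement
// used upstream is not shown in the listing above):
#include <algorithm>
#include <cstdio>
#include <string>

static std::string sanitizeForGlobalNameSketch(std::string S) {
  std::replace_if(
      S.begin(), S.end(),
      [](const char C) {
        return !((C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') ||
                 (C >= '0' && C <= '9') || C == '_');
      },
      '.');
  return S;
}

int main() {
  // E.g. names like "<value>.guarded.output.alloc" built from IR value names.
  std::printf("%s\n", sanitizeForGlobalNameSketch("foo<int>::bar#3").c_str());
  return 0;
}
// ---------------------------------------------------------------------------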
3914   if (!KernelInitCB || !KernelDeinitCB)
3915     return ChangeStatus::UNCHANGED;
3919   bool HasBuiltStateMachine = true;
3920   if (!changeToSPMDMode(A, Changed)) {
3922       HasBuiltStateMachine = buildCustomStateMachine(A, Changed);
3924       HasBuiltStateMachine = false;
3928   ConstantStruct *ExistingKernelEnvC =
3930   ConstantInt *OldUseGenericStateMachineVal =
3931       KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3932           ExistingKernelEnvC);
3933   if (!HasBuiltStateMachine)
3934     setUseGenericStateMachineOfKernelEnvironment(
3935         OldUseGenericStateMachineVal);
3938   GlobalVariable *KernelEnvGV =
3942     Changed = ChangeStatus::CHANGED;
3948 void insertInstructionGuardsHelper(Attributor &
A) {
3949 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3951 auto CreateGuardedRegion = [&](
Instruction *RegionStartI,
3953 LoopInfo *LI =
nullptr;
3954 DominatorTree *DT =
nullptr;
3955 MemorySSAUpdater *MSU =
nullptr;
3985 DT, LI, MSU,
"region.guarded.end");
3988 MSU,
"region.barrier");
3991 DT, LI, MSU,
"region.exit");
3993 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU,
"region.guarded");
3996 "Expected a different CFG");
3999 ParentBB, ParentBB->
getTerminator(), DT, LI, MSU,
"region.check.tid");
4002 A.registerManifestAddedBasicBlock(*RegionEndBB);
4003 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
4004 A.registerManifestAddedBasicBlock(*RegionExitBB);
4005 A.registerManifestAddedBasicBlock(*RegionStartBB);
4006 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4008 bool HasBroadcastValues =
false;
4011 for (Instruction &
I : *RegionStartBB) {
4013 for (Use &U :
I.uses()) {
4019 if (OutsideUses.
empty())
4022 HasBroadcastValues =
true;
4026 auto *SharedMem =
new GlobalVariable(
4027 M,
I.getType(),
false,
4029 sanitizeForGlobalName(
4030 (
I.getName() +
".guarded.output.alloc").str()),
4032 static_cast<unsigned>(AddressSpace::Shared));
4035 new StoreInst(&
I, SharedMem,
4038 LoadInst *LoadI =
new LoadInst(
4039 I.getType(), SharedMem,
I.getName() +
".guarded.output.load",
4043 for (Use *U : OutsideUses)
4044 A.changeUseAfterManifest(*U, *LoadI);
4047 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4052 OpenMPIRBuilder::LocationDescription Loc(
4053 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4055 uint32_t SrcLocStrSize;
4064 OpenMPIRBuilder::LocationDescription LocRegionCheckTid(
4065 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4067 FunctionCallee HardwareTidFn =
4069 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4073 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4075 OMPInfoCache.OMPBuilder.
Builder
4076 .
CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4081 FunctionCallee BarrierFn =
4083 M, OMPRTL___kmpc_barrier_simple_spmd);
4089 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4092 if (HasBroadcastValues) {
4097 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4101 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4102 SmallPtrSet<BasicBlock *, 8> Visited;
4103 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4105 if (!Visited.
insert(BB).second)
4111 while (++IP != IPEnd) {
4112 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4115 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4117 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4118 LastEffect =
nullptr;
4125 for (
auto &Reorder : Reorders)
4126 Reorder.first->moveBefore(Reorder.second->getIterator());
4131 for (Instruction *GuardedI : SPMDCompatibilityTracker) {
4133 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4136 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4139 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4142 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4143 for (Instruction &
I : *BB) {
4146 if (SPMDCompatibilityTracker.contains(&
I)) {
4147 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4148 if (GuardedRegionStart)
4149 GuardedRegionEnd = &
I;
4151 GuardedRegionStart = GuardedRegionEnd = &
I;
4158 if (GuardedRegionStart) {
4160 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4161 GuardedRegionStart =
nullptr;
4162 GuardedRegionEnd =
nullptr;
4167 for (
auto &GR : GuardedRegions)
4168 CreateGuardedRegion(GR.first, GR.second);
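// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// CreateGuardedRegion rewrites a side-effecting region of an SPMD kernel so
// that only thread 0 executes it, broadcasts any value the other threads
// still need through a shared-memory slot, and resynchronizes with a barrier.
// The function below shows the resulting control-flow shape; the intrinsics
// (hardware_thread_id, barrier_simple_spmd) and the global are hypothetical
// stand-ins for what the pass actually emits.
static int hardware_thread_id() { return 0; } // stub for this sketch
static void barrier_simple_spmd() {}          // stub for this sketch

static int guarded_output_alloc; // analog of the ".guarded.output.alloc"
                                 // global the pass places in shared memory

int guarded_region_example(int Input) {
  int Result;
  if (hardware_thread_id() == 0) { // region.check.tid -> region.guarded
    Result = Input * 2;            // the guarded, side-effecting work
    guarded_output_alloc = Result; // broadcast the value to the other threads
  }
  barrier_simple_spmd();           // region.barrier: everyone waits here
  Result = guarded_output_alloc;   // every thread reloads the broadcast value
  return Result;                   // region.exit
}
// ---------------------------------------------------------------------------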
4171 void forceSingleThreadPerWorkgroupHelper(Attributor &
A) {
4180 auto &Ctx = getAnchorValue().getContext();
4187 KernelInitCB->
getNextNode(),
"main.thread.user_code");
4192 A.registerManifestAddedBasicBlock(*InitBB);
4193 A.registerManifestAddedBasicBlock(*UserCodeBB);
4194 A.registerManifestAddedBasicBlock(*ReturnBB);
4203 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4204 FunctionCallee ThreadIdInBlockFn =
4206 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4209 CallInst *ThreadIdInBlock =
4211 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4217 ConstantInt::get(ThreadIdInBlock->
getType(), 0),
4218 "thread.is_main", InitBB);
4224 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4226 if (!SPMDCompatibilityTracker.isAssumed()) {
4227 for (Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4228 if (!NonCompatibleI)
4233 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4236 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4237 ORA <<
"Value has potential side effects preventing SPMD-mode "
4240 ORA <<
". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4241 "the called function to override";
4245 A.emitRemark<OptimizationRemarkAnalysis>(NonCompatibleI,
"OMP121",
4249 << *NonCompatibleI <<
"\n");
4261 Kernel = CB->getCaller();
4266 ConstantStruct *ExistingKernelEnvC =
4269 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4275 Changed = ChangeStatus::CHANGED;
4279 if (mayContainParallelRegion())
4280 insertInstructionGuardsHelper(
A);
4282 forceSingleThreadPerWorkgroupHelper(
A);
4287 "Initially non-SPMD kernel has SPMD exec mode!");
4288 setExecModeOfKernelEnvironment(
4292 ++NumOpenMPTargetRegionKernelsSPMD;
4294 auto Remark = [&](OptimizationRemark
OR) {
4295 return OR <<
"Transformed generic-mode kernel to SPMD-mode.";
4297 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP120",
Remark);
4307 if (!ReachedKnownParallelRegions.isValidState())
4310 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4311 if (!OMPInfoCache.runtimeFnsAvailable(
4312 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4313 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4314 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4317 ConstantStruct *ExistingKernelEnvC =
4324 ConstantInt *UseStateMachineC =
4325 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4326 ExistingKernelEnvC);
4327 ConstantInt *ModeC =
4328 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4333 if (UseStateMachineC->
isZero() ||
4337 Changed = ChangeStatus::CHANGED;
4340 setUseGenericStateMachineOfKernelEnvironment(
4347 if (!mayContainParallelRegion()) {
4348 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4350 auto Remark = [&](OptimizationRemark
OR) {
4351 return OR <<
"Removing unused state machine from generic-mode kernel.";
4353 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP130",
Remark);
4359 if (ReachedUnknownParallelRegions.empty()) {
4360 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4362 auto Remark = [&](OptimizationRemark
OR) {
4363 return OR <<
"Rewriting generic-mode kernel with a customized state "
4366 A.emitRemark<OptimizationRemark>(KernelInitCB,
"OMP131",
Remark);
4368 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4370 auto Remark = [&](OptimizationRemarkAnalysis
OR) {
4371 return OR <<
"Generic-mode kernel is executed with a customized state "
4372 "machine that requires a fallback.";
4374 A.emitRemark<OptimizationRemarkAnalysis>(KernelInitCB,
"OMP132",
Remark);
4377 for (CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4378 if (!UnknownParallelRegionCB)
4380 auto Remark = [&](OptimizationRemarkAnalysis ORA) {
4381 return ORA <<
"Call may contain unknown parallel regions. Use "
4382 <<
"`[[omp::assume(\"omp_no_parallelism\")]]` to "
4385 A.emitRemark<OptimizationRemarkAnalysis>(UnknownParallelRegionCB,
4420 auto &Ctx = getAnchorValue().getContext();
4424 BasicBlock *InitBB = KernelInitCB->getParent();
4426 KernelInitCB->getNextNode(),
"thread.user_code.check");
4430 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4432 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4434 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4437 Kernel, UserCodeEntryBB);
4440 Kernel, UserCodeEntryBB);
4442 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4443 A.registerManifestAddedBasicBlock(*InitBB);
4444 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4445 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4446 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4451 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4453 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4459 ConstantInt::getAllOnesValue(KernelInitCB->getType()),
4460 "thread.is_worker", InitBB);
4465 FunctionCallee BlockHwSizeFn =
4467 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4468 FunctionCallee WarpSizeFn =
4470 M, OMPRTL___kmpc_get_warp_size);
4471 CallInst *BlockHwSize =
4473 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4475 CallInst *WarpSize =
4477 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4480 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4484 "thread.is_main_or_worker", IsWorkerCheckBB);
4487 IsMainOrWorker, IsWorkerCheckBB);
4490 const DataLayout &
DL =
M.getDataLayout();
4491 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4493 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4498 OpenMPIRBuilder::LocationDescription(
4499 IRBuilder<>::InsertPoint(StateMachineBeginBB,
4500 StateMachineBeginBB->
end()),
4503 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4504 Value *GTid = KernelInitCB;
4506 FunctionCallee BarrierFn =
4508 M, OMPRTL___kmpc_barrier_simple_generic);
4511 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4515 (
unsigned int)AddressSpace::Generic) {
4516 WorkFnAI =
new AddrSpaceCastInst(
4517 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4518 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4522 FunctionCallee KernelParallelFn =
4524 M, OMPRTL___kmpc_kernel_parallel);
4526 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4527 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4529 Instruction *WorkFn =
new LoadInst(VoidPtrTy, WorkFnAI,
"worker.work_fn",
4530 StateMachineBeginBB);
4533 FunctionType *ParallelRegionFnTy = FunctionType::get(
4534 Type::getVoidTy(Ctx), {Type::getInt16Ty(Ctx), Type::getInt32Ty(Ctx)},
4540 StateMachineBeginBB);
4541 IsDone->setDebugLoc(DLoc);
4543 IsDone, StateMachineBeginBB)
4547 StateMachineDoneBarrierBB, IsActiveWorker,
4548 StateMachineIsActiveCheckBB)
4554 const unsigned int WrapperFunctionArgNo = 6;
4559 for (
int I = 0,
E = ReachedKnownParallelRegions.size();
I <
E; ++
I) {
4560 auto *CB = ReachedKnownParallelRegions[
I];
4562 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4564 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4565 StateMachineEndParallelBB);
4567 ->setDebugLoc(DLoc);
4573 Kernel, StateMachineEndParallelBB);
4574 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4575 A.registerManifestAddedBasicBlock(*PRNextBB);
4580 if (
I + 1 <
E || !ReachedUnknownParallelRegions.empty()) {
4583 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4591 StateMachineIfCascadeCurrentBB)
4593 StateMachineIfCascadeCurrentBB = PRNextBB;
4599 if (!ReachedUnknownParallelRegions.empty()) {
4600 StateMachineIfCascadeCurrentBB->
setName(
4601 "worker_state_machine.parallel_region.fallback.execute");
4603 StateMachineIfCascadeCurrentBB)
4604 ->setDebugLoc(DLoc);
4607 StateMachineIfCascadeCurrentBB)
4610 FunctionCallee EndParallelFn =
4612 M, OMPRTL___kmpc_kernel_end_parallel);
4613 CallInst *EndParallel =
4615 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4621 ->setDebugLoc(DLoc);
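// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above: the
// blocks created by buildCustomStateMachine (worker_state_machine.begin /
// .is_active.check / .parallel_region.* / .done.barrier / .finished) form the
// loop below.  The runtime entry points and the known parallel-region wrapper
// are stubbed out; in the real pass the if-cascade is emitted over
// ReachedKnownParallelRegions and the indirect call is kept only as a
// fallback for unknown regions.
using ParallelRegionFnTy = void (*)(short /*Unused*/, int /*GTid*/);

static void barrier_simple_generic() {}                   // stub
static bool kernel_parallel(ParallelRegionFnTy *WorkFn) { // stub
  *WorkFn = nullptr;
  return false;
}
static void kernel_end_parallel() {}                      // stub
static void known_parallel_region_wrapper(short, int) {}  // stub

void worker_state_machine(int GTid) {
  while (true) {                               // worker_state_machine.begin
    barrier_simple_generic();
    ParallelRegionFnTy WorkFn = nullptr;
    bool IsActive = kernel_parallel(&WorkFn);  // fills WorkFn for this team
    if (!WorkFn)
      return;                                  // ...finished: kernel is done
    if (IsActive) {                            // ...is_active.check
      if (WorkFn == &known_parallel_region_wrapper)
        known_parallel_region_wrapper(0, GTid); // direct call, no fallback
      else
        WorkFn(0, GTid);                        // ...fallback.execute
      kernel_end_parallel();                    // ...end.parallel
    }
    barrier_simple_generic();                   // ...done.barrier
  }
}
// ---------------------------------------------------------------------------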
4631 KernelInfoState StateBefore = getState();
4637 struct UpdateKernelEnvCRAII {
4638 AAKernelInfoFunction &AA;
4640 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4642 ~UpdateKernelEnvCRAII() {
4646 ConstantStruct *ExistingKernelEnvC =
4649 if (!AA.isValidState()) {
4650 AA.KernelEnvC = ExistingKernelEnvC;
4654 if (!AA.ReachedKnownParallelRegions.isValidState())
4655 AA.setUseGenericStateMachineOfKernelEnvironment(
4656 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4657 ExistingKernelEnvC));
4659 if (!AA.SPMDCompatibilityTracker.isValidState())
4660 AA.setExecModeOfKernelEnvironment(
4661 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4663 ConstantInt *MayUseNestedParallelismC =
4664 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4666 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4667 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4668 AA.setMayUseNestedParallelismOfKernelEnvironment(
4669 NewMayUseNestedParallelismC);
4679 if (!
I.mayWriteToMemory())
4682 const auto *UnderlyingObjsAA =
A.getAAFor<AAUnderlyingObjects>(
4684 DepClassTy::OPTIONAL);
4685 auto *
HS =
A.getAAFor<AAHeapToStack>(
4687 DepClassTy::OPTIONAL);
4688 if (UnderlyingObjsAA &&
4689 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4690 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4694 auto *CB = dyn_cast<CallBase>(&Obj);
4695 return CB && HS && HS->isAssumedHeapToStack(*CB);
4701 SPMDCompatibilityTracker.insert(&
I);
4705 bool UsedAssumedInformationInCheckRWInst =
false;
4706 if (!SPMDCompatibilityTracker.isAtFixpoint())
4707 if (!
A.checkForAllReadWriteInstructions(
4708 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4709 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4711 bool UsedAssumedInformationFromReachingKernels =
false;
4712 if (!IsKernelEntry) {
4713 updateParallelLevels(
A);
4715 bool AllReachingKernelsKnown =
true;
4716 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4717 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4719 if (!SPMDCompatibilityTracker.empty()) {
4720 if (!ParallelLevels.isValidState())
4721 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4722 else if (!ReachingKernelEntries.isValidState())
4723 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4729 for (
auto *
Kernel : ReachingKernelEntries) {
4730 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4732 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4733 CBAA->SPMDCompatibilityTracker.isAssumed())
4737 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4738 UsedAssumedInformationFromReachingKernels =
true;
4740 if (SPMD != 0 &&
Generic != 0)
4741 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4747 bool AllParallelRegionStatesWereFixed =
true;
4748 bool AllSPMDStatesWereFixed =
true;
4751 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4755 getState() ^= CBAA->getState();
4756 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4757 AllParallelRegionStatesWereFixed &=
4758 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4759 AllParallelRegionStatesWereFixed &=
4760 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4764 bool UsedAssumedInformationInCheckCallInst =
false;
4765 if (!
A.checkForAllCallLikeInstructions(
4766 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4768 <<
"Failed to visit all call-like instructions!\n";);
4769 return indicatePessimisticFixpoint();
4774 if (!UsedAssumedInformationInCheckCallInst &&
4775 AllParallelRegionStatesWereFixed) {
4776 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4777 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4782 if (!UsedAssumedInformationInCheckRWInst &&
4783 !UsedAssumedInformationInCheckCallInst &&
4784 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4785 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4787 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4788 : ChangeStatus::CHANGED;
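// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// updateImpl follows the usual Attributor idiom of snapshotting the state,
// refining it, and reporting CHANGED only if the snapshot differs afterwards,
// so the fixpoint iteration knows whether dependent attributes must rerun.
// Minimal standalone analog with made-up types:
enum class UpdateResult { Unchanged, Changed };

struct KernelStateSketch {
  bool SPMDCompatible = true;
  bool NestedParallelism = false;
  bool operator==(const KernelStateSketch &O) const {
    return SPMDCompatible == O.SPMDCompatible &&
           NestedParallelism == O.NestedParallelism;
  }
};

UpdateResult updateOnce(KernelStateSketch &State, bool SawIncompatibleInst,
                        bool SawNestedParallel) {
  KernelStateSketch StateBefore = State;   // snapshot, as in updateImpl above
  if (SawIncompatibleInst)
    State.SPMDCompatible = false;          // refine pessimistically
  State.NestedParallelism |= SawNestedParallel;
  return State == StateBefore ? UpdateResult::Unchanged : UpdateResult::Changed;
}
// ---------------------------------------------------------------------------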
4793 void updateReachingKernelEntries(Attributor &
A,
4794 bool &AllReachingKernelsKnown) {
4795 auto PredCallSite = [&](AbstractCallSite ACS) {
4798 assert(Caller &&
"Caller is nullptr");
4800 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4802 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4803 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4809 ReachingKernelEntries.indicatePessimisticFixpoint();
4814 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4816 AllReachingKernelsKnown))
4817 ReachingKernelEntries.indicatePessimisticFixpoint();
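// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// updateReachingKernelEntries unions the callers' sets of reaching kernels
// over every known call site and gives up (pessimistic fixpoint) if any call
// site or caller state is unknown.  Standalone analog with plain sets; the
// names are made up for this sketch.
#include <optional>
#include <set>
#include <string>
#include <vector>

using KernelSet = std::set<std::string>;

// Returns nullopt when some caller's kernel set is unknown.
std::optional<KernelSet>
computeReachingKernels(const std::vector<std::optional<KernelSet>> &Callers) {
  KernelSet Reaching;
  for (const auto &CallerKernels : Callers) {
    if (!CallerKernels)
      return std::nullopt;   // unknown caller: the set cannot be bounded
    Reaching.insert(CallerKernels->begin(), CallerKernels->end());
  }
  return Reaching;
}
// ---------------------------------------------------------------------------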
4821 void updateParallelLevels(Attributor &
A) {
4822 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4823 OMPInformationCache::RuntimeFunctionInfo &Parallel60RFI =
4824 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_60];
4826 auto PredCallSite = [&](AbstractCallSite ACS) {
4829 assert(Caller &&
"Caller is nullptr");
4833 if (CAA && CAA->ParallelLevels.isValidState()) {
4839 if (Caller == Parallel60RFI.Declaration) {
4840 ParallelLevels.indicatePessimisticFixpoint();
4844 ParallelLevels ^= CAA->ParallelLevels;
4851 ParallelLevels.indicatePessimisticFixpoint();
4856 bool AllCallSitesKnown =
true;
4857 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4860 ParallelLevels.indicatePessimisticFixpoint();
4867struct AAKernelInfoCallSite : AAKernelInfo {
4868 AAKernelInfoCallSite(
const IRPosition &IRP, Attributor &
A)
4869 : AAKernelInfo(IRP,
A) {}
4873 AAKernelInfo::initialize(
A);
4876 auto *AssumptionAA =
A.getAAFor<AAAssumptionInfo>(
4880 if (AssumptionAA && AssumptionAA->hasAssumption(
"ompx_spmd_amenable")) {
4881 indicateOptimisticFixpoint();
4889 indicateOptimisticFixpoint();
4898 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4899 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4900 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4902 if (!Callee || !
A.isFunctionIPOAmendable(*Callee)) {
4906 if (!AssumptionAA ||
4907 !(AssumptionAA->hasAssumption(
"omp_no_openmp") ||
4908 AssumptionAA->hasAssumption(
"omp_no_parallelism")))
4909 ReachedUnknownParallelRegions.insert(&CB);
4913 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4914 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4915 SPMDCompatibilityTracker.insert(&CB);
4920 indicateOptimisticFixpoint();
4926 if (NumCallees > 1) {
4927 indicatePessimisticFixpoint();
4934 case OMPRTL___kmpc_is_spmd_exec_mode:
4935 case OMPRTL___kmpc_distribute_static_fini:
4936 case OMPRTL___kmpc_for_static_fini:
4937 case OMPRTL___kmpc_global_thread_num:
4938 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4939 case OMPRTL___kmpc_get_hardware_num_blocks:
4940 case OMPRTL___kmpc_single:
4941 case OMPRTL___kmpc_end_single:
4942 case OMPRTL___kmpc_master:
4943 case OMPRTL___kmpc_end_master:
4944 case OMPRTL___kmpc_barrier:
4945 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4946 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4947 case OMPRTL___kmpc_error:
4948 case OMPRTL___kmpc_flush:
4949 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4950 case OMPRTL___kmpc_get_warp_size:
4951 case OMPRTL_omp_get_thread_num:
4952 case OMPRTL_omp_get_num_threads:
4953 case OMPRTL_omp_get_max_threads:
4954 case OMPRTL_omp_in_parallel:
4955 case OMPRTL_omp_get_dynamic:
4956 case OMPRTL_omp_get_cancellation:
4957 case OMPRTL_omp_get_nested:
4958 case OMPRTL_omp_get_schedule:
4959 case OMPRTL_omp_get_thread_limit:
4960 case OMPRTL_omp_get_supported_active_levels:
4961 case OMPRTL_omp_get_max_active_levels:
4962 case OMPRTL_omp_get_level:
4963 case OMPRTL_omp_get_ancestor_thread_num:
4964 case OMPRTL_omp_get_team_size:
4965 case OMPRTL_omp_get_active_level:
4966 case OMPRTL_omp_in_final:
4967 case OMPRTL_omp_get_proc_bind:
4968 case OMPRTL_omp_get_num_places:
4969 case OMPRTL_omp_get_num_procs:
4970 case OMPRTL_omp_get_place_proc_ids:
4971 case OMPRTL_omp_get_place_num:
4972 case OMPRTL_omp_get_partition_num_places:
4973 case OMPRTL_omp_get_partition_place_nums:
4974 case OMPRTL_omp_get_wtime:
4976 case OMPRTL___kmpc_distribute_static_init_4:
4977 case OMPRTL___kmpc_distribute_static_init_4u:
4978 case OMPRTL___kmpc_distribute_static_init_8:
4979 case OMPRTL___kmpc_distribute_static_init_8u:
4980 case OMPRTL___kmpc_for_static_init_4:
4981 case OMPRTL___kmpc_for_static_init_4u:
4982 case OMPRTL___kmpc_for_static_init_8:
4983 case OMPRTL___kmpc_for_static_init_8u: {
4985 unsigned ScheduleArgOpNo = 2;
4986 auto *ScheduleTypeCI =
4988 unsigned ScheduleTypeVal =
4989 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4991 case OMPScheduleType::UnorderedStatic:
4992 case OMPScheduleType::UnorderedStaticChunked:
4993 case OMPScheduleType::OrderedDistribute:
4994 case OMPScheduleType::OrderedDistributeChunked:
4997 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4998 SPMDCompatibilityTracker.insert(&CB);
5002 case OMPRTL___kmpc_target_init:
5005 case OMPRTL___kmpc_target_deinit:
5006 KernelDeinitCB = &CB;
5008 case OMPRTL___kmpc_parallel_60:
5009 if (!handleParallel60(
A, CB))
5010 indicatePessimisticFixpoint();
5012 case OMPRTL___kmpc_omp_task:
5014 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5015 SPMDCompatibilityTracker.insert(&CB);
5016 ReachedUnknownParallelRegions.insert(&CB);
5018 case OMPRTL___kmpc_alloc_shared:
5019 case OMPRTL___kmpc_free_shared:
5025 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5026 SPMDCompatibilityTracker.insert(&CB);
5032 indicateOptimisticFixpoint();
5036 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5037 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5038 CheckCallee(getAssociatedFunction(), 1);
5041 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5042 for (
auto *Callee : OptimisticEdges) {
5043 CheckCallee(Callee, OptimisticEdges.size());
5054 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5055 KernelInfoState StateBefore = getState();
5057 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5058 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5062 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5065 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5067 return indicatePessimisticFixpoint();
5068 if (getState() == FnAA->getState())
5069 return ChangeStatus::UNCHANGED;
5070 getState() = FnAA->getState();
5071 return ChangeStatus::CHANGED;
5074 return indicatePessimisticFixpoint();
5077 if (It->getSecond() == OMPRTL___kmpc_parallel_60) {
5078 if (!handleParallel60(
A, CB))
5079 return indicatePessimisticFixpoint();
5080 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5081 : ChangeStatus::CHANGED;
5087 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5088 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5089 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5091 auto *HeapToStackAA =
A.getAAFor<AAHeapToStack>(
5093 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5101 case OMPRTL___kmpc_alloc_shared:
5102 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5103 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5104 SPMDCompatibilityTracker.insert(&CB);
5106 case OMPRTL___kmpc_free_shared:
5107 if ((!HeapToStackAA ||
5108 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5110 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5111 SPMDCompatibilityTracker.insert(&CB);
5114 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5115 SPMDCompatibilityTracker.insert(&CB);
5117 return ChangeStatus::CHANGED;
5121 A.getAAFor<AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5122 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5123 if (Function *
F = getAssociatedFunction())
5126 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5127 for (
auto *Callee : OptimisticEdges) {
5128 CheckCallee(Callee, OptimisticEdges.size());
5134 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5135 : ChangeStatus::CHANGED;
5140   bool handleParallel60(Attributor &A, CallBase &CB) {
5141     const unsigned int NonWrapperFunctionArgNo = 5;
5142     const unsigned int WrapperFunctionArgNo = 6;
5143     auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5144                                      ? NonWrapperFunctionArgNo
5145                                      : WrapperFunctionArgNo;
5149     if (!ParallelRegion)
5152     ReachedKnownParallelRegions.insert(&CB);
5154     auto *FnAA = A.getAAFor<AAKernelInfo>(
5156     NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5157                          !FnAA->ReachedKnownParallelRegions.empty() ||
5158                          !FnAA->ReachedKnownParallelRegions.isValidState() ||
5159                          !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5160                          !FnAA->ReachedUnknownParallelRegions.empty();
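// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// handleParallel60 records the outlined parallel region reached through a
// __kmpc_parallel_60 call.  In SPMD mode the region function operand
// (argument 5) is used directly; in generic mode the wrapper operand
// (argument 6) is used because the worker state machine dispatches on
// wrappers.  Hypothetical standalone analog over a plain argument vector:
#include <vector>

using FnPtr = void (*)();

FnPtr selectParallelRegionOperand(const std::vector<FnPtr> &CallArgs,
                                  bool AssumedSPMDMode) {
  const unsigned NonWrapperFunctionArgNo = 5; // region function (SPMD mode)
  const unsigned WrapperFunctionArgNo = 6;    // wrapper (generic mode)
  unsigned ArgNo =
      AssumedSPMDMode ? NonWrapperFunctionArgNo : WrapperFunctionArgNo;
  if (ArgNo >= CallArgs.size())
    return nullptr;                           // malformed call: give up
  return CallArgs[ArgNo];                     // may still be null/indirect
}
// ---------------------------------------------------------------------------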
5165struct AAFoldRuntimeCall
5166 :
public StateWrapper<BooleanState, AbstractAttribute> {
5167 using Base = StateWrapper<BooleanState, AbstractAttribute>;
5169 AAFoldRuntimeCall(
const IRPosition &IRP, Attributor &
A) :
Base(IRP) {}
5172 void trackStatistics()
const override {}
5175 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
5179 StringRef
getName()
const override {
return "AAFoldRuntimeCall"; }
5182 const char *getIdAddr()
const override {
return &
ID; }
5186 static bool classof(
const AbstractAttribute *AA) {
5190 static const char ID;
5193struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5194 AAFoldRuntimeCallCallSiteReturned(
const IRPosition &IRP, Attributor &
A)
5195 : AAFoldRuntimeCall(IRP,
A) {}
5198 const std::string getAsStr(Attributor *)
const override {
5199 if (!isValidState())
5202 std::string Str(
"simplified value: ");
5204 if (!SimplifiedValue)
5205 return Str + std::string(
"none");
5207 if (!*SimplifiedValue)
5208 return Str + std::string(
"nullptr");
5211 return Str + std::to_string(CI->getSExtValue());
5213 return Str + std::string(
"unknown");
5218 indicatePessimisticFixpoint();
5222 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5223 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5224 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5225 "Expected a known OpenMP runtime function");
5227 RFKind = It->getSecond();
5230 A.registerSimplificationCallback(
5232 [&](
const IRPosition &IRP,
const AbstractAttribute *AA,
5233 bool &UsedAssumedInformation) -> std::optional<Value *> {
5234 assert((isValidState() || SimplifiedValue ==
nullptr) &&
5235 "Unexpected invalid state!");
5237 if (!isAtFixpoint()) {
5238 UsedAssumedInformation =
true;
5240 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
5242 return SimplifiedValue;
5249 case OMPRTL___kmpc_is_spmd_exec_mode:
5252 case OMPRTL___kmpc_parallel_level:
5255 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5256 Changed =
Changed | foldKernelFnAttribute(
A,
"omp_target_thread_limit");
5258 case OMPRTL___kmpc_get_hardware_num_blocks:
5271 if (SimplifiedValue && *SimplifiedValue) {
5274 A.deleteAfterManifest(
I);
5277 auto Remark = [&](OptimizationRemark
OR) {
5279 return OR <<
"Replacing OpenMP runtime call "
5281 <<
ore::NV(
"FoldedValue",
C->getZExtValue()) <<
".";
5282 return OR <<
"Replacing OpenMP runtime call "
5287 A.emitRemark<OptimizationRemark>(CB,
"OMP180",
Remark);
5290 << **SimplifiedValue <<
"\n");
5292 Changed = ChangeStatus::CHANGED;
5299 SimplifiedValue =
nullptr;
5300 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5306     std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5308     unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5309     unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5310     auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5313     if (!CallerKernelInfoAA ||
5314         !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5315       return indicatePessimisticFixpoint();
5317     for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5319                                           DepClassTy::REQUIRED);
5321       if (!AA || !AA->isValidState()) {
5322         SimplifiedValue = nullptr;
5323         return indicatePessimisticFixpoint();
5326       if (AA->SPMDCompatibilityTracker.isAssumed()) {
5327         if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5332         if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5333           ++KnownNonSPMDCount;
5335           ++AssumedNonSPMDCount;
5339     if ((AssumedSPMDCount + KnownSPMDCount) &&
5340         (AssumedNonSPMDCount + KnownNonSPMDCount))
5341       return indicatePessimisticFixpoint();
5343     auto &Ctx = getAnchorValue().getContext();
5344     if (KnownSPMDCount || AssumedSPMDCount) {
5345       assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5346              "Expected only SPMD kernels!");
5349       SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), true);
5350     } else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5351       assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5352              "Expected only non-SPMD kernels!");
5355       SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), false);
5360       assert(!SimplifiedValue && "SimplifiedValue should be none");
5363     return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5364                                                     : ChangeStatus::CHANGED;
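// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above: the
// fold above only replaces __kmpc_is_spmd_exec_mode with a constant when
// every kernel that can reach the call agrees on the execution mode; any mix
// of SPMD and non-SPMD reaching kernels forces a pessimistic fixpoint.
// Standalone analog of the counting logic, with hypothetical types:
#include <optional>
#include <vector>

struct ReachingKernel {
  bool AssumedSPMD; // current assumption for this kernel
  bool AtFixpoint;  // whether that assumption is already known
};

// Returns 1 (all SPMD), 0 (all generic), or nullopt (mixed or unknown).
std::optional<int>
foldIsSPMDExecModeSketch(const std::vector<ReachingKernel> &Kernels) {
  unsigned AssumedSPMD = 0, KnownSPMD = 0;
  unsigned AssumedNonSPMD = 0, KnownNonSPMD = 0;
  for (const ReachingKernel &K : Kernels) {
    if (K.AssumedSPMD)
      (K.AtFixpoint ? KnownSPMD : AssumedSPMD) += 1;
    else
      (K.AtFixpoint ? KnownNonSPMD : AssumedNonSPMD) += 1;
  }
  if ((AssumedSPMD + KnownSPMD) && (AssumedNonSPMD + KnownNonSPMD))
    return std::nullopt; // both kinds reach the call: cannot fold
  if (KnownSPMD || AssumedSPMD)
    return 1;            // i8 true in the real pass
  if (KnownNonSPMD || AssumedNonSPMD)
    return 0;            // i8 false in the real pass
  return std::nullopt;   // no reaching kernels known yet
}
// ---------------------------------------------------------------------------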
5369 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5371 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5374 if (!CallerKernelInfoAA ||
5375 !CallerKernelInfoAA->ParallelLevels.isValidState())
5376 return indicatePessimisticFixpoint();
5378 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5379 return indicatePessimisticFixpoint();
5381 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5382 assert(!SimplifiedValue &&
5383 "SimplifiedValue should keep none at this point");
5384 return ChangeStatus::UNCHANGED;
5387 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5388 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5389 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5391 DepClassTy::REQUIRED);
5392 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5393 return indicatePessimisticFixpoint();
5395 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5396 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5401 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5402 ++KnownNonSPMDCount;
5404 ++AssumedNonSPMDCount;
5408 if ((AssumedSPMDCount + KnownSPMDCount) &&
5409 (AssumedNonSPMDCount + KnownNonSPMDCount))
5410 return indicatePessimisticFixpoint();
5412 auto &Ctx = getAnchorValue().getContext();
5416 if (AssumedSPMDCount || KnownSPMDCount) {
5417 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5418 "Expected only SPMD kernels!");
5419 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 1);
5421 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5422 "Expected only non-SPMD kernels!");
5423 SimplifiedValue = ConstantInt::get(Type::getInt8Ty(Ctx), 0);
5425 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5426 : ChangeStatus::CHANGED;
5429   ChangeStatus foldKernelFnAttribute(Attributor &A, llvm::StringRef Attr) {
5431     int32_t CurrentAttrValue = -1;
5432     std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5434     auto *CallerKernelInfoAA = A.getAAFor<AAKernelInfo>(
5437     if (!CallerKernelInfoAA ||
5438         !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5439       return indicatePessimisticFixpoint();
5442     for (Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5443       int32_t NextAttrVal = K->getFnAttributeAsParsedInteger(Attr, -1);
5445       if (NextAttrVal == -1 ||
5446           (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5447         return indicatePessimisticFixpoint();
5448       CurrentAttrValue = NextAttrVal;
5451     if (CurrentAttrValue != -1) {
5452       auto &Ctx = getAnchorValue().getContext();
5454           ConstantInt::get(Type::getInt32Ty(Ctx), CurrentAttrValue);
5456     return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5457                                                     : ChangeStatus::CHANGED;
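// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the original source listing above:
// foldKernelFnAttribute folds calls such as
// __kmpc_get_hardware_num_threads_in_block to a constant only when every
// reaching kernel carries the same parsed integer attribute (for example
// "omp_target_thread_limit") and none of them is missing it.  Standalone
// analog, with -1 meaning "attribute absent":
#include <cstdint>
#include <optional>
#include <vector>

std::optional<int32_t>
foldCommonKernelAttribute(const std::vector<int32_t> &ParsedAttrPerKernel) {
  int32_t CurrentAttrValue = -1;
  for (int32_t NextAttrVal : ParsedAttrPerKernel) {
    if (NextAttrVal == -1 ||
        (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
      return std::nullopt; // missing or conflicting: cannot fold
    CurrentAttrValue = NextAttrVal;
  }
  if (CurrentAttrValue == -1)
    return std::nullopt;   // no kernels seen yet
  return CurrentAttrValue; // becomes an i32 constant in the pass
}
// ---------------------------------------------------------------------------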
5463 std::optional<Value *> SimplifiedValue;
5473 auto &RFI = OMPInfoCache.RFIs[RF];
5474 RFI.foreachUse(SCC, [&](Use &U, Function &
F) {
5475 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5478 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5480 DepClassTy::NONE,
false,
5486void OpenMPOpt::registerAAs(
bool IsModulePass) {
5496 A.getOrCreateAAFor<AAKernelInfo>(
5498 DepClassTy::NONE,
false,
5502 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5503 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5504 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5506 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5507 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5514 for (
int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
5517 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5520 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5527 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5531 GetterRFI.foreachUse(SCC, CreateAA);
5540 for (
auto *
F : SCC) {
5541 if (
F->isDeclaration())
5547 if (
F->hasLocalLinkage()) {
5549 const auto *CB = dyn_cast<CallBase>(U.getUser());
5550 return CB && CB->isCallee(&U) &&
5551 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5555 registerAAsForFunction(
A, *
F);
5559void OpenMPOpt::registerAAsForFunction(Attributor &
A,
const Function &
F) {
5565 if (
F.hasFnAttribute(Attribute::Convergent))
5570 bool UsedAssumedInformation =
false;
5573 A.getOrCreateAAFor<AAAddressSpace>(
5579 A.getOrCreateAAFor<AAIndirectCallInfo>(
5584 A.getOrCreateAAFor<AAAddressSpace>(
5593 if (
II->getIntrinsicID() == Intrinsic::assume) {
5594 A.getOrCreateAAFor<AAPotentialValues>(
5602 const char AAICVTracker::ID = 0;
5603 const char AAKernelInfo::ID = 0;
5605 const char AAHeapToShared::ID = 0;
5606 const char AAFoldRuntimeCall::ID = 0;
5608 AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
5610   AAICVTracker *AA = nullptr;
5618     AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
5621     AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
5624     AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
5627     AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
5636   AAExecutionDomainFunction *AA = nullptr;
5646         "AAExecutionDomain can only be created for function position!");
5648     AA = new (A.Allocator) AAExecutionDomainFunction(IRP, A);
5655 AAHeapToShared &AAHeapToShared::createForPosition(const IRPosition &IRP,
5657   AAHeapToSharedFunction *AA = nullptr;
5667         "AAHeapToShared can only be created for function position!");
5669     AA = new (A.Allocator) AAHeapToSharedFunction(IRP, A);
5676 AAKernelInfo &AAKernelInfo::createForPosition(const IRPosition &IRP,
5678   AAKernelInfo *AA = nullptr;
5688     AA = new (A.Allocator) AAKernelInfoCallSite(IRP, A);
5691     AA = new (A.Allocator) AAKernelInfoFunction(IRP, A);
5698 AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(const IRPosition &IRP,
5700   AAFoldRuntimeCall *AA = nullptr;
5709     llvm_unreachable("KernelInfo can only be created for call site position!");
5711     AA = new (A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP, A);
5732   if (Kernels.contains(&F))
5734   return !F.use_empty();
5741     return ORA << "Could not internalize function. "
5742                << "Some optimizations may not be possible. [OMP140]";
5754     if (!F.isDeclaration() && !Kernels.contains(&F) && IsCalled(F) &&
5758     } else if (!F.hasLocalLinkage() && !F.hasFnAttribute(Attribute::Cold)) {
5771     if (!F.isDeclaration() && !InternalizedMap.lookup(&F)) {
5773       Functions.insert(&F);
5791   OMPInformationCache InfoCache(M, AG, Allocator, nullptr, PostLink);
5793   unsigned MaxFixpointIterations =
5805     return F.hasFnAttribute("kernel");
5810   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5816       if (!F.isDeclaration() && !Kernels.contains(&F) &&
5817           !F.hasFnAttribute(Attribute::NoInline))
5818         F.addFnAttr(Attribute::AlwaysInline);
5848   Module &M = *C.begin()->getFunction().getParent();
5870   OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
5871                                 &Functions, PostLink);
5873   unsigned MaxFixpointIterations =
5887   OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
5888   bool Changed = OMPOpt.run(false);
5907     if (F.hasKernelCallingConv()) {
5912       ++NumOpenMPTargetRegionKernels;
5915       ++NumNonOpenMPTargetRegionKernels;
5922   Metadata *MD = M.getModuleFlag("openmp");
5930   Metadata *MD = M.getModuleFlag("openmp-device");
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
This file defines the DenseSet and SmallDenseSet classes.
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
Loop::LoopBounds::Direction Direction
Machine Check Debug Module
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicable functions on the device."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
std::pair< BasicBlock *, BasicBlock * > Edge
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, const llvm::StringTable &StandardNames, VectorLibrary VecLib)
Initialize the set of available library functions based on the specified target triple.
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
size_t size() const
size - Get the array size.
iterator begin()
Instruction iterator methods.
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
LLVM_ABI bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
LLVM_ABI Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
LLVM_ABI Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced before I will be available at both I1 and I2.
static ErrorSuccess success()
Create a success value.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit.
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if InitVal==NULL.
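A minimal sketch of swapping a global's initializer for a null constant of the same type; the helper name is illustrative.
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
using namespace llvm;

static void zeroOutInitializer(GlobalVariable &GV) {
  if (!GV.hasInitializer())
    return;
  Constant *OldInit = GV.getInitializer();
  GV.setInitializer(Constant::getNullValue(OldInit->getType()));
}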
BasicBlock * getBlock() const
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
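A minimal sketch combining CreateIsNull, CreateCondBr, and CreateCall: execute a call only when a pointer is non-null. The blocks and the callee are assumed to be set up by the caller.
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static void emitGuardedCall(IRBuilder<> &Builder, Value *Ptr,
                            FunctionCallee Callee, BasicBlock *CallBB,
                            BasicBlock *ContBB) {
  Value *IsNull = Builder.CreateIsNull(Ptr, "is.null");
  // If Ptr is null, skip the call; otherwise branch to the call block.
  Builder.CreateCondBr(IsNull, ContBB, CallBB);
  Builder.SetInsertPoint(CallBB);
  Builder.CreateCall(Callee, {Ptr});
  Builder.CreateBr(ContBB);
}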
LLVM_ABI bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
LLVM_ABI bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
LLVM_ABI bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
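A minimal sketch of the dead-instruction cleanup pattern implied by the queries above: an unused instruction without side effects can simply be unlinked and deleted.
#include "llvm/IR/Instruction.h"
using namespace llvm;

static bool tryEraseDeadInstruction(Instruction &I) {
  if (!I.use_empty() || I.mayHaveSideEffects())
    return false;
  I.eraseFromParent();
  return true;
}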
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
LLVM_ABI void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
LLVM_ABI StringRef getName() const
Return the name of the corresponding LLVM basic block, or an empty string.
A Module instance is used to store all the information related to an LLVM module.
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on threads for Kernel.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
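A minimal sketch of the OpenMPIRBuilder helpers above: create a source-location string, wrap it in an ident_t*, and fetch a runtime function declaration. The location string and the choice of __kmpc_global_thread_num are illustrative.
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;
using namespace omp;

static Constant *buildIdent(OpenMPIRBuilder &OMPBuilder, Module &M) {
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr =
      OMPBuilder.getOrCreateSrcLocStr(";unknown;unknown;0;0;;", SrcLocStrSize);
  Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  // Declaration of a runtime entry point, created on demand.
  FunctionCallee ThreadNumFn = OMPBuilder.getOrCreateRuntimeFunction(
      M, OMPRTL___kmpc_global_thread_num);
  (void)ThreadNumFn;
  return Ident;
}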
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on teams for Kernel.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
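A minimal sketch of the usual return protocol for the run() overloads listed above: report everything preserved when nothing changed, nothing preserved otherwise.
#include "llvm/IR/PassManager.h"
using namespace llvm;

static PreservedAnalyses reportChange(bool Changed) {
  return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}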
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
size_type count(const_arg_type key) const
Count the number of elements of a given key in the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
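A minimal sketch of the ADT containers listed above used together: a SetVector as an ordered worklist, a SmallPtrSet for cheap membership tests, and a SmallVector to record the visit order. The traversal itself is illustrative.
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
using namespace llvm;

static void collectInOrder(Function &Entry) {
  SetVector<Function *> Worklist;     // duplicate-free, keeps insertion order
  SmallPtrSet<Function *, 8> Visited; // fast "seen before?" queries
  SmallVector<Function *> Order;      // growable array, small-size optimized

  Worklist.insert(&Entry);
  while (!Worklist.empty()) {
    Function *F = Worklist.pop_back_val();
    if (!Visited.insert(F).second)
      continue;
    Order.push_back(F);
  }
}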
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
LLVM_ABI bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
Type * getType() const
All values are typed, get the type of this value.
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
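A minimal sketch of replaceAllUsesWith as referenced above; the type check mirrors the precondition that the replacement must have the same type as the value it replaces.
#include "llvm/IR/Value.h"
using namespace llvm;

static void replaceIfTypesMatch(Value &Old, Value &New) {
  if (Old.getType() != New.getType())
    return; // replaceAllUsesWith requires matching types.
  Old.replaceAllUsesWith(&New);
}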
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
Abstract Attribute helper functions.
LLVM_ABI bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is a constant,...
LLVM_ABI bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
LLVM_ABI bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ BasicBlock
Various leaf nodes.
initializer< Ty > init(const Ty &Val)
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is a OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
SetVector< Kernel > KernelSet
Set of kernels in the module.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
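A minimal sketch of the OpenMP helpers above, assuming they are available via llvm/Transforms/IPO/OpenMPOpt.h: bail out on modules without OpenMP, then count the device kernels that are OpenMP kernels (getDeviceKernels may also return non-OpenMP kernels).
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO/OpenMPOpt.h"
using namespace llvm;
using namespace omp;

static unsigned countOpenMPKernels(Module &M) {
  if (!containsOpenMP(M) || !isOpenMPDevice(M))
    return 0;
  unsigned NumOpenMPKernels = 0;
  for (Kernel K : getDeviceKernels(M))
    if (isOpenMPKernel(*K)) // K carries the "kernel" attribute
      ++NumOpenMPKernels;
  return NumOpenMPKernels;
}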
DiagnosticInfoOptimizationBase::Argument NV
NodeAddr< UseNode * > Use
friend class Instruction
Iterator for Instructions in a BasicBlock.
LLVM_ABI iterator begin() const
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool operator!=(uint64_t V1, const APInt &V2)
constexpr from_range_t from_range
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
AnalysisManager< LazyCallGraph::SCC, LazyCallGraph & > CGSCCAnalysisManager
The CGSCC analysis manager.
@ ThinLTOPostLink
ThinLTO postlink (backend compile) phase.
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
auto dyn_cast_or_null(const Y &Val)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the given range.
ArrayRef(const T &OneElt) -> ArrayRef< T >
std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral=false, bool UpperCase=true, bool InsertSeparators=false)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
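A minimal sketch of the isa/cast/dyn_cast idioms listed above, applied to a call site; the helper is illustrative.
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
using namespace llvm;

static Function *calledFunctionOrNull(Value *V) {
  if (!isa<CallBase>(V)) // pure type query, no conversion
    return nullptr;
  auto *CB = cast<CallBase>(V); // asserts if the type does not match
  if (auto *CI = dyn_cast<CallInst>(CB)) // returns nullptr instead of asserting
    return CI->getCalledFunction();
  return nullptr;
}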
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ OPTIONAL
The target may be valid if the source is not.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
BumpPtrAllocatorImpl<> BumpPtrAllocator
The standard BumpPtrAllocator which just uses the default template parameters.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
static LLVM_ABI AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
AAExecutionDomain(const IRPosition &IRP, Attributor &A)
static LLVM_ABI const char ID
Unique ID (due to the unique address)
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static LLVM_ABI bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
Base struct for all "concrete attribute" deductions.
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
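A minimal sketch of how the AttributorConfig fields above are typically filled in before constructing an Attributor; the particular values are illustrative, and CGUpdater is assumed to be provided by the caller.
#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
using namespace llvm;

static AttributorConfig makeConfig(CallGraphUpdater &CGUpdater) {
  AttributorConfig AC(CGUpdater);
  AC.IsModulePass = true;        // run as a module pass
  AC.RewriteSignatures = false;  // keep function signatures intact
  AC.DefaultInitializeLiveInternals = false;
  AC.MaxFixpointIterations = 32; // cap the fixpoint iteration count
  return AC;
}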
The fixpoint analysis framework that orchestrates the attribute deduction.
static LLVM_ABI bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Value * >( const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
std::function< std::optional< Constant * >( const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static LLVM_ABI bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage afterwards.
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates to the call graph back to the CGSCC pass manager infrastructure.
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
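A minimal sketch of querying an abstract attribute at the IRPosition factories listed above; the choice of AAExecutionDomain and the OPTIONAL dependence class are illustrative.
#include "llvm/Transforms/IPO/Attributor.h"
using namespace llvm;

static void queryPositions(Attributor &A, const AbstractAttribute &QueryingAA,
                           Function &F, CallBase &CB) {
  // Function-scope query for the execution domain of F.
  const auto *ED = A.getAAFor<AAExecutionDomain>(
      QueryingAA, IRPosition::function(F), DepClassTy::OPTIONAL);
  (void)ED;
  // Positions can also name a call site's return value or a plain value.
  IRPosition RetPos = IRPosition::callsite_returned(CB);
  IRPosition ValPos = IRPosition::value(CB);
  (void)RetPos;
  (void)ValPos;
}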
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.