50#include "llvm/IR/IntrinsicsAMDGPU.h"
51#include "llvm/IR/IntrinsicsNVPTX.h"
// Pass-wide debug/remark category; used by LLVM_DEBUG and emitted remarks.
#define DEBUG_TYPE "openmp-opt"
70 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
74 "openmp-opt-enable-merging",
80 cl::desc(
"Disable function internalization."),
91 "openmp-hide-memory-transfer-latency",
92 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
97 "openmp-opt-disable-deglobalization",
98 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
102 "openmp-opt-disable-spmdization",
103 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
107 "openmp-opt-disable-folding",
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
122 "openmp-opt-print-module-after",
123 cl::desc(
"Print the current module after OpenMP optimizations."),
127 "openmp-opt-print-module-before",
128 cl::desc(
"Print the current module before OpenMP optimizations."),
132 "openmp-opt-inline-device",
143 cl::desc(
"Maximal number of attributor iterations."),
148 cl::desc(
"Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits<unsigned>::max()));
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
156 "Number of OpenMP runtime functions identified");
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
176 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
179 "Number of OpenMP parallel regions merged");
181 "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
/// Declares a constexpr constant `<MEMBER>Idx` holding the aggregate index
/// of \p MEMBER within the kernel environment struct.
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
  constexpr const unsigned MEMBER##Idx = IDX;
216#undef KERNEL_ENVIRONMENT_IDX
/// Declares a constexpr constant `<MEMBER>Idx` holding the aggregate index
/// of \p MEMBER within the kernel environment configuration struct.
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
  constexpr const unsigned MEMBER##Idx = IDX;
229#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
231#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
232 RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
233 return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
239#undef KERNEL_ENVIRONMENT_GETTER
241#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
242 ConstantInt *get##MEMBER##FromKernelEnvironment( \
243 ConstantStruct *KernelEnvC) { \
244 ConstantStruct *ConfigC = \
245 getConfigurationFromKernelEnvironment(KernelEnvC); \
246 return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
257#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
261 constexpr const int InitKernelEnvironmentArgNo = 0;
262 return cast<GlobalVariable>(
// Forward declaration; the abstract attribute is defined later in this file.
struct AAHeapToShared;
287 OpenMPPostLink(OpenMPPostLink) {
290 OMPBuilder.initialize();
291 initializeRuntimeFunctions(M);
292 initializeInternalControlVars();
296 struct InternalControlVarInfo {
323 struct RuntimeFunctionInfo {
347 void clearUsesMap() { UsesMap.
clear(); }
350 operator bool()
const {
return Declaration; }
353 UseVector &getOrCreateUseVector(
Function *
F) {
354 std::shared_ptr<UseVector> &UV = UsesMap[
F];
356 UV = std::make_shared<UseVector>();
362 const UseVector *getUseVector(
Function &
F)
const {
363 auto I = UsesMap.find(&
F);
364 if (
I != UsesMap.end())
365 return I->second.get();
370 size_t getNumFunctionsWithUses()
const {
return UsesMap.size(); }
374 size_t getNumArgs()
const {
return ArgumentTypes.
size(); }
392 UseVector &UV = getOrCreateUseVector(
F);
402 while (!ToBeDeleted.
empty()) {
416 decltype(UsesMap)::iterator
begin() {
return UsesMap.
begin(); }
417 decltype(UsesMap)::iterator
end() {
return UsesMap.
end(); }
425 RuntimeFunction::OMPRTL___last>
433 InternalControlVar::ICV___last>
438 void initializeInternalControlVars() {
439#define ICV_RT_SET(_Name, RTL) \
441 auto &ICV = ICVs[_Name]; \
444#define ICV_RT_GET(Name, RTL) \
446 auto &ICV = ICVs[Name]; \
449#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
451 auto &ICV = ICVs[Enum]; \
454 ICV.InitKind = Init; \
455 ICV.EnvVarName = _EnvVarName; \
456 switch (ICV.InitKind) { \
457 case ICV_IMPLEMENTATION_DEFINED: \
458 ICV.InitValue = nullptr; \
461 ICV.InitValue = ConstantInt::get( \
462 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
465 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
471#include "llvm/Frontend/OpenMP/OMPKinds.def"
477 static bool declMatchesRTFTypes(
Function *
F,
Type *RTFRetType,
484 if (
F->getReturnType() != RTFRetType)
486 if (
F->arg_size() != RTFArgTypes.
size())
489 auto *RTFTyIt = RTFArgTypes.
begin();
491 if (Arg.getType() != *RTFTyIt)
501 unsigned collectUses(RuntimeFunctionInfo &RFI,
bool CollectStats =
true) {
502 unsigned NumUses = 0;
503 if (!RFI.Declaration)
508 NumOpenMPRuntimeFunctionsIdentified += 1;
509 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
513 for (
Use &U : RFI.Declaration->uses()) {
514 if (
Instruction *UserI = dyn_cast<Instruction>(
U.getUser())) {
515 if (!
CGSCC ||
CGSCC->empty() ||
CGSCC->contains(UserI->getFunction())) {
516 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
520 RFI.getOrCreateUseVector(
nullptr).push_back(&U);
529 auto &RFI = RFIs[RTF];
531 collectUses(RFI,
false);
535 void recollectUses() {
536 for (
int Idx = 0;
Idx < RFIs.size(); ++
Idx)
556 RuntimeFunctionInfo &RFI = RFIs[Fn];
558 if (RFI.Declaration && RFI.Declaration->isDeclaration())
566 void initializeRuntimeFunctions(
Module &M) {
569#define OMP_TYPE(VarName, ...) \
570 Type *VarName = OMPBuilder.VarName; \
573#define OMP_ARRAY_TYPE(VarName, ...) \
574 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
576 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
577 (void)VarName##PtrTy;
579#define OMP_FUNCTION_TYPE(VarName, ...) \
580 FunctionType *VarName = OMPBuilder.VarName; \
582 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
585#define OMP_STRUCT_TYPE(VarName, ...) \
586 StructType *VarName = OMPBuilder.VarName; \
588 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
591#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
593 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
594 Function *F = M.getFunction(_Name); \
595 RTLFunctions.insert(F); \
596 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
597 RuntimeFunctionIDMap[F] = _Enum; \
598 auto &RFI = RFIs[_Enum]; \
601 RFI.IsVarArg = _IsVarArg; \
602 RFI.ReturnType = OMPBuilder._ReturnType; \
603 RFI.ArgumentTypes = std::move(ArgsTypes); \
604 RFI.Declaration = F; \
605 unsigned NumUses = collectUses(RFI); \
608 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
610 if (RFI.Declaration) \
611 dbgs() << TAG << "-> got " << NumUses << " uses in " \
612 << RFI.getNumFunctionsWithUses() \
613 << " different functions.\n"; \
617#include "llvm/Frontend/OpenMP/OMPKinds.def"
623 for (
StringRef Prefix : {
"__kmpc",
"_ZN4ompx",
"omp_"})
624 if (
F.hasFnAttribute(Attribute::NoInline) &&
625 F.getName().starts_with(Prefix) &&
626 !
F.hasFnAttribute(Attribute::OptimizeNone))
627 F.removeFnAttr(Attribute::NoInline);
638 bool OpenMPPostLink =
false;
641template <
typename Ty,
bool InsertInval
idates = true>
643 bool contains(
const Ty &Elem)
const {
return Set.contains(Elem); }
644 bool insert(
const Ty &Elem) {
645 if (InsertInvalidates)
647 return Set.insert(Elem);
650 const Ty &operator[](
int Idx)
const {
return Set[
Idx]; }
651 bool operator==(
const BooleanStateWithSetVector &RHS)
const {
652 return BooleanState::operator==(RHS) && Set ==
RHS.Set;
654 bool operator!=(
const BooleanStateWithSetVector &RHS)
const {
655 return !(*
this ==
RHS);
658 bool empty()
const {
return Set.empty(); }
659 size_t size()
const {
return Set.size(); }
662 BooleanStateWithSetVector &
operator^=(
const BooleanStateWithSetVector &RHS) {
663 BooleanState::operator^=(RHS);
664 Set.insert(
RHS.Set.begin(),
RHS.Set.end());
673 typename decltype(Set)::iterator
begin() {
return Set.
begin(); }
674 typename decltype(Set)::iterator
end() {
return Set.
end(); }
679template <
typename Ty,
bool InsertInval
idates = true>
680using BooleanStateWithPtrSetVector =
681 BooleanStateWithSetVector<Ty *, InsertInvalidates>;
685 bool IsAtFixpoint =
false;
689 BooleanStateWithPtrSetVector<
CallBase,
false>
690 ReachedKnownParallelRegions;
693 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
698 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
713 bool IsKernelEntry =
false;
716 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
721 BooleanStateWithSetVector<uint8_t> ParallelLevels;
724 bool NestedParallelism =
false;
729 KernelInfoState() =
default;
730 KernelInfoState(
bool BestState) {
739 bool isAtFixpoint()
const override {
return IsAtFixpoint; }
744 ParallelLevels.indicatePessimisticFixpoint();
745 ReachingKernelEntries.indicatePessimisticFixpoint();
746 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
747 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
748 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
749 NestedParallelism =
true;
756 ParallelLevels.indicateOptimisticFixpoint();
757 ReachingKernelEntries.indicateOptimisticFixpoint();
758 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
759 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
760 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
765 KernelInfoState &getAssumed() {
return *
this; }
766 const KernelInfoState &getAssumed()
const {
return *
this; }
768 bool operator==(
const KernelInfoState &RHS)
const {
769 if (SPMDCompatibilityTracker !=
RHS.SPMDCompatibilityTracker)
771 if (ReachedKnownParallelRegions !=
RHS.ReachedKnownParallelRegions)
773 if (ReachedUnknownParallelRegions !=
RHS.ReachedUnknownParallelRegions)
775 if (ReachingKernelEntries !=
RHS.ReachingKernelEntries)
777 if (ParallelLevels !=
RHS.ParallelLevels)
779 if (NestedParallelism !=
RHS.NestedParallelism)
785 bool mayContainParallelRegion() {
786 return !ReachedKnownParallelRegions.empty() ||
787 !ReachedUnknownParallelRegions.empty();
791 static KernelInfoState getBestState() {
return KernelInfoState(
true); }
793 static KernelInfoState getBestState(KernelInfoState &KIS) {
794 return getBestState();
798 static KernelInfoState getWorstState() {
return KernelInfoState(
false); }
801 KernelInfoState
operator^=(
const KernelInfoState &KIS) {
803 if (KIS.KernelInitCB) {
804 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
807 KernelInitCB = KIS.KernelInitCB;
809 if (KIS.KernelDeinitCB) {
810 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
813 KernelDeinitCB = KIS.KernelDeinitCB;
815 if (KIS.KernelEnvC) {
816 if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
819 KernelEnvC = KIS.KernelEnvC;
821 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
822 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
823 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
824 NestedParallelism |= KIS.NestedParallelism;
828 KernelInfoState
operator&=(
const KernelInfoState &KIS) {
829 return (*
this ^= KIS);
845 OffloadArray() =
default;
852 if (!
Array.getAllocatedType()->isArrayTy())
855 if (!getValues(Array,
Before))
858 this->Array = &
Array;
862 static const unsigned DeviceIDArgNum = 1;
863 static const unsigned BasePtrsArgNum = 3;
864 static const unsigned PtrsArgNum = 4;
865 static const unsigned SizesArgNum = 5;
873 const uint64_t NumValues =
Array.getAllocatedType()->getArrayNumElements();
874 StoredValues.
assign(NumValues,
nullptr);
875 LastAccesses.
assign(NumValues,
nullptr);
880 if (BB !=
Before.getParent())
890 if (!isa<StoreInst>(&
I))
893 auto *S = cast<StoreInst>(&
I);
900 LastAccesses[
Idx] = S;
910 const unsigned NumValues = StoredValues.
size();
911 for (
unsigned I = 0;
I < NumValues; ++
I) {
912 if (!StoredValues[
I] || !LastAccesses[
I])
922 using OptimizationRemarkGetter =
926 OptimizationRemarkGetter OREGetter,
927 OMPInformationCache &OMPInfoCache,
Attributor &A)
929 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache),
A(
A) {}
932 bool remarksEnabled() {
933 auto &Ctx =
M.getContext();
934 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(
DEBUG_TYPE);
938 bool run(
bool IsModulePass) {
942 bool Changed =
false;
948 Changed |= runAttributor(IsModulePass);
951 OMPInfoCache.recollectUses();
954 Changed |= rewriteDeviceCodeStateMachine();
956 if (remarksEnabled())
957 analysisGlobalization();
964 Changed |= runAttributor(IsModulePass);
967 OMPInfoCache.recollectUses();
969 Changed |= deleteParallelRegions();
972 Changed |= hideMemTransfersLatency();
973 Changed |= deduplicateRuntimeCalls();
975 if (mergeParallelRegions()) {
976 deduplicateRuntimeCalls();
982 if (OMPInfoCache.OpenMPPostLink)
983 Changed |= removeRuntimeSymbols();
990 void printICVs()
const {
995 for (
auto ICV : ICVs) {
996 auto ICVInfo = OMPInfoCache.ICVs[ICV];
998 return ORA <<
"OpenMP ICV " <<
ore::NV(
"OpenMPICV", ICVInfo.Name)
1000 << (ICVInfo.InitValue
1001 ?
toString(ICVInfo.InitValue->getValue(), 10,
true)
1002 :
"IMPLEMENTATION_DEFINED");
1005 emitRemark<OptimizationRemarkAnalysis>(
F,
"OpenMPICVTracker",
Remark);
1011 void printKernels()
const {
1017 return ORA <<
"OpenMP GPU kernel "
1018 <<
ore::NV(
"OpenMPGPUKernel",
F->getName()) <<
"\n";
1021 emitRemark<OptimizationRemarkAnalysis>(
F,
"OpenMPGPU",
Remark);
1027 static CallInst *getCallIfRegularCall(
1028 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1029 CallInst *CI = dyn_cast<CallInst>(
U.getUser());
1039 static CallInst *getCallIfRegularCall(
1040 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1041 CallInst *CI = dyn_cast<CallInst>(&V);
1051 bool mergeParallelRegions() {
1052 const unsigned CallbackCalleeOperand = 2;
1053 const unsigned CallbackFirstArgOperand = 3;
1057 OMPInformationCache::RuntimeFunctionInfo &RFI =
1058 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1060 if (!RFI.Declaration)
1064 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1065 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1066 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1069 bool Changed =
false;
1075 BasicBlock *StartBB =
nullptr, *EndBB =
nullptr;
1076 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1077 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1079 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1080 assert(StartBB !=
nullptr &&
"StartBB should not be null");
1082 assert(EndBB !=
nullptr &&
"EndBB should not be null");
1083 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1086 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
Value &,
1087 Value &Inner,
Value *&ReplacementValue) -> InsertPointTy {
1088 ReplacementValue = &Inner;
1092 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
1096 auto CreateSequentialRegion = [&](
Function *OuterFn,
1104 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1108 SplitBlock(ParentBB, SeqStartI, DT, LI,
nullptr,
"seq.par.merged");
1111 "Expected a different CFG");
1115 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1116 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1118 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1119 assert(SeqStartBB !=
nullptr &&
"SeqStartBB should not be null");
1121 assert(SeqEndBB !=
nullptr &&
"SeqEndBB should not be null");
1124 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
1130 for (
User *Usr :
I.users()) {
1138 OutsideUsers.
insert(&UsrI);
1141 if (OutsideUsers.
empty())
1148 I.getType(),
DL.getAllocaAddrSpace(),
nullptr,
1149 I.getName() +
".seq.output.alloc", OuterFn->
front().
begin());
1153 new StoreInst(&
I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1159 I.getName() +
".seq.output.load",
1166 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
1167 InsertPointTy SeqAfterIP =
1168 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
1170 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
1189 assert(MergableCIs.
size() > 1 &&
"Assumed multiple mergable CIs");
1192 OR <<
"Parallel region merged with parallel region"
1193 << (MergableCIs.
size() > 2 ?
"s" :
"") <<
" at ";
1196 if (CI != MergableCIs.
back())
1202 emitRemark<OptimizationRemark>(MergableCIs.
front(),
"OMP150",
Remark);
1206 <<
" parallel regions in " << OriginalFn->
getName()
1210 EndBB =
SplitBlock(BB, MergableCIs.
back()->getNextNode(), DT, LI);
1212 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1216 assert(BB->getUniqueSuccessor() == StartBB &&
"Expected a different CFG");
1217 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1222 for (
auto *It = MergableCIs.
begin(), *
End = MergableCIs.
end() - 1;
1231 CreateSequentialRegion(OriginalFn, BB, ForkCI->
getNextNode(),
1242 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
1243 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB,
nullptr,
nullptr,
1244 OMP_PROC_BIND_default,
false);
1248 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1255 for (
auto *CI : MergableCIs) {
1257 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1261 for (
unsigned U = CallbackFirstArgOperand, E = CI->
arg_size(); U < E;
1271 for (
unsigned U = CallbackFirstArgOperand, E = CI->
arg_size(); U < E;
1275 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1278 if (CI != MergableCIs.back()) {
1281 OMPInfoCache.OMPBuilder.createBarrier(
1290 assert(OutlinedFn != OriginalFn &&
"Outlining failed");
1291 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1292 CGUpdater.reanalyzeFunction(*OriginalFn);
1294 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1302 CallInst *CI = getCallIfRegularCall(U, &RFI);
1309 RFI.foreachUse(SCC, DetectPRsCB);
1315 for (
auto &It : BB2PRMap) {
1316 auto &CIs = It.getSecond();
1331 auto IsMergable = [&](
Instruction &
I,
bool IsBeforeMergableRegion) {
1334 if (
I.isTerminator())
1337 if (!isa<CallInst>(&
I))
1341 if (IsBeforeMergableRegion) {
1343 if (!CalledFunction)
1350 for (
const auto &RFI : UnmergableCallsInfo) {
1351 if (CalledFunction == RFI.Declaration)
1359 if (!isa<IntrinsicInst>(CI))
1370 if (CIs.count(&
I)) {
1376 if (IsMergable(
I, MergableCIs.
empty()))
1381 for (; It !=
End; ++It) {
1383 if (CIs.count(&SkipI)) {
1385 <<
" due to " <<
I <<
"\n");
1392 if (MergableCIs.
size() > 1) {
1393 MergableCIsVector.
push_back(MergableCIs);
1395 <<
" parallel regions in block " << BB->
getName()
1400 MergableCIs.
clear();
1403 if (!MergableCIsVector.
empty()) {
1406 for (
auto &MergableCIs : MergableCIsVector)
1407 Merge(MergableCIs, BB);
1408 MergableCIsVector.clear();
1415 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1416 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1417 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1418 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1425 bool deleteParallelRegions() {
1426 const unsigned CallbackCalleeOperand = 2;
1428 OMPInformationCache::RuntimeFunctionInfo &RFI =
1429 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1431 if (!RFI.Declaration)
1434 bool Changed =
false;
1436 CallInst *CI = getCallIfRegularCall(U);
1439 auto *Fn = dyn_cast<Function>(
1443 if (!Fn->onlyReadsMemory())
1445 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1452 return OR <<
"Removing parallel region with no side-effects.";
1454 emitRemark<OptimizationRemark>(CI,
"OMP160",
Remark);
1458 ++NumOpenMPParallelRegionsDeleted;
1462 RFI.foreachUse(SCC, DeleteCallCB);
1468 bool deduplicateRuntimeCalls() {
1469 bool Changed =
false;
1472 OMPRTL_omp_get_num_threads,
1473 OMPRTL_omp_in_parallel,
1474 OMPRTL_omp_get_cancellation,
1475 OMPRTL_omp_get_supported_active_levels,
1476 OMPRTL_omp_get_level,
1477 OMPRTL_omp_get_ancestor_thread_num,
1478 OMPRTL_omp_get_team_size,
1479 OMPRTL_omp_get_active_level,
1480 OMPRTL_omp_in_final,
1481 OMPRTL_omp_get_proc_bind,
1482 OMPRTL_omp_get_num_places,
1483 OMPRTL_omp_get_num_procs,
1484 OMPRTL_omp_get_place_num,
1485 OMPRTL_omp_get_partition_num_places,
1486 OMPRTL_omp_get_partition_place_nums};
1490 collectGlobalThreadIdArguments(GTIdArgs);
1492 <<
" global thread ID arguments\n");
1495 for (
auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1496 Changed |= deduplicateRuntimeCalls(
1497 *
F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1501 Value *GTIdArg =
nullptr;
1503 if (GTIdArgs.
count(&Arg)) {
1507 Changed |= deduplicateRuntimeCalls(
1508 *
F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1515 bool removeRuntimeSymbols() {
1521 if (!
GV->getType()->isPointerTy())
1529 GlobalVariable *Client = dyn_cast<GlobalVariable>(
C->stripPointerCasts());
1538 GV->eraseFromParent();
1551 bool hideMemTransfersLatency() {
1552 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1553 bool Changed =
false;
1555 auto *RTCall = getCallIfRegularCall(U, &RFI);
1559 OffloadArray OffloadArrays[3];
1560 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1563 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1566 bool WasSplit =
false;
1567 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1568 if (WaitMovementPoint)
1569 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1571 Changed |= WasSplit;
1574 if (OMPInfoCache.runtimeFnsAvailable(
1575 {OMPRTL___tgt_target_data_begin_mapper_issue,
1576 OMPRTL___tgt_target_data_begin_mapper_wait}))
1577 RFI.foreachUse(SCC, SplitMemTransfers);
1582 void analysisGlobalization() {
1583 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1585 auto CheckGlobalization = [&](
Use &
U,
Function &Decl) {
1586 if (
CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1589 <<
"Found thread data sharing on the GPU. "
1590 <<
"Expect degraded performance due to data globalization.";
1592 emitRemark<OptimizationRemarkMissed>(CI,
"OMP112",
Remark);
1598 RFI.foreachUse(SCC, CheckGlobalization);
1603 bool getValuesInOffloadArrays(
CallInst &RuntimeCall,
1605 assert(OAs.
size() == 3 &&
"Need space for three offload arrays!");
1615 Value *BasePtrsArg =
1624 if (!isa<AllocaInst>(V))
1626 auto *BasePtrsArray = cast<AllocaInst>(V);
1627 if (!OAs[0].
initialize(*BasePtrsArray, RuntimeCall))
1632 if (!isa<AllocaInst>(V))
1634 auto *PtrsArray = cast<AllocaInst>(V);
1635 if (!OAs[1].
initialize(*PtrsArray, RuntimeCall))
1641 if (isa<GlobalValue>(V))
1642 return isa<Constant>(V);
1643 if (!isa<AllocaInst>(V))
1646 auto *SizesArray = cast<AllocaInst>(V);
1647 if (!OAs[2].
initialize(*SizesArray, RuntimeCall))
1658 assert(OAs.
size() == 3 &&
"There are three offload arrays to debug!");
1661 std::string ValuesStr;
1663 std::string Separator =
" --- ";
1665 for (
auto *BP : OAs[0].StoredValues) {
1669 LLVM_DEBUG(
dbgs() <<
"\t\toffload_baseptrs: " << ValuesStr <<
"\n");
1672 for (
auto *
P : OAs[1].StoredValues) {
1679 for (
auto *S : OAs[2].StoredValues) {
1683 LLVM_DEBUG(
dbgs() <<
"\t\toffload_sizes: " << ValuesStr <<
"\n");
1693 bool IsWorthIt =
false;
1712 return RuntimeCall.
getParent()->getTerminator();
1716 bool splitTargetDataBeginRTC(
CallInst &RuntimeCall,
1721 auto &
IRBuilder = OMPInfoCache.OMPBuilder;
1725 Entry.getFirstNonPHIOrDbgOrAlloca());
1727 IRBuilder.AsyncInfo,
nullptr,
"handle");
1735 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1739 for (
auto &Arg : RuntimeCall.
args())
1740 Args.push_back(Arg.get());
1741 Args.push_back(Handle);
1745 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1751 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1753 Value *WaitParams[2] = {
1755 OffloadArray::DeviceIDArgNum),
1759 WaitDecl, WaitParams,
"", WaitMovementPoint.
getIterator());
1760 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
1765 static Value *combinedIdentStruct(
Value *CurrentIdent,
Value *NextIdent,
1766 bool GlobalOnly,
bool &SingleChoice) {
1767 if (CurrentIdent == NextIdent)
1768 return CurrentIdent;
1772 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1773 SingleChoice = !CurrentIdent;
1785 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1787 bool SingleChoice =
true;
1788 Value *Ident =
nullptr;
1790 CallInst *CI = getCallIfRegularCall(U, &RFI);
1791 if (!CI || &
F != &Caller)
1794 true, SingleChoice);
1797 RFI.foreachUse(SCC, CombineIdentStruct);
1799 if (!Ident || !SingleChoice) {
1802 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1804 &
F.getEntryBlock(),
F.getEntryBlock().begin()));
1809 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1810 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1817 bool deduplicateRuntimeCalls(
Function &
F,
1818 OMPInformationCache::RuntimeFunctionInfo &RFI,
1819 Value *ReplVal =
nullptr) {
1820 auto *UV = RFI.getUseVector(
F);
1821 if (!UV || UV->size() + (ReplVal !=
nullptr) < 2)
1825 dbgs() <<
TAG <<
"Deduplicate " << UV->size() <<
" uses of " << RFI.Name
1826 << (ReplVal ?
" with an existing value\n" :
"\n") <<
"\n");
1828 assert((!ReplVal || (isa<Argument>(ReplVal) &&
1829 cast<Argument>(ReplVal)->
getParent() == &
F)) &&
1830 "Unexpected replacement value!");
1833 auto CanBeMoved = [
this](
CallBase &CB) {
1834 unsigned NumArgs = CB.arg_size();
1837 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1839 for (
unsigned U = 1;
U < NumArgs; ++
U)
1840 if (isa<Instruction>(CB.getArgOperand(U)))
1851 for (
Use *U : *UV) {
1852 if (
CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1857 if (!CanBeMoved(*CI))
1865 assert(IP &&
"Expected insertion point!");
1866 cast<Instruction>(ReplVal)->moveBefore(IP);
1872 if (
CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1875 Value *Ident = getCombinedIdentFromCallUsesIn(RFI,
F,
1881 bool Changed =
false;
1883 CallInst *CI = getCallIfRegularCall(U, &RFI);
1884 if (!CI || CI == ReplVal || &
F != &Caller)
1889 return OR <<
"OpenMP runtime call "
1890 <<
ore::NV(
"OpenMPOptRuntime", RFI.Name) <<
" deduplicated.";
1893 emitRemark<OptimizationRemark>(CI,
"OMP170",
Remark);
1895 emitRemark<OptimizationRemark>(&
F,
"OMP170",
Remark);
1899 ++NumOpenMPRuntimeCallsDeduplicated;
1903 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1917 if (!
F.hasLocalLinkage())
1919 for (
Use &U :
F.uses()) {
1920 if (
CallInst *CI = getCallIfRegularCall(U)) {
1922 if (CI == &RefCI || GTIdArgs.
count(ArgOp) ||
1923 getCallIfRegularCall(
1924 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1933 auto AddUserArgs = [&](
Value >Id) {
1934 for (
Use &U : GTId.uses())
1935 if (
CallInst *CI = dyn_cast<CallInst>(
U.getUser()))
1938 if (CallArgOpIsGTId(*Callee,
U.getOperandNo(), *CI))
1943 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1944 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1946 GlobThreadNumRFI.foreachUse(SCC, [&](
Use &U,
Function &
F) {
1947 if (
CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1955 for (
unsigned U = 0;
U < GTIdArgs.
size(); ++
U)
1956 AddUserArgs(*GTIdArgs[U]);
1971 return getUniqueKernelFor(*
I.getFunction());
1976 bool rewriteDeviceCodeStateMachine();
1992 template <
typename RemarkKind,
typename RemarkCallBack>
1994 RemarkCallBack &&RemarkCB)
const {
1996 auto &ORE = OREGetter(
F);
2000 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I))
2001 <<
" [" << RemarkName <<
"]";
2005 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I)); });
2009 template <
typename RemarkKind,
typename RemarkCallBack>
2011 RemarkCallBack &&RemarkCB)
const {
2012 auto &ORE = OREGetter(
F);
2016 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F))
2017 <<
" [" << RemarkName <<
"]";
2021 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F)); });
2035 OptimizationRemarkGetter OREGetter;
2038 OMPInformationCache &OMPInfoCache;
2044 bool runAttributor(
bool IsModulePass) {
2048 registerAAs(IsModulePass);
2053 <<
" functions, result: " << Changed <<
".\n");
2055 if (Changed == ChangeStatus::CHANGED)
2056 OMPInfoCache.invalidateAnalyses();
2058 return Changed == ChangeStatus::CHANGED;
2065 void registerAAs(
bool IsModulePass);
2074 if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2075 !OMPInfoCache.CGSCC->contains(&
F))
2080 std::optional<Kernel> &CachedKernel = UniqueKernelMap[&
F];
2082 return *CachedKernel;
2089 return *CachedKernel;
2092 CachedKernel =
nullptr;
2093 if (!
F.hasLocalLinkage()) {
2097 return ORA <<
"Potentially unknown OpenMP target region caller.";
2099 emitRemark<OptimizationRemarkAnalysis>(&
F,
"OMP100",
Remark);
2105 auto GetUniqueKernelForUse = [&](
const Use &
U) ->
Kernel {
2106 if (
auto *Cmp = dyn_cast<ICmpInst>(
U.getUser())) {
2108 if (
Cmp->isEquality())
2109 return getUniqueKernelFor(*Cmp);
2112 if (
auto *CB = dyn_cast<CallBase>(
U.getUser())) {
2114 if (CB->isCallee(&U))
2115 return getUniqueKernelFor(*CB);
2117 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2118 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2120 if (OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI))
2121 return getUniqueKernelFor(*CB);
2130 OMPInformationCache::foreachUse(
F, [&](
const Use &U) {
2131 PotentialKernels.
insert(GetUniqueKernelForUse(U));
2135 if (PotentialKernels.
size() == 1)
2136 K = *PotentialKernels.
begin();
2139 UniqueKernelMap[&
F] =
K;
2144bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2145 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2146 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2148 bool Changed =
false;
2149 if (!KernelParallelRFI)
2160 bool UnknownUse =
false;
2161 bool KernelParallelUse =
false;
2162 unsigned NumDirectCalls = 0;
2165 OMPInformationCache::foreachUse(*
F, [&](
Use &U) {
2166 if (
auto *CB = dyn_cast<CallBase>(
U.getUser()))
2167 if (CB->isCallee(&U)) {
2172 if (isa<ICmpInst>(
U.getUser())) {
2173 ToBeReplacedStateMachineUses.push_back(&U);
2179 OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI);
2180 const unsigned int WrapperFunctionArgNo = 6;
2181 if (!KernelParallelUse && CI &&
2183 KernelParallelUse = true;
2184 ToBeReplacedStateMachineUses.push_back(&U);
2192 if (!KernelParallelUse)
2198 if (UnknownUse || NumDirectCalls != 1 ||
2199 ToBeReplacedStateMachineUses.
size() > 2) {
2201 return ORA <<
"Parallel region is used in "
2202 << (UnknownUse ?
"unknown" :
"unexpected")
2203 <<
" ways. Will not attempt to rewrite the state machine.";
2205 emitRemark<OptimizationRemarkAnalysis>(
F,
"OMP101",
Remark);
2214 return ORA <<
"Parallel region is not called from a unique kernel. "
2215 "Will not attempt to rewrite the state machine.";
2217 emitRemark<OptimizationRemarkAnalysis>(
F,
"OMP102",
Remark);
2233 for (
Use *U : ToBeReplacedStateMachineUses)
2235 ID,
U->get()->getType()));
2237 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2246struct AAICVTracker :
public StateWrapper<BooleanState, AbstractAttribute> {
2251 bool isAssumedTracked()
const {
return getAssumed(); }
2254 bool isKnownTracked()
const {
return getAssumed(); }
2263 return std::nullopt;
2269 virtual std::optional<Value *>
2277 const std::string
getName()
const override {
return "AAICVTracker"; }
2280 const char *getIdAddr()
const override {
return &
ID; }
2287 static const char ID;
2290struct AAICVTrackerFunction :
public AAICVTracker {
2292 : AAICVTracker(IRP,
A) {}
2295 const std::string getAsStr(
Attributor *)
const override {
2296 return "ICVTrackerFunction";
2300 void trackStatistics()
const override {}
2304 return ChangeStatus::UNCHANGED;
2309 InternalControlVar::ICV___last>
2310 ICVReplacementValuesMap;
2317 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2320 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2322 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2324 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2330 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2331 HasChanged = ChangeStatus::CHANGED;
2337 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2338 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2339 HasChanged = ChangeStatus::CHANGED;
2345 SetterRFI.foreachUse(TrackValues,
F);
2347 bool UsedAssumedInformation =
false;
2348 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2349 UsedAssumedInformation,
2355 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
2356 ValuesMap.insert(std::make_pair(Entry,
nullptr));
2367 const auto *CB = dyn_cast<CallBase>(&
I);
2368 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2369 CB->hasFnAttr(
"no_openmp_routines"))
2370 return std::nullopt;
2372 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2373 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2374 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2375 Function *CalledFunction = CB->getCalledFunction();
2378 if (CalledFunction ==
nullptr)
2380 if (CalledFunction == GetterRFI.Declaration)
2381 return std::nullopt;
2382 if (CalledFunction == SetterRFI.Declaration) {
2383 if (ICVReplacementValuesMap[ICV].
count(&
I))
2384 return ICVReplacementValuesMap[ICV].
lookup(&
I);
2393 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2396 if (ICVTrackingAA->isAssumedTracked()) {
2397 std::optional<Value *> URV =
2398 ICVTrackingAA->getUniqueReplacementValue(ICV);
2409 std::optional<Value *>
2411 return std::nullopt;
2418 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2419 if (ValuesMap.count(
I))
2420 return ValuesMap.lookup(
I);
2426 std::optional<Value *> ReplVal;
2428 while (!Worklist.
empty()) {
2430 if (!Visited.
insert(CurrInst).second)
2438 if (ValuesMap.count(CurrInst)) {
2439 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2442 ReplVal = NewReplVal;
2448 if (ReplVal != NewReplVal)
2454 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2460 ReplVal = NewReplVal;
2466 if (ReplVal != NewReplVal)
2471 if (CurrBB ==
I->getParent() && ReplVal)
2476 if (
const Instruction *Terminator = Pred->getTerminator())
2484struct AAICVTrackerFunctionReturned : AAICVTracker {
2486 : AAICVTracker(IRP,
A) {}
2489 const std::string getAsStr(
Attributor *)
const override {
2490 return "ICVTrackerFunctionReturned";
2494 void trackStatistics()
const override {}
2498 return ChangeStatus::UNCHANGED;
2503 InternalControlVar::ICV___last>
2504 ICVReplacementValuesMap;
2507 std::optional<Value *>
2509 return ICVReplacementValuesMap[ICV];
2514 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2517 if (!ICVTrackingAA->isAssumedTracked())
2518 return indicatePessimisticFixpoint();
2521 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2522 std::optional<Value *> UniqueICVValue;
2525 std::optional<Value *> NewReplVal =
2526 ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
2529 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2532 UniqueICVValue = NewReplVal;
2537 bool UsedAssumedInformation =
false;
2538 if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
2539 UsedAssumedInformation,
2541 UniqueICVValue =
nullptr;
2543 if (UniqueICVValue == ReplVal)
2546 ReplVal = UniqueICVValue;
2547 Changed = ChangeStatus::CHANGED;
2554struct AAICVTrackerCallSite : AAICVTracker {
2556 : AAICVTracker(IRP,
A) {}
2559 assert(getAnchorScope() &&
"Expected anchor function");
2563 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2565 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2566 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2567 if (Getter.Declaration == getAssociatedFunction()) {
2568 AssociatedICV = ICVInfo.Kind;
2574 indicatePessimisticFixpoint();
2578 if (!ReplVal || !*ReplVal)
2579 return ChangeStatus::UNCHANGED;
2582 A.deleteAfterManifest(*getCtxI());
2584 return ChangeStatus::CHANGED;
2588 const std::string getAsStr(
Attributor *)
const override {
2589 return "ICVTrackerCallSite";
2593 void trackStatistics()
const override {}
2596 std::optional<Value *> ReplVal;
2599 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2603 if (!ICVTrackingAA->isAssumedTracked())
2604 return indicatePessimisticFixpoint();
2606 std::optional<Value *> NewReplVal =
2607 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
2609 if (ReplVal == NewReplVal)
2610 return ChangeStatus::UNCHANGED;
2612 ReplVal = NewReplVal;
2613 return ChangeStatus::CHANGED;
2618 std::optional<Value *>
2624struct AAICVTrackerCallSiteReturned : AAICVTracker {
2626 : AAICVTracker(IRP,
A) {}
2629 const std::string getAsStr(
Attributor *)
const override {
2630 return "ICVTrackerCallSiteReturned";
2634 void trackStatistics()
const override {}
2638 return ChangeStatus::UNCHANGED;
2643 InternalControlVar::ICV___last>
2644 ICVReplacementValuesMap;
2648 std::optional<Value *>
2650 return ICVReplacementValuesMap[ICV];
2655 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2657 DepClassTy::REQUIRED);
2660 if (!ICVTrackingAA->isAssumedTracked())
2661 return indicatePessimisticFixpoint();
2664 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2665 std::optional<Value *> NewReplVal =
2666 ICVTrackingAA->getUniqueReplacementValue(ICV);
2668 if (ReplVal == NewReplVal)
2671 ReplVal = NewReplVal;
2672 Changed = ChangeStatus::CHANGED;
2680static bool hasFunctionEndAsUniqueSuccessor(
const BasicBlock *BB) {
2686 return hasFunctionEndAsUniqueSuccessor(
Successor);
2693 ~AAExecutionDomainFunction() {
delete RPOT; }
2697 assert(
F &&
"Expected anchor function");
2702 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2703 for (
auto &It : BEDMap) {
2707 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2708 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2709 It.getSecond().IsReachingAlignedBarrierOnly;
2711 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2712 std::to_string(AlignedBlocks) +
" of " +
2713 std::to_string(TotalBlocks) +
2714 " executed by initial thread / aligned";
2726 << BB.
getName() <<
" is executed by a single thread.\n";
2736 auto HandleAlignedBarrier = [&](
CallBase *CB) {
2737 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2738 if (!ED.IsReachedFromAlignedBarrierOnly ||
2739 ED.EncounteredNonLocalSideEffect)
2741 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2752 DeletedBarriers.
insert(CB);
2753 A.deleteAfterManifest(*CB);
2754 ++NumBarriersEliminated;
2756 }
else if (!ED.AlignedBarriers.empty()) {
2759 ED.AlignedBarriers.end());
2761 while (!Worklist.
empty()) {
2763 if (!Visited.
insert(LastCB))
2767 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2769 if (!DeletedBarriers.
count(LastCB)) {
2770 ++NumBarriersEliminated;
2771 A.deleteAfterManifest(*LastCB);
2777 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2778 Worklist.
append(LastED.AlignedBarriers.begin(),
2779 LastED.AlignedBarriers.end());
2785 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2786 for (
auto *AssumeCB : ED.EncounteredAssumes)
2787 A.deleteAfterManifest(*AssumeCB);
2790 for (
auto *CB : AlignedBarriers)
2791 HandleAlignedBarrier(CB);
2795 HandleAlignedBarrier(
nullptr);
2807 mergeInPredecessorBarriersAndAssumptions(
Attributor &
A, ExecutionDomainTy &ED,
2808 const ExecutionDomainTy &PredED);
2813 bool mergeInPredecessor(
Attributor &
A, ExecutionDomainTy &ED,
2814 const ExecutionDomainTy &PredED,
2815 bool InitialEdgeOnly =
false);
2818 bool handleCallees(
Attributor &
A, ExecutionDomainTy &EntryBBED);
2828 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2829 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2834 assert(
I.getFunction() == getAnchorScope() &&
2835 "Instruction is out of scope!");
2839 bool ForwardIsOk =
true;
2845 auto *CB = dyn_cast<CallBase>(CurI);
2848 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2850 const auto &It = CEDMap.find({CB, PRE});
2851 if (It == CEDMap.end())
2853 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2854 ForwardIsOk =
false;
2858 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2859 ForwardIsOk =
false;
2864 auto *CB = dyn_cast<CallBase>(CurI);
2867 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2869 const auto &It = CEDMap.find({CB, POST});
2870 if (It == CEDMap.end())
2872 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2885 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2887 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2899 "No request should be made against an invalid state!");
2900 return BEDMap.lookup(&BB);
2902 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2905 "No request should be made against an invalid state!");
2906 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2910 "No request should be made against an invalid state!");
2911 return InterProceduralED;
2925 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2933 if (
C->isAllOnesValue()) {
2934 auto *CB = dyn_cast<CallBase>(
Cmp->getOperand(0));
2935 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2936 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2937 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2943 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2949 if (
auto *
II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2950 if (
II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2954 if (
auto *
II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2955 if (
II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2963 ExecutionDomainTy InterProceduralED;
2975 static bool setAndRecord(
bool &R,
bool V) {
2986void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2987 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
2988 for (
auto *EA : PredED.EncounteredAssumes)
2989 ED.addAssumeInst(
A, *EA);
2991 for (
auto *AB : PredED.AlignedBarriers)
2992 ED.addAlignedBarrier(
A, *AB);
2995bool AAExecutionDomainFunction::mergeInPredecessor(
2996 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
2997 bool InitialEdgeOnly) {
2999 bool Changed =
false;
3001 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3002 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3003 ED.IsExecutedByInitialThreadOnly));
3005 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3006 ED.IsReachedFromAlignedBarrierOnly &&
3007 PredED.IsReachedFromAlignedBarrierOnly);
3008 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3009 ED.EncounteredNonLocalSideEffect |
3010 PredED.EncounteredNonLocalSideEffect);
3012 if (ED.IsReachedFromAlignedBarrierOnly)
3013 mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
3015 ED.clearAssumeInstAndAlignedBarriers();
3019bool AAExecutionDomainFunction::handleCallees(
Attributor &
A,
3020 ExecutionDomainTy &EntryBBED) {
3025 DepClassTy::OPTIONAL);
3026 if (!EDAA || !EDAA->getState().isValidState())
3029 EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
3033 ExecutionDomainTy ExitED;
3034 bool AllCallSitesKnown;
3035 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3037 AllCallSitesKnown)) {
3038 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3039 mergeInPredecessor(
A, EntryBBED, CSInED);
3040 ExitED.IsReachingAlignedBarrierOnly &=
3041 CSOutED.IsReachingAlignedBarrierOnly;
3048 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3049 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3050 EntryBBED.EncounteredNonLocalSideEffect =
false;
3051 ExitED.IsReachingAlignedBarrierOnly =
false;
3053 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3054 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3055 EntryBBED.EncounteredNonLocalSideEffect =
true;
3056 ExitED.IsReachingAlignedBarrierOnly =
false;
3060 bool Changed =
false;
3061 auto &FnED = BEDMap[
nullptr];
3062 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3063 FnED.IsReachedFromAlignedBarrierOnly &
3064 EntryBBED.IsReachedFromAlignedBarrierOnly);
3065 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3066 FnED.IsReachingAlignedBarrierOnly &
3067 ExitED.IsReachingAlignedBarrierOnly);
3068 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3069 EntryBBED.IsExecutedByInitialThreadOnly);
3075 bool Changed =
false;
3080 auto HandleAlignedBarrier = [&](
CallBase &CB, ExecutionDomainTy &ED) {
3081 Changed |= AlignedBarriers.insert(&CB);
3083 auto &CallInED = CEDMap[{&CB, PRE}];
3084 Changed |= mergeInPredecessor(
A, CallInED, ED);
3085 CallInED.IsReachingAlignedBarrierOnly =
true;
3087 ED.EncounteredNonLocalSideEffect =
false;
3088 ED.IsReachedFromAlignedBarrierOnly =
true;
3090 ED.clearAssumeInstAndAlignedBarriers();
3091 ED.addAlignedBarrier(
A, CB);
3092 auto &CallOutED = CEDMap[{&CB, POST}];
3093 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3097 A.getAAFor<
AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3104 for (
auto &RIt : *RPOT) {
3107 bool IsEntryBB = &BB == &EntryBB;
3110 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3111 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3112 ExecutionDomainTy ED;
3115 Changed |= handleCallees(
A, ED);
3119 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3123 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3125 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3126 A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
3127 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3134 bool UsedAssumedInformation;
3135 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3136 false, DepClassTy::OPTIONAL,
3142 if (
auto *
II = dyn_cast<IntrinsicInst>(&
I)) {
3143 if (
auto *AI = dyn_cast_or_null<AssumeInst>(
II)) {
3144 ED.addAssumeInst(
A, *AI);
3148 if (
II->isAssumeLikeIntrinsic())
3152 if (
auto *FI = dyn_cast<FenceInst>(&
I)) {
3153 if (!ED.EncounteredNonLocalSideEffect) {
3155 if (ED.IsReachedFromAlignedBarrierOnly)
3160 case AtomicOrdering::NotAtomic:
3162 case AtomicOrdering::Unordered:
3164 case AtomicOrdering::Monotonic:
3166 case AtomicOrdering::Acquire:
3168 case AtomicOrdering::Release:
3170 case AtomicOrdering::AcquireRelease:
3172 case AtomicOrdering::SequentiallyConsistent:
3176 NonNoOpFences.insert(FI);
3179 auto *CB = dyn_cast<CallBase>(&
I);
3181 bool IsAlignedBarrier =
3185 AlignedBarrierLastInBlock &= IsNoSync;
3186 IsExplicitlyAligned &= IsNoSync;
3192 if (IsAlignedBarrier) {
3193 HandleAlignedBarrier(*CB, ED);
3194 AlignedBarrierLastInBlock =
true;
3195 IsExplicitlyAligned =
true;
3200 if (isa<MemIntrinsic>(&
I)) {
3201 if (!ED.EncounteredNonLocalSideEffect &&
3203 ED.EncounteredNonLocalSideEffect =
true;
3205 ED.IsReachedFromAlignedBarrierOnly =
false;
3213 auto &CallInED = CEDMap[{CB, PRE}];
3214 Changed |= mergeInPredecessor(
A, CallInED, ED);
3220 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3223 if (EDAA && EDAA->getState().isValidState()) {
3226 CalleeED.IsReachedFromAlignedBarrierOnly;
3227 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3228 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3229 ED.EncounteredNonLocalSideEffect |=
3230 CalleeED.EncounteredNonLocalSideEffect;
3232 ED.EncounteredNonLocalSideEffect =
3233 CalleeED.EncounteredNonLocalSideEffect;
3234 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3236 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3239 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3240 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3241 auto &CallOutED = CEDMap[{CB, POST}];
3242 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3247 ED.IsReachedFromAlignedBarrierOnly =
false;
3248 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3251 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3253 auto &CallOutED = CEDMap[{CB, POST}];
3254 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3257 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3271 if (MemAA && MemAA->getState().isValidState() &&
3272 MemAA->checkForAllAccessesToMemoryKind(
3277 auto &InfoCache =
A.getInfoCache();
3278 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3281 if (
auto *LI = dyn_cast<LoadInst>(&
I))
3282 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3285 if (!ED.EncounteredNonLocalSideEffect &&
3287 ED.EncounteredNonLocalSideEffect =
true;
3290 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3291 if (!isa<UnreachableInst>(BB.getTerminator()) &&
3292 !BB.getTerminator()->getNumSuccessors()) {
3294 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3296 auto &FnED = BEDMap[
nullptr];
3297 if (IsKernel && !IsExplicitlyAligned)
3298 FnED.IsReachingAlignedBarrierOnly =
false;
3299 Changed |= mergeInPredecessor(
A, FnED, ED);
3301 if (!FnED.IsReachingAlignedBarrierOnly) {
3302 IsEndAndNotReachingAlignedBarriersOnly =
true;
3303 SyncInstWorklist.
push_back(BB.getTerminator());
3304 auto &BBED = BEDMap[&BB];
3305 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3309 ExecutionDomainTy &StoredED = BEDMap[&BB];
3310 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3311 !IsEndAndNotReachingAlignedBarriersOnly;
3317 if (ED.IsExecutedByInitialThreadOnly !=
3318 StoredED.IsExecutedByInitialThreadOnly ||
3319 ED.IsReachedFromAlignedBarrierOnly !=
3320 StoredED.IsReachedFromAlignedBarrierOnly ||
3321 ED.EncounteredNonLocalSideEffect !=
3322 StoredED.EncounteredNonLocalSideEffect)
3326 StoredED = std::move(ED);
3332 while (!SyncInstWorklist.
empty()) {
3335 bool HitAlignedBarrierOrKnownEnd =
false;
3337 auto *CB = dyn_cast<CallBase>(CurInst);
3340 auto &CallOutED = CEDMap[{CB, POST}];
3341 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3342 auto &CallInED = CEDMap[{CB, PRE}];
3343 HitAlignedBarrierOrKnownEnd =
3344 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3345 if (HitAlignedBarrierOrKnownEnd)
3347 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3349 if (HitAlignedBarrierOrKnownEnd)
3353 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3355 if (!Visited.
insert(PredBB))
3357 auto &PredED = BEDMap[PredBB];
3358 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3360 SyncInstWorklist.
push_back(PredBB->getTerminator());
3363 if (SyncBB != &EntryBB)
3366 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
3369 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
3374struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
3379 static AAHeapToShared &createForPosition(
const IRPosition &IRP,
3383 virtual bool isAssumedHeapToShared(
CallBase &CB)
const = 0;
3387 virtual bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const = 0;
3390 const std::string
getName()
const override {
return "AAHeapToShared"; }
3393 const char *getIdAddr()
const override {
return &
ID; }
3402 static const char ID;
3405struct AAHeapToSharedFunction :
public AAHeapToShared {
3407 : AAHeapToShared(IRP,
A) {}
3409 const std::string getAsStr(
Attributor *)
const override {
3410 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3411 " malloc calls eligible.";
3415 void trackStatistics()
const override {}
3419 void findPotentialRemovedFreeCalls(
Attributor &
A) {
3420 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3421 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3423 PotentialRemovedFreeCalls.clear();
3427 for (
auto *U : CB->
users()) {
3429 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
3433 if (FreeCalls.
size() != 1)
3436 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
3442 indicatePessimisticFixpoint();
3446 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3447 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3448 if (!RFI.Declaration)
3453 bool &) -> std::optional<Value *> {
return nullptr; };
3456 for (
User *U : RFI.Declaration->
users())
3457 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3460 MallocCalls.insert(CB);
3465 findPotentialRemovedFreeCalls(
A);
3468 bool isAssumedHeapToShared(
CallBase &CB)
const override {
3469 return isValidState() && MallocCalls.count(&CB);
3472 bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const override {
3473 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
3477 if (MallocCalls.empty())
3478 return ChangeStatus::UNCHANGED;
3480 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3481 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3485 DepClassTy::OPTIONAL);
3490 if (HS &&
HS->isAssumedHeapToStack(*CB))
3495 for (
auto *U : CB->
users()) {
3497 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3500 if (FreeCalls.
size() != 1)
3507 <<
" with shared memory."
3508 <<
" Shared memory usage is limited to "
3514 <<
" with " << AllocSize->getZExtValue()
3515 <<
" bytes of shared memory\n");
3521 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3526 static_cast<unsigned>(AddressSpace::Shared));
3531 return OR <<
"Replaced globalized variable with "
3532 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3533 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3534 <<
"of shared memory.";
3540 "HeapToShared on allocation without alignment attribute");
3541 SharedMem->setAlignment(*Alignment);
3544 A.deleteAfterManifest(*CB);
3545 A.deleteAfterManifest(*FreeCalls.
front());
3547 SharedMemoryUsed += AllocSize->getZExtValue();
3548 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3549 Changed = ChangeStatus::CHANGED;
3556 if (MallocCalls.empty())
3557 return indicatePessimisticFixpoint();
3558 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3559 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3560 if (!RFI.Declaration)
3561 return ChangeStatus::UNCHANGED;
3565 auto NumMallocCalls = MallocCalls.size();
3568 for (
User *U : RFI.Declaration->
users()) {
3569 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3570 if (CB->getCaller() !=
F)
3572 if (!MallocCalls.count(CB))
3574 if (!isa<ConstantInt>(CB->getArgOperand(0))) {
3575 MallocCalls.remove(CB);
3580 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3581 MallocCalls.remove(CB);
3585 findPotentialRemovedFreeCalls(
A);
3587 if (NumMallocCalls != MallocCalls.size())
3588 return ChangeStatus::CHANGED;
3590 return ChangeStatus::UNCHANGED;
3598 unsigned SharedMemoryUsed = 0;
3601struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
3607 static bool requiresCalleeForCallBase() {
return false; }
3610 void trackStatistics()
const override {}
3613 const std::string getAsStr(
Attributor *)
const override {
3614 if (!isValidState())
3616 return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
3618 std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
3620 std::string(
" #PRs: ") +
3621 (ReachedKnownParallelRegions.isValidState()
3622 ? std::to_string(ReachedKnownParallelRegions.size())
3624 ", #Unknown PRs: " +
3625 (ReachedUnknownParallelRegions.isValidState()
3628 ", #Reaching Kernels: " +
3629 (ReachingKernelEntries.isValidState()
3633 (ParallelLevels.isValidState()
3636 ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
3643 const std::string
getName()
const override {
return "AAKernelInfo"; }
3646 const char *getIdAddr()
const override {
return &
ID; }
3653 static const char ID;
3658struct AAKernelInfoFunction : AAKernelInfo {
3660 : AAKernelInfo(IRP,
A) {}
3665 return GuardedInstructions;
3668 void setConfigurationOfKernelEnvironment(
ConstantStruct *ConfigC) {
3670 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3671 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
3672 KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);
3675#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3676 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3677 ConstantStruct *ConfigC = \
3678 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3679 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3680 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3681 assert(NewConfigC && "Failed to create new configuration environment"); \
3682 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3693#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3700 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3704 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3705 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3706 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3707 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3711 auto StoreCallBase = [](
Use &U,
3712 OMPInformationCache::RuntimeFunctionInfo &RFI,
3714 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3716 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3718 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3724 StoreCallBase(U, InitRFI, KernelInitCB);
3728 DeinitRFI.foreachUse(
3730 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3736 if (!KernelInitCB || !KernelDeinitCB)
3740 ReachingKernelEntries.insert(Fn);
3741 IsKernelEntry =
true;
3749 KernelConfigurationSimplifyCB =
3751 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3752 if (!isAtFixpoint()) {
3755 UsedAssumedInformation =
true;
3756 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
3761 A.registerGlobalVariableSimplificationCallback(
3762 *KernelEnvGV, KernelConfigurationSimplifyCB);
3766 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3771 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3775 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3777 setExecModeOfKernelEnvironment(AssumedExecModeC);
3784 setMinThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinThreads));
3786 setMaxThreadsOfKernelEnvironment(ConstantInt::get(Int32Ty,
MaxThreads));
3787 auto [MinTeams, MaxTeams] =
3790 setMinTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MinTeams));
3792 setMaxTeamsOfKernelEnvironment(ConstantInt::get(Int32Ty, MaxTeams));
3795 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3796 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3798 setMayUseNestedParallelismOfKernelEnvironment(
3799 AssumedMayUseNestedParallelismC);
3803 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3806 ConstantInt::get(UseGenericStateMachineC->
getIntegerType(),
false);
3807 setUseGenericStateMachineOfKernelEnvironment(
3808 AssumedUseGenericStateMachineC);
3814 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3816 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3820 auto AddDependence = [](
Attributor &
A,
const AAKernelInfo *KI,
3823 A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
3837 if (SPMDCompatibilityTracker.isValidState())
3838 return AddDependence(
A,
this, QueryingAA);
3840 if (!ReachedKnownParallelRegions.isValidState())
3841 return AddDependence(
A,
this, QueryingAA);
3847 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3848 CustomStateMachineUseCB);
3849 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3850 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3851 CustomStateMachineUseCB);
3852 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3853 CustomStateMachineUseCB);
3854 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3855 CustomStateMachineUseCB);
3859 if (SPMDCompatibilityTracker.isAtFixpoint())
3866 if (!SPMDCompatibilityTracker.isValidState())
3867 return AddDependence(
A,
this, QueryingAA);
3870 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3879 if (!SPMDCompatibilityTracker.isValidState())
3880 return AddDependence(
A,
this, QueryingAA);
3881 if (SPMDCompatibilityTracker.empty())
3882 return AddDependence(
A,
this, QueryingAA);
3883 if (!mayContainParallelRegion())
3884 return AddDependence(
A,
this, QueryingAA);
3887 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3891 static std::string sanitizeForGlobalName(std::string S) {
3895 return !((C >=
'a' && C <=
'z') || (C >=
'A' && C <=
'Z') ||
3896 (C >=
'0' && C <=
'9') || C ==
'_');
3907 if (!KernelInitCB || !KernelDeinitCB)
3908 return ChangeStatus::UNCHANGED;
3912 bool HasBuiltStateMachine =
true;
3913 if (!changeToSPMDMode(
A, Changed)) {
3915 HasBuiltStateMachine = buildCustomStateMachine(
A, Changed);
3917 HasBuiltStateMachine =
false;
3924 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3925 ExistingKernelEnvC);
3926 if (!HasBuiltStateMachine)
3927 setUseGenericStateMachineOfKernelEnvironment(
3928 OldUseGenericStateMachineVal);
3935 Changed = ChangeStatus::CHANGED;
3941 void insertInstructionGuardsHelper(
Attributor &
A) {
3942 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3944 auto CreateGuardedRegion = [&](
Instruction *RegionStartI,
3978 DT, LI, MSU,
"region.guarded.end");
3981 MSU,
"region.barrier");
3984 DT, LI, MSU,
"region.exit");
3986 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU,
"region.guarded");
3989 "Expected a different CFG");
3992 ParentBB, ParentBB->
getTerminator(), DT, LI, MSU,
"region.check.tid");
3995 A.registerManifestAddedBasicBlock(*RegionEndBB);
3996 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
3997 A.registerManifestAddedBasicBlock(*RegionExitBB);
3998 A.registerManifestAddedBasicBlock(*RegionStartBB);
3999 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4001 bool HasBroadcastValues =
false;
4006 for (
Use &U :
I.uses()) {
4012 if (OutsideUses.
empty())
4015 HasBroadcastValues =
true;
4020 M,
I.getType(),
false,
4022 sanitizeForGlobalName(
4023 (
I.getName() +
".guarded.output.alloc").str()),
4025 static_cast<unsigned>(AddressSpace::Shared));
4032 I.getType(), SharedMem,
I.getName() +
".guarded.output.load",
4036 for (
Use *U : OutsideUses)
4037 A.changeUseAfterManifest(*U, *LoadI);
4040 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4046 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4047 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4050 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4052 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4058 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4059 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4061 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4062 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4064 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4066 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4067 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4068 OMPInfoCache.OMPBuilder.Builder
4069 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4075 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4076 M, OMPRTL___kmpc_barrier_simple_spmd);
4077 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4080 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4082 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4085 if (HasBroadcastValues) {
4090 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4094 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4096 for (
Instruction *GuardedI : SPMDCompatibilityTracker) {
4098 if (!Visited.
insert(BB).second)
4104 while (++IP != IPEnd) {
4105 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4108 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4110 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4111 LastEffect =
nullptr;
4118 for (
auto &Reorder : Reorders)
4124 for (
Instruction *GuardedI : SPMDCompatibilityTracker) {
4126 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4129 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4130 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
4132 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4135 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4139 if (SPMDCompatibilityTracker.contains(&
I)) {
4140 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4141 if (GuardedRegionStart)
4142 GuardedRegionEnd = &
I;
4144 GuardedRegionStart = GuardedRegionEnd = &
I;
4151 if (GuardedRegionStart) {
4153 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4154 GuardedRegionStart =
nullptr;
4155 GuardedRegionEnd =
nullptr;
4160 for (
auto &GR : GuardedRegions)
4161 CreateGuardedRegion(GR.first, GR.second);
4164 void forceSingleThreadPerWorkgroupHelper(
Attributor &
A) {
4173 auto &Ctx = getAnchorValue().getContext();
4180 KernelInitCB->
getNextNode(),
"main.thread.user_code");
4185 A.registerManifestAddedBasicBlock(*InitBB);
4186 A.registerManifestAddedBasicBlock(*UserCodeBB);
4187 A.registerManifestAddedBasicBlock(*ReturnBB);
4196 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4198 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4199 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4204 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4210 ConstantInt::get(ThreadIdInBlock->
getType(), 0),
4211 "thread.is_main", InitBB);
4217 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4220 if (!OMPInfoCache.runtimeFnsAvailable(
4221 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
4222 OMPRTL___kmpc_barrier_simple_spmd}))
4225 if (!SPMDCompatibilityTracker.isAssumed()) {
4226 for (
Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4227 if (!NonCompatibleI)
4231 if (
auto *CB = dyn_cast<CallBase>(NonCompatibleI))
4232 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4236 ORA <<
"Value has potential side effects preventing SPMD-mode "
4238 if (isa<CallBase>(NonCompatibleI)) {
4239 ORA <<
". Add `[[omp::assume(\"ompx_spmd_amenable\")]]` to "
4240 "the called function to override";
4248 << *NonCompatibleI <<
"\n");
4260 Kernel = CB->getCaller();
4268 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4274 Changed = ChangeStatus::CHANGED;
4278 if (mayContainParallelRegion())
4279 insertInstructionGuardsHelper(
A);
4281 forceSingleThreadPerWorkgroupHelper(
A);
4286 "Initially non-SPMD kernel has SPMD exec mode!");
4287 setExecModeOfKernelEnvironment(
4291 ++NumOpenMPTargetRegionKernelsSPMD;
4294 return OR <<
"Transformed generic-mode kernel to SPMD-mode.";
4306 if (!ReachedKnownParallelRegions.isValidState())
4309 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4310 if (!OMPInfoCache.runtimeFnsAvailable(
4311 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4312 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4313 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4324 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4325 ExistingKernelEnvC);
4327 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4332 if (UseStateMachineC->
isZero() ||
4336 Changed = ChangeStatus::CHANGED;
4339 setUseGenericStateMachineOfKernelEnvironment(
4346 if (!mayContainParallelRegion()) {
4347 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4350 return OR <<
"Removing unused state machine from generic-mode kernel.";
4358 if (ReachedUnknownParallelRegions.empty()) {
4359 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4362 return OR <<
"Rewriting generic-mode kernel with a customized state "
4367 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4370 return OR <<
"Generic-mode kernel is executed with a customized state "
4371 "machine that requires a fallback.";
4376 for (
CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4377 if (!UnknownParallelRegionCB)
4380 return ORA <<
"Call may contain unknown parallel regions. Use "
4381 <<
"`[[omp::assume(\"omp_no_parallelism\")]]` to "
4419 auto &Ctx = getAnchorValue().getContext();
4423 BasicBlock *InitBB = KernelInitCB->getParent();
4425 KernelInitCB->getNextNode(),
"thread.user_code.check");
4429 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4431 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4433 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4436 Kernel, UserCodeEntryBB);
4439 Kernel, UserCodeEntryBB);
4441 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4442 A.registerManifestAddedBasicBlock(*InitBB);
4443 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4444 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4445 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4446 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4452 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4458 ConstantInt::get(KernelInitCB->getType(), -1),
4459 "thread.is_worker", InitBB);
4465 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4466 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4468 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4469 M, OMPRTL___kmpc_get_warp_size);
4472 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4476 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4479 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4483 "thread.is_main_or_worker", IsWorkerCheckBB);
4486 IsMainOrWorker, IsWorkerCheckBB);
4490 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4492 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4496 OMPInfoCache.OMPBuilder.updateToLocation(
4499 StateMachineBeginBB->
end()),
4502 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4503 Value *GTid = KernelInitCB;
4506 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4507 M, OMPRTL___kmpc_barrier_simple_generic);
4510 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4514 (
unsigned int)AddressSpace::Generic) {
4516 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4517 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4522 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4523 M, OMPRTL___kmpc_kernel_parallel);
4525 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4526 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4529 StateMachineBeginBB);
4539 StateMachineBeginBB);
4540 IsDone->setDebugLoc(DLoc);
4542 IsDone, StateMachineBeginBB)
4546 StateMachineDoneBarrierBB, IsActiveWorker,
4547 StateMachineIsActiveCheckBB)
4553 const unsigned int WrapperFunctionArgNo = 6;
4558 for (
int I = 0, E = ReachedKnownParallelRegions.size();
I < E; ++
I) {
4559 auto *CB = ReachedKnownParallelRegions[
I];
4560 auto *ParallelRegion = dyn_cast<Function>(
4561 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4563 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4564 StateMachineEndParallelBB);
4566 ->setDebugLoc(DLoc);
4572 Kernel, StateMachineEndParallelBB);
4573 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4574 A.registerManifestAddedBasicBlock(*PRNextBB);
4579 if (
I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4582 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4590 StateMachineIfCascadeCurrentBB)
4592 StateMachineIfCascadeCurrentBB = PRNextBB;
4598 if (!ReachedUnknownParallelRegions.empty()) {
4599 StateMachineIfCascadeCurrentBB->
setName(
4600 "worker_state_machine.parallel_region.fallback.execute");
4602 StateMachineIfCascadeCurrentBB)
4603 ->setDebugLoc(DLoc);
4606 StateMachineIfCascadeCurrentBB)
4610 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4611 M, OMPRTL___kmpc_kernel_end_parallel);
4614 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4620 ->setDebugLoc(DLoc);
4630 KernelInfoState StateBefore = getState();
4636 struct UpdateKernelEnvCRAII {
4637 AAKernelInfoFunction &AA;
4639 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4641 ~UpdateKernelEnvCRAII() {
4648 if (!AA.isValidState()) {
4649 AA.KernelEnvC = ExistingKernelEnvC;
4653 if (!AA.ReachedKnownParallelRegions.isValidState())
4654 AA.setUseGenericStateMachineOfKernelEnvironment(
4655 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4656 ExistingKernelEnvC));
4658 if (!AA.SPMDCompatibilityTracker.isValidState())
4659 AA.setExecModeOfKernelEnvironment(
4660 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4663 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4665 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4666 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4667 AA.setMayUseNestedParallelismOfKernelEnvironment(
4668 NewMayUseNestedParallelismC);
4675 if (isa<CallBase>(
I))
4678 if (!
I.mayWriteToMemory())
4680 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
4683 DepClassTy::OPTIONAL);
4686 DepClassTy::OPTIONAL);
4687 if (UnderlyingObjsAA &&
4688 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4689 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4693 auto *CB = dyn_cast<CallBase>(&Obj);
4694 return CB && HS && HS->isAssumedHeapToStack(*CB);
4700 SPMDCompatibilityTracker.insert(&
I);
4704 bool UsedAssumedInformationInCheckRWInst =
false;
4705 if (!SPMDCompatibilityTracker.isAtFixpoint())
4706 if (!
A.checkForAllReadWriteInstructions(
4707 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4710 bool UsedAssumedInformationFromReachingKernels =
false;
4711 if (!IsKernelEntry) {
4712 updateParallelLevels(
A);
4714 bool AllReachingKernelsKnown =
true;
4715 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4716 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4718 if (!SPMDCompatibilityTracker.empty()) {
4719 if (!ParallelLevels.isValidState())
4720 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4721 else if (!ReachingKernelEntries.isValidState())
4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4728 for (
auto *
Kernel : ReachingKernelEntries) {
4729 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4731 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4732 CBAA->SPMDCompatibilityTracker.isAssumed())
4736 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4737 UsedAssumedInformationFromReachingKernels =
true;
4739 if (SPMD != 0 &&
Generic != 0)
4740 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4746 bool AllParallelRegionStatesWereFixed =
true;
4747 bool AllSPMDStatesWereFixed =
true;
4749 auto &CB = cast<CallBase>(
I);
4750 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4754 getState() ^= CBAA->getState();
4755 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4756 AllParallelRegionStatesWereFixed &=
4757 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4758 AllParallelRegionStatesWereFixed &=
4759 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4763 bool UsedAssumedInformationInCheckCallInst =
false;
4764 if (!
A.checkForAllCallLikeInstructions(
4765 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4767 <<
"Failed to visit all call-like instructions!\n";);
4768 return indicatePessimisticFixpoint();
4773 if (!UsedAssumedInformationInCheckCallInst &&
4774 AllParallelRegionStatesWereFixed) {
4775 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4776 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4781 if (!UsedAssumedInformationInCheckRWInst &&
4782 !UsedAssumedInformationInCheckCallInst &&
4783 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4784 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4786 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4787 : ChangeStatus::CHANGED;
4793 bool &AllReachingKernelsKnown) {
4797 assert(Caller &&
"Caller is nullptr");
4799 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4801 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4802 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4808 ReachingKernelEntries.indicatePessimisticFixpoint();
4813 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4815 AllReachingKernelsKnown))
4816 ReachingKernelEntries.indicatePessimisticFixpoint();
4821 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4822 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4823 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
4828 assert(Caller &&
"Caller is nullptr");
4832 if (CAA && CAA->ParallelLevels.isValidState()) {
4838 if (Caller == Parallel51RFI.Declaration) {
4839 ParallelLevels.indicatePessimisticFixpoint();
4843 ParallelLevels ^= CAA->ParallelLevels;
4850 ParallelLevels.indicatePessimisticFixpoint();
4855 bool AllCallSitesKnown =
true;
4856 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4859 ParallelLevels.indicatePessimisticFixpoint();
4866struct AAKernelInfoCallSite : AAKernelInfo {
4868 : AAKernelInfo(IRP,
A) {}
4872 AAKernelInfo::initialize(
A);
4874 CallBase &CB = cast<CallBase>(getAssociatedValue());
4879 if (AssumptionAA && AssumptionAA->hasAssumption(
"ompx_spmd_amenable")) {
4880 indicateOptimisticFixpoint();
4888 indicateOptimisticFixpoint();
4897 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4898 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4899 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4901 if (!Callee || !
A.isFunctionIPOAmendable(*Callee)) {
4905 if (!AssumptionAA ||
4906 !(AssumptionAA->hasAssumption(
"omp_no_openmp") ||
4907 AssumptionAA->hasAssumption(
"omp_no_parallelism")))
4908 ReachedUnknownParallelRegions.insert(&CB);
4912 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4913 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4914 SPMDCompatibilityTracker.insert(&CB);
4919 indicateOptimisticFixpoint();
4925 if (NumCallees > 1) {
4926 indicatePessimisticFixpoint();
4933 case OMPRTL___kmpc_is_spmd_exec_mode:
4934 case OMPRTL___kmpc_distribute_static_fini:
4935 case OMPRTL___kmpc_for_static_fini:
4936 case OMPRTL___kmpc_global_thread_num:
4937 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4938 case OMPRTL___kmpc_get_hardware_num_blocks:
4939 case OMPRTL___kmpc_single:
4940 case OMPRTL___kmpc_end_single:
4941 case OMPRTL___kmpc_master:
4942 case OMPRTL___kmpc_end_master:
4943 case OMPRTL___kmpc_barrier:
4944 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4945 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4946 case OMPRTL___kmpc_error:
4947 case OMPRTL___kmpc_flush:
4948 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4949 case OMPRTL___kmpc_get_warp_size:
4950 case OMPRTL_omp_get_thread_num:
4951 case OMPRTL_omp_get_num_threads:
4952 case OMPRTL_omp_get_max_threads:
4953 case OMPRTL_omp_in_parallel:
4954 case OMPRTL_omp_get_dynamic:
4955 case OMPRTL_omp_get_cancellation:
4956 case OMPRTL_omp_get_nested:
4957 case OMPRTL_omp_get_schedule:
4958 case OMPRTL_omp_get_thread_limit:
4959 case OMPRTL_omp_get_supported_active_levels:
4960 case OMPRTL_omp_get_max_active_levels:
4961 case OMPRTL_omp_get_level:
4962 case OMPRTL_omp_get_ancestor_thread_num:
4963 case OMPRTL_omp_get_team_size:
4964 case OMPRTL_omp_get_active_level:
4965 case OMPRTL_omp_in_final:
4966 case OMPRTL_omp_get_proc_bind:
4967 case OMPRTL_omp_get_num_places:
4968 case OMPRTL_omp_get_num_procs:
4969 case OMPRTL_omp_get_place_proc_ids:
4970 case OMPRTL_omp_get_place_num:
4971 case OMPRTL_omp_get_partition_num_places:
4972 case OMPRTL_omp_get_partition_place_nums:
4973 case OMPRTL_omp_get_wtime:
4975 case OMPRTL___kmpc_distribute_static_init_4:
4976 case OMPRTL___kmpc_distribute_static_init_4u:
4977 case OMPRTL___kmpc_distribute_static_init_8:
4978 case OMPRTL___kmpc_distribute_static_init_8u:
4979 case OMPRTL___kmpc_for_static_init_4:
4980 case OMPRTL___kmpc_for_static_init_4u:
4981 case OMPRTL___kmpc_for_static_init_8:
4982 case OMPRTL___kmpc_for_static_init_8u: {
4984 unsigned ScheduleArgOpNo = 2;
4985 auto *ScheduleTypeCI =
4987 unsigned ScheduleTypeVal =
4988 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4990 case OMPScheduleType::UnorderedStatic:
4991 case OMPScheduleType::UnorderedStaticChunked:
4992 case OMPScheduleType::OrderedDistribute:
4993 case OMPScheduleType::OrderedDistributeChunked:
4996 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4997 SPMDCompatibilityTracker.insert(&CB);
5001 case OMPRTL___kmpc_target_init:
5004 case OMPRTL___kmpc_target_deinit:
5005 KernelDeinitCB = &CB;
5007 case OMPRTL___kmpc_parallel_51:
5008 if (!handleParallel51(
A, CB))
5009 indicatePessimisticFixpoint();
5011 case OMPRTL___kmpc_omp_task:
5013 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5014 SPMDCompatibilityTracker.insert(&CB);
5015 ReachedUnknownParallelRegions.insert(&CB);
5017 case OMPRTL___kmpc_alloc_shared:
5018 case OMPRTL___kmpc_free_shared:
5024 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5025 SPMDCompatibilityTracker.insert(&CB);
5031 indicateOptimisticFixpoint();
5035 A.getAAFor<
AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5036 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5037 CheckCallee(getAssociatedFunction(), 1);
5040 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5041 for (
auto *Callee : OptimisticEdges) {
5042 CheckCallee(Callee, OptimisticEdges.size());
5053 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5054 KernelInfoState StateBefore = getState();
5056 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5057 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5061 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5064 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5066 return indicatePessimisticFixpoint();
5067 if (getState() == FnAA->getState())
5068 return ChangeStatus::UNCHANGED;
5069 getState() = FnAA->getState();
5070 return ChangeStatus::CHANGED;
5073 return indicatePessimisticFixpoint();
5075 CallBase &CB = cast<CallBase>(getAssociatedValue());
5076 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
5077 if (!handleParallel51(
A, CB))
5078 return indicatePessimisticFixpoint();
5079 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5080 : ChangeStatus::CHANGED;
5086 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5087 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5088 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5092 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5100 case OMPRTL___kmpc_alloc_shared:
5101 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5102 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5103 SPMDCompatibilityTracker.insert(&CB);
5105 case OMPRTL___kmpc_free_shared:
5106 if ((!HeapToStackAA ||
5107 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5109 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5110 SPMDCompatibilityTracker.insert(&CB);
5113 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5114 SPMDCompatibilityTracker.insert(&CB);
5116 return ChangeStatus::CHANGED;
5120 A.getAAFor<
AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5121 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5122 if (
Function *
F = getAssociatedFunction())
5126 for (
auto *Callee : OptimisticEdges) {
5127 CheckCallee(Callee, OptimisticEdges.size());
5133 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5134 : ChangeStatus::CHANGED;
5140 const unsigned int NonWrapperFunctionArgNo = 5;
5141 const unsigned int WrapperFunctionArgNo = 6;
5142 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5143 ? NonWrapperFunctionArgNo
5144 : WrapperFunctionArgNo;
5146 auto *ParallelRegion = dyn_cast<Function>(
5148 if (!ParallelRegion)
5151 ReachedKnownParallelRegions.insert(&CB);
5153 auto *FnAA =
A.getAAFor<AAKernelInfo>(
5155 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5156 !FnAA->ReachedKnownParallelRegions.empty() ||
5157 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5158 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5159 !FnAA->ReachedUnknownParallelRegions.empty();
5164struct AAFoldRuntimeCall
5165 :
public StateWrapper<BooleanState, AbstractAttribute> {
5171 void trackStatistics()
const override {}
5174 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
5178 const std::string
getName()
const override {
return "AAFoldRuntimeCall"; }
5181 const char *getIdAddr()
const override {
return &
ID; }
5189 static const char ID;
5192struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5194 : AAFoldRuntimeCall(IRP,
A) {}
5197 const std::string getAsStr(
Attributor *)
const override {
5198 if (!isValidState())
5201 std::string Str(
"simplified value: ");
5203 if (!SimplifiedValue)
5204 return Str + std::string(
"none");
5206 if (!*SimplifiedValue)
5207 return Str + std::string(
"nullptr");
5209 if (
ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
5210 return Str + std::to_string(CI->getSExtValue());
5212 return Str + std::string(
"unknown");
5217 indicatePessimisticFixpoint();
5221 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5222 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5223 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5224 "Expected a known OpenMP runtime function");
5226 RFKind = It->getSecond();
5228 CallBase &CB = cast<CallBase>(getAssociatedValue());
5229 A.registerSimplificationCallback(
5232 bool &UsedAssumedInformation) -> std::optional<Value *> {
5233 assert((isValidState() ||
5234 (SimplifiedValue && *SimplifiedValue ==
nullptr)) &&
5235 "Unexpected invalid state!");
5237 if (!isAtFixpoint()) {
5238 UsedAssumedInformation =
true;
5240 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
5242 return SimplifiedValue;
5249 case OMPRTL___kmpc_is_spmd_exec_mode:
5250 Changed |= foldIsSPMDExecMode(
A);
5252 case OMPRTL___kmpc_parallel_level:
5253 Changed |= foldParallelLevel(
A);
5255 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5256 Changed = Changed | foldKernelFnAttribute(
A,
"omp_target_thread_limit");
5258 case OMPRTL___kmpc_get_hardware_num_blocks:
5259 Changed = Changed | foldKernelFnAttribute(
A,
"omp_target_num_teams");
5271 if (SimplifiedValue && *SimplifiedValue) {
5274 A.deleteAfterManifest(
I);
5278 if (
auto *
C = dyn_cast<ConstantInt>(*SimplifiedValue))
5279 return OR <<
"Replacing OpenMP runtime call "
5281 <<
ore::NV(
"FoldedValue",
C->getZExtValue()) <<
".";
5282 return OR <<
"Replacing OpenMP runtime call "
5290 << **SimplifiedValue <<
"\n");
5292 Changed = ChangeStatus::CHANGED;
5299 SimplifiedValue =
nullptr;
5300 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5306 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5308 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5309 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5310 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5313 if (!CallerKernelInfoAA ||
5314 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5315 return indicatePessimisticFixpoint();
5317 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5319 DepClassTy::REQUIRED);
5321 if (!AA || !AA->isValidState()) {
5322 SimplifiedValue =
nullptr;
5323 return indicatePessimisticFixpoint();
5326 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5327 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5332 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5333 ++KnownNonSPMDCount;
5335 ++AssumedNonSPMDCount;
5339 if ((AssumedSPMDCount + KnownSPMDCount) &&
5340 (AssumedNonSPMDCount + KnownNonSPMDCount))
5341 return indicatePessimisticFixpoint();
5343 auto &Ctx = getAnchorValue().getContext();
5344 if (KnownSPMDCount || AssumedSPMDCount) {
5345 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5346 "Expected only SPMD kernels!");
5350 }
else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5351 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5352 "Expected only non-SPMD kernels!");
5360 assert(!SimplifiedValue &&
"SimplifiedValue should be none");
5363 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5364 : ChangeStatus::CHANGED;
5369 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5371 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5374 if (!CallerKernelInfoAA ||
5375 !CallerKernelInfoAA->ParallelLevels.isValidState())
5376 return indicatePessimisticFixpoint();
5378 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5379 return indicatePessimisticFixpoint();
5381 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5382 assert(!SimplifiedValue &&
5383 "SimplifiedValue should keep none at this point");
5384 return ChangeStatus::UNCHANGED;
5387 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5388 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5389 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5391 DepClassTy::REQUIRED);
5392 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5393 return indicatePessimisticFixpoint();
5395 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5396 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5401 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5402 ++KnownNonSPMDCount;
5404 ++AssumedNonSPMDCount;
5408 if ((AssumedSPMDCount + KnownSPMDCount) &&
5409 (AssumedNonSPMDCount + KnownNonSPMDCount))
5410 return indicatePessimisticFixpoint();
5412 auto &Ctx = getAnchorValue().getContext();
5416 if (AssumedSPMDCount || KnownSPMDCount) {
5417 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5418 "Expected only SPMD kernels!");
5421 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5422 "Expected only non-SPMD kernels!");
5425 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5426 : ChangeStatus::CHANGED;
5431 int32_t CurrentAttrValue = -1;
5432 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5434 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5437 if (!CallerKernelInfoAA ||
5438 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5439 return indicatePessimisticFixpoint();
5442 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5443 int32_t NextAttrVal =
K->getFnAttributeAsParsedInteger(Attr, -1);
5445 if (NextAttrVal == -1 ||
5446 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5447 return indicatePessimisticFixpoint();
5448 CurrentAttrValue = NextAttrVal;
5451 if (CurrentAttrValue != -1) {
5452 auto &Ctx = getAnchorValue().getContext();
5456 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5457 : ChangeStatus::CHANGED;
5463 std::optional<Value *> SimplifiedValue;
5473 auto &RFI = OMPInfoCache.RFIs[RF];
5475 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5478 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5480 DepClassTy::NONE,
false,
5486void OpenMPOpt::registerAAs(
bool IsModulePass) {
5496 A.getOrCreateAAFor<AAKernelInfo>(
5498 DepClassTy::NONE,
false,
5502 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5503 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5504 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5506 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5507 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5514 for (
int Idx = 0;
Idx < OMPInfoCache.ICVs.size() - 1; ++
Idx) {
5517 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5520 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5524 auto &CB = cast<CallBase>(*CI);
5527 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5531 GetterRFI.foreachUse(SCC, CreateAA);
5540 for (
auto *
F : SCC) {
5541 if (
F->isDeclaration())
5547 if (
F->hasLocalLinkage()) {
5549 const auto *CB = dyn_cast<CallBase>(U.getUser());
5550 return CB && CB->isCallee(&U) &&
5551 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5555 registerAAsForFunction(
A, *
F);
5565 if (
F.hasFnAttribute(Attribute::Convergent))
5569 if (
auto *LI = dyn_cast<LoadInst>(&
I)) {
5570 bool UsedAssumedInformation =
false;
5575 if (
auto *CI = dyn_cast<CallBase>(&
I)) {
5580 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
5584 if (
auto *FI = dyn_cast<FenceInst>(&
I)) {
5588 if (
auto *
II = dyn_cast<IntrinsicInst>(&
I)) {
5589 if (
II->getIntrinsicID() == Intrinsic::assume) {
5598const char AAICVTracker::ID = 0;
5599const char AAKernelInfo::ID = 0;
5601const char AAHeapToShared::ID = 0;
5602const char AAFoldRuntimeCall::ID = 0;
5604AAICVTracker &AAICVTracker::createForPosition(
const IRPosition &IRP,
5606 AAICVTracker *AA =
nullptr;
5614 AA =
new (
A.Allocator) AAICVTrackerFunctionReturned(IRP,
A);
5617 AA =
new (
A.Allocator) AAICVTrackerCallSiteReturned(IRP,
A);
5620 AA =
new (
A.Allocator) AAICVTrackerCallSite(IRP,
A);
5623 AA =
new (
A.Allocator) AAICVTrackerFunction(IRP,
A);
5632 AAExecutionDomainFunction *AA =
nullptr;
5642 "AAExecutionDomain can only be created for function position!");
5644 AA =
new (
A.Allocator) AAExecutionDomainFunction(IRP,
A);
5651AAHeapToShared &AAHeapToShared::createForPosition(
const IRPosition &IRP,
5653 AAHeapToSharedFunction *AA =
nullptr;
5663 "AAHeapToShared can only be created for function position!");
5665 AA =
new (
A.Allocator) AAHeapToSharedFunction(IRP,
A);
5672AAKernelInfo &AAKernelInfo::createForPosition(
const IRPosition &IRP,
5674 AAKernelInfo *AA =
nullptr;
5684 AA =
new (
A.Allocator) AAKernelInfoCallSite(IRP,
A);
5687 AA =
new (
A.Allocator) AAKernelInfoFunction(IRP,
A);
5694AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(
const IRPosition &IRP,
5696 AAFoldRuntimeCall *AA =
nullptr;
5705 llvm_unreachable(
"KernelInfo can only be created for call site position!");
5707 AA =
new (
A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP,
A);
5728 if (Kernels.contains(&
F))
5730 for (
const User *U :
F.users())
5731 if (!isa<BlockAddress>(U))
5740 return ORA <<
"Could not internalize function. "
5741 <<
"Some optimizations may not be possible. [OMP140]";
5745 bool Changed =
false;
5753 if (!
F.isDeclaration() && !Kernels.contains(&
F) && IsCalled(
F) &&
5757 }
else if (!
F.hasLocalLinkage() && !
F.hasFnAttribute(Attribute::Cold)) {
5770 if (!
F.isDeclaration() && !InternalizedMap.
lookup(&
F)) {
5789 OMPInformationCache InfoCache(M, AG,
Allocator,
nullptr, PostLink);
5791 unsigned MaxFixpointIterations =
5803 return F.hasFnAttribute(
"kernel");
5808 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5809 Changed |= OMPOpt.run(
true);
5814 if (!
F.isDeclaration() && !Kernels.contains(&
F) &&
5815 !
F.hasFnAttribute(Attribute::NoInline))
5816 F.addFnAttr(Attribute::AlwaysInline);
5846 Module &M = *
C.begin()->getFunction().getParent();
5869 OMPInformationCache InfoCache(*(Functions.
back()->getParent()), AG,
Allocator,
5870 &Functions, PostLink);
5872 unsigned MaxFixpointIterations =
5886 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5887 bool Changed = OMPOpt.run(
false);
5904 NamedMDNode *MD = M.getNamedMetadata(
"nvvm.annotations");
5913 MDString *KindID = dyn_cast<MDString>(
Op->getOperand(1));
5914 if (!KindID || KindID->
getString() !=
"kernel")
5918 mdconst::dyn_extract_or_null<Function>(
Op->getOperand(0));
5925 ++NumOpenMPTargetRegionKernels;
5926 Kernels.insert(KernelFn);
5928 ++NumNonOpenMPTargetRegionKernels;
5935 Metadata *MD = M.getModuleFlag(
"openmp");
5943 Metadata *MD = M.getModuleFlag(
"openmp-device");
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
Returns the sub type a function will return at a given Idx. Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines an array type that can be indexed using scoped enum values.
static void emitRemark(const Function &F, OptimizationRemarkEmitter &ORE, bool Skip)
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
Lookup helper functions.
This file provides utility analysis objects describing memory locations.
uint64_t IntrinsicInst * II
This file defines constants and helpers used when dealing with OpenMP.
This file defines constants that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicible functions on the device."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
AttributeSet getParamAttrs(unsigned ArgNo) const
The attributes for the argument or parameter at the given index are returned.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
bool isConditional() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Allocate memory in an ever growing pool, as if by bump-pointer.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the parameter attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(LazyCallGraph &LCG, LazyCallGraph::SCC &SCC, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
@ ICMP_SLT
signed less than
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
An instruction for ordering other memory operations.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
PointerType * getType() const
Global values are always pointers.
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
InsertPoint - A saved insertion point.
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
const Instruction * getPrevNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the previous non-debug instruction in the same basic block as 'this',...
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
An instruction for reading from memory.
StringRef getString() const
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
A Module instance is used to store all the information related to an LLVM module.
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
iterator_range< op_iterator > operands()
An interface to create LLVM-IR for OpenMP directives.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal=nullptr, InsertPosition InsertBefore=nullptr)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
const value_type & back() const
Return the last element of the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
The instances of the Type class are immutable: once they are created, they are never changed.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt16Ty(LLVMContext &C)
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is valid at the position of VAC, that is a constant,...
bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is a OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
std::string to_string(const T &Value)
bool operator!=(uint64_t V1, const APInt &V2)
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
Implement std::hash so that hash_code can be used in STL containers.
An abstract attribute for getting assumption information.
An abstract state for querying live call edges.
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
bool IsReachedFromAlignedBarrierOnly
bool isExecutedByInitialThreadOnly(const Instruction &I) const
Check if an instruction is executed only by the initial thread.
static AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
virtual ExecutionDomainTy getFunctionExecutionDomain() const =0
virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const =0
virtual bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const =0
Check if the instruction I is executed in an aligned region, that is, the synchronizing effects befor...
virtual bool isNoOpFence(const FenceInst &FI) const =0
Helper function to determine if FI is a no-op given the information about its execution from ExecDoma...
static const char ID
Unique ID (due to the unique address)
An abstract interface for indirect call information interference.
An abstract interface for liveness abstract attribute.
An abstract interface for all memory location attributes (readnone/argmemonly/inaccessiblememonly/ina...
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
An abstract Attribute for determining the necessity of the convergent attribute.
An abstract attribute for getting all assumption underlying objects.
Base struct for all "concrete attribute" deductions.
virtual ChangeStatus manifest(Attributor &A)
Hook for the Attributor to trigger the manifestation of the information represented by the abstract a...
virtual void initialize(Attributor &A)
Initialize the state with the information in the Attributor A.
virtual const std::string getAsStr(Attributor *A) const =0
This function should return the "summarized" assumed state as string.
virtual ChangeStatus updateImpl(Attributor &A)=0
The actual update/transfer function which has to be implemented by the derived classes.
virtual void trackStatistics() const =0
Hook to enable custom statistic tracking, called after manifest that resulted in a change if statisti...
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
virtual ChangeStatus indicatePessimisticFixpoint()=0
Indicate that the abstract state should converge to the pessimistic state.
virtual bool isAtFixpoint() const =0
Return if this abstract state is fixed, thus does not need to be updated if information changes as it...
virtual bool isValidState() const =0
Return if this abstract state is in a valid state.
virtual ChangeStatus indicateOptimisticFixpoint()=0
Indicate that the abstract state should converge to the optimistic state.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
The fixpoint analysis framework that orchestrates the attribute deduction.
static bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Constant * >(const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
std::function< std::optional< Value * >(const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Function * getAnchorScope() const
Return the Function surrounding the anchor value.
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Helper to tie a abstract state implementation to an abstract attribute.
StateType & getState() override
See AbstractAttribute::getState(...).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...