50#include "llvm/IR/IntrinsicsAMDGPU.h"
51#include "llvm/IR/IntrinsicsNVPTX.h"
67#define DEBUG_TYPE "openmp-opt"
70 "openmp-opt-disable",
cl::desc(
"Disable OpenMP specific optimizations."),
74 "openmp-opt-enable-merging",
80 cl::desc(
"Disable function internalization."),
91 "openmp-hide-memory-transfer-latency",
92 cl::desc(
"[WIP] Tries to hide the latency of host to device memory"
97 "openmp-opt-disable-deglobalization",
98 cl::desc(
"Disable OpenMP optimizations involving deglobalization."),
102 "openmp-opt-disable-spmdization",
103 cl::desc(
"Disable OpenMP optimizations involving SPMD-ization."),
107 "openmp-opt-disable-folding",
112 "openmp-opt-disable-state-machine-rewrite",
113 cl::desc(
"Disable OpenMP optimizations that replace the state machine."),
117 "openmp-opt-disable-barrier-elimination",
118 cl::desc(
"Disable OpenMP optimizations that eliminate barriers."),
122 "openmp-opt-print-module-after",
123 cl::desc(
"Print the current module after OpenMP optimizations."),
127 "openmp-opt-print-module-before",
128 cl::desc(
"Print the current module before OpenMP optimizations."),
132 "openmp-opt-inline-device",
143 cl::desc(
"Maximal number of attributor iterations."),
148 cl::desc(
"Maximum amount of shared memory to use."),
149 cl::init(std::numeric_limits<unsigned>::max()));
152 "Number of OpenMP runtime calls deduplicated");
154 "Number of OpenMP parallel regions deleted");
156 "Number of OpenMP runtime functions identified");
158 "Number of OpenMP runtime function uses identified");
160 "Number of OpenMP target region entry points (=kernels) identified");
162 "Number of non-OpenMP target region kernels identified");
164 "Number of OpenMP target region entry points (=kernels) executed in "
165 "SPMD-mode instead of generic-mode");
166STATISTIC(NumOpenMPTargetRegionKernelsWithoutStateMachine,
167 "Number of OpenMP target region entry points (=kernels) executed in "
168 "generic-mode without a state machines");
169STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback,
170 "Number of OpenMP target region entry points (=kernels) executed in "
171 "generic-mode with customized state machines with fallback");
172STATISTIC(NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback,
173 "Number of OpenMP target region entry points (=kernels) executed in "
174 "generic-mode with customized state machines without fallback");
176 NumOpenMPParallelRegionsReplacedInGPUStateMachine,
177 "Number of OpenMP parallel regions replaced with ID in GPU state machines");
179 "Number of OpenMP parallel regions merged");
181 "Amount of memory pushed to shared memory");
182STATISTIC(NumBarriersEliminated,
"Number of redundant barriers eliminated");
// Generate a `<MEMBER>Idx` constant for each kernel-environment struct member
// so the member can be addressed by index via getAggregateElement().
// Note: `constexpr` already implies `const`, so the redundant qualifier in the
// original (`constexpr const unsigned`) has been dropped.
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)                                    \
  constexpr unsigned MEMBER##Idx = IDX;
216#undef KERNEL_ENVIRONMENT_IDX
// Generate a `<MEMBER>Idx` constant for each member of the kernel-environment
// configuration struct, mirroring KERNEL_ENVIRONMENT_IDX above.
// Note: `constexpr` already implies `const`, so the redundant qualifier in the
// original (`constexpr const unsigned`) has been dropped.
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)                      \
  constexpr unsigned MEMBER##Idx = IDX;
229#undef KERNEL_ENVIRONMENT_CONFIGURATION_IDX
231#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE) \
232 RETURNTYPE *get##MEMBER##FromKernelEnvironment(ConstantStruct *KernelEnvC) { \
233 return cast<RETURNTYPE>(KernelEnvC->getAggregateElement(MEMBER##Idx)); \
239#undef KERNEL_ENVIRONMENT_GETTER
241#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER) \
242 ConstantInt *get##MEMBER##FromKernelEnvironment( \
243 ConstantStruct *KernelEnvC) { \
244 ConstantStruct *ConfigC = \
245 getConfigurationFromKernelEnvironment(KernelEnvC); \
246 return dyn_cast<ConstantInt>(ConfigC->getAggregateElement(MEMBER##Idx)); \
257#undef KERNEL_ENVIRONMENT_CONFIGURATION_GETTER
261 constexpr const int InitKernelEnvironmentArgNo = 0;
262 return cast<GlobalVariable>(
276struct AAHeapToShared;
287 OpenMPPostLink(OpenMPPostLink) {
290 OMPBuilder.initialize();
291 initializeRuntimeFunctions(M);
292 initializeInternalControlVars();
296 struct InternalControlVarInfo {
323 struct RuntimeFunctionInfo {
    /// Drop all recorded uses of this runtime function.
    void clearUsesMap() { UsesMap.clear(); }

    /// Allow `if (RFI)` checks: true iff a declaration for this runtime
    /// function was found (Declaration is non-null).
    operator bool() const { return Declaration; }
353 UseVector &getOrCreateUseVector(
Function *
F) {
354 std::shared_ptr<UseVector> &UV = UsesMap[
F];
356 UV = std::make_shared<UseVector>();
362 const UseVector *getUseVector(
Function &
F)
const {
363 auto I = UsesMap.find(&
F);
364 if (
I != UsesMap.end())
365 return I->second.get();
    /// Return the number of functions for which a use vector was recorded.
    size_t getNumFunctionsWithUses() const { return UsesMap.size(); }

    /// Return the number of argument types registered for this runtime
    /// function.
    size_t getNumArgs() const { return ArgumentTypes.size(); }
392 UseVector &UV = getOrCreateUseVector(
F);
402 while (!ToBeDeleted.
empty()) {
    /// Iterator access to the (function -> use vector) map, enabling
    /// range-based iteration over all recorded uses.
    decltype(UsesMap)::iterator begin() { return UsesMap.begin(); }
    decltype(UsesMap)::iterator end() { return UsesMap.end(); }
425 RuntimeFunction::OMPRTL___last>
433 InternalControlVar::ICV___last>
438 void initializeInternalControlVars() {
439#define ICV_RT_SET(_Name, RTL) \
441 auto &ICV = ICVs[_Name]; \
444#define ICV_RT_GET(Name, RTL) \
446 auto &ICV = ICVs[Name]; \
449#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
451 auto &ICV = ICVs[Enum]; \
454 ICV.InitKind = Init; \
455 ICV.EnvVarName = _EnvVarName; \
456 switch (ICV.InitKind) { \
457 case ICV_IMPLEMENTATION_DEFINED: \
458 ICV.InitValue = nullptr; \
461 ICV.InitValue = ConstantInt::get( \
462 Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
465 ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
471#include "llvm/Frontend/OpenMP/OMPKinds.def"
477 static bool declMatchesRTFTypes(
Function *
F,
Type *RTFRetType,
484 if (
F->getReturnType() != RTFRetType)
486 if (
F->arg_size() != RTFArgTypes.
size())
489 auto *RTFTyIt = RTFArgTypes.
begin();
491 if (Arg.getType() != *RTFTyIt)
501 unsigned collectUses(RuntimeFunctionInfo &RFI,
bool CollectStats =
true) {
502 unsigned NumUses = 0;
503 if (!RFI.Declaration)
508 NumOpenMPRuntimeFunctionsIdentified += 1;
509 NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
513 for (
Use &U : RFI.Declaration->uses()) {
514 if (
Instruction *UserI = dyn_cast<Instruction>(
U.getUser())) {
515 if (!
CGSCC ||
CGSCC->empty() ||
CGSCC->contains(UserI->getFunction())) {
516 RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
520 RFI.getOrCreateUseVector(
nullptr).push_back(&U);
529 auto &RFI = RFIs[RTF];
531 collectUses(RFI,
false);
535 void recollectUses() {
536 for (
int Idx = 0;
Idx < RFIs.size(); ++
Idx)
556 RuntimeFunctionInfo &RFI = RFIs[Fn];
558 if (RFI.Declaration && RFI.Declaration->isDeclaration())
566 void initializeRuntimeFunctions(
Module &M) {
569#define OMP_TYPE(VarName, ...) \
570 Type *VarName = OMPBuilder.VarName; \
573#define OMP_ARRAY_TYPE(VarName, ...) \
574 ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
576 PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
577 (void)VarName##PtrTy;
579#define OMP_FUNCTION_TYPE(VarName, ...) \
580 FunctionType *VarName = OMPBuilder.VarName; \
582 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
585#define OMP_STRUCT_TYPE(VarName, ...) \
586 StructType *VarName = OMPBuilder.VarName; \
588 PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
591#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
593 SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
594 Function *F = M.getFunction(_Name); \
595 RTLFunctions.insert(F); \
596 if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
597 RuntimeFunctionIDMap[F] = _Enum; \
598 auto &RFI = RFIs[_Enum]; \
601 RFI.IsVarArg = _IsVarArg; \
602 RFI.ReturnType = OMPBuilder._ReturnType; \
603 RFI.ArgumentTypes = std::move(ArgsTypes); \
604 RFI.Declaration = F; \
605 unsigned NumUses = collectUses(RFI); \
608 dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
610 if (RFI.Declaration) \
611 dbgs() << TAG << "-> got " << NumUses << " uses in " \
612 << RFI.getNumFunctionsWithUses() \
613 << " different functions.\n"; \
617#include "llvm/Frontend/OpenMP/OMPKinds.def"
623 for (
StringRef Prefix : {
"__kmpc",
"_ZN4ompx",
"omp_"})
624 if (
F.hasFnAttribute(Attribute::NoInline) &&
625 F.getName().starts_with(Prefix) &&
626 !
F.hasFnAttribute(Attribute::OptimizeNone))
627 F.removeFnAttr(Attribute::NoInline);
638 bool OpenMPPostLink =
false;
641template <
typename Ty,
bool InsertInval
idates = true>
  /// Return true if \p Elem is contained in the underlying set vector.
  bool contains(const Ty &Elem) const { return Set.contains(Elem); }
644 bool insert(
const Ty &Elem) {
645 if (InsertInvalidates)
647 return Set.insert(Elem);
  /// Random-access lookup into the underlying set vector.
  const Ty &operator[](int Idx) const { return Set[Idx]; }
651 bool operator==(
const BooleanStateWithSetVector &RHS)
const {
652 return BooleanState::operator==(RHS) && Set ==
RHS.Set;
654 bool operator!=(
const BooleanStateWithSetVector &RHS)
const {
655 return !(*
this ==
RHS);
  /// Return true if no elements have been inserted.
  bool empty() const { return Set.empty(); }
  /// Return the number of elements inserted so far.
  size_t size() const { return Set.size(); }
662 BooleanStateWithSetVector &
operator^=(
const BooleanStateWithSetVector &RHS) {
663 BooleanState::operator^=(RHS);
664 Set.insert(
RHS.Set.begin(),
RHS.Set.end());
  /// Iterator access to the underlying set vector.
  typename decltype(Set)::iterator begin() { return Set.begin(); }
  typename decltype(Set)::iterator end() { return Set.end(); }
/// Convenience alias: a BooleanStateWithSetVector that tracks pointers to
/// \p Ty instead of values.
template <typename Ty, bool InsertInvalidates = true>
using BooleanStateWithPtrSetVector =
    BooleanStateWithSetVector<Ty *, InsertInvalidates>;
685 bool IsAtFixpoint =
false;
689 BooleanStateWithPtrSetVector<
CallBase,
false>
690 ReachedKnownParallelRegions;
693 BooleanStateWithPtrSetVector<CallBase> ReachedUnknownParallelRegions;
698 BooleanStateWithPtrSetVector<Instruction, false> SPMDCompatibilityTracker;
713 bool IsKernelEntry =
false;
716 BooleanStateWithPtrSetVector<Function, false> ReachingKernelEntries;
721 BooleanStateWithSetVector<uint8_t> ParallelLevels;
724 bool NestedParallelism =
false;
729 KernelInfoState() =
default;
730 KernelInfoState(
bool BestState) {
  /// See AbstractState::isAtFixpoint().
  bool isAtFixpoint() const override { return IsAtFixpoint; }
744 ParallelLevels.indicatePessimisticFixpoint();
745 ReachingKernelEntries.indicatePessimisticFixpoint();
746 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
747 ReachedKnownParallelRegions.indicatePessimisticFixpoint();
748 ReachedUnknownParallelRegions.indicatePessimisticFixpoint();
749 NestedParallelism =
true;
756 ParallelLevels.indicateOptimisticFixpoint();
757 ReachingKernelEntries.indicateOptimisticFixpoint();
758 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
759 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
760 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
  /// Return the assumed state; this object serves as its own state, so both
  /// overloads simply return *this.
  KernelInfoState &getAssumed() { return *this; }
  const KernelInfoState &getAssumed() const { return *this; }
768 bool operator==(
const KernelInfoState &RHS)
const {
769 if (SPMDCompatibilityTracker !=
RHS.SPMDCompatibilityTracker)
771 if (ReachedKnownParallelRegions !=
RHS.ReachedKnownParallelRegions)
773 if (ReachedUnknownParallelRegions !=
RHS.ReachedUnknownParallelRegions)
775 if (ReachingKernelEntries !=
RHS.ReachingKernelEntries)
777 if (ParallelLevels !=
RHS.ParallelLevels)
779 if (NestedParallelism !=
RHS.NestedParallelism)
785 bool mayContainParallelRegion() {
786 return !ReachedKnownParallelRegions.empty() ||
787 !ReachedUnknownParallelRegions.empty();
  /// Return the best possible representable state
  /// (constructed with BestState = true).
  static KernelInfoState getBestState() { return KernelInfoState(true); }
793 static KernelInfoState getBestState(KernelInfoState &KIS) {
794 return getBestState();
  /// Return the worst possible representable state
  /// (constructed with BestState = false).
  static KernelInfoState getWorstState() { return KernelInfoState(false); }
801 KernelInfoState
operator^=(
const KernelInfoState &KIS) {
803 if (KIS.KernelInitCB) {
804 if (KernelInitCB && KernelInitCB != KIS.KernelInitCB)
807 KernelInitCB = KIS.KernelInitCB;
809 if (KIS.KernelDeinitCB) {
810 if (KernelDeinitCB && KernelDeinitCB != KIS.KernelDeinitCB)
813 KernelDeinitCB = KIS.KernelDeinitCB;
815 if (KIS.KernelEnvC) {
816 if (KernelEnvC && KernelEnvC != KIS.KernelEnvC)
819 KernelEnvC = KIS.KernelEnvC;
821 SPMDCompatibilityTracker ^= KIS.SPMDCompatibilityTracker;
822 ReachedKnownParallelRegions ^= KIS.ReachedKnownParallelRegions;
823 ReachedUnknownParallelRegions ^= KIS.ReachedUnknownParallelRegions;
824 NestedParallelism |= KIS.NestedParallelism;
828 KernelInfoState
operator&=(
const KernelInfoState &KIS) {
829 return (*
this ^= KIS);
845 OffloadArray() =
default;
852 if (!
Array.getAllocatedType()->isArrayTy())
855 if (!getValues(Array,
Before))
858 this->Array = &
Array;
  // Argument positions of the device ID and of the base-pointers, pointers,
  // and sizes arrays in the target-mapper runtime call this struct inspects
  // (used below with __tgt_target_data_begin_mapper* — confirm against the
  // runtime function's signature if it changes).
  static const unsigned DeviceIDArgNum = 1;
  static const unsigned BasePtrsArgNum = 3;
  static const unsigned PtrsArgNum = 4;
  static const unsigned SizesArgNum = 5;
873 const uint64_t NumValues =
Array.getAllocatedType()->getArrayNumElements();
874 StoredValues.
assign(NumValues,
nullptr);
875 LastAccesses.
assign(NumValues,
nullptr);
880 if (BB !=
Before.getParent())
890 if (!isa<StoreInst>(&
I))
893 auto *S = cast<StoreInst>(&
I);
900 LastAccesses[
Idx] = S;
910 const unsigned NumValues = StoredValues.
size();
911 for (
unsigned I = 0;
I < NumValues; ++
I) {
912 if (!StoredValues[
I] || !LastAccesses[
I])
922 using OptimizationRemarkGetter =
926 OptimizationRemarkGetter OREGetter,
927 OMPInformationCache &OMPInfoCache,
Attributor &A)
929 OREGetter(OREGetter), OMPInfoCache(OMPInfoCache),
A(
A) {}
932 bool remarksEnabled() {
933 auto &Ctx =
M.getContext();
934 return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(
DEBUG_TYPE);
938 bool run(
bool IsModulePass) {
942 bool Changed =
false;
948 Changed |= runAttributor(IsModulePass);
951 OMPInfoCache.recollectUses();
954 Changed |= rewriteDeviceCodeStateMachine();
956 if (remarksEnabled())
957 analysisGlobalization();
964 Changed |= runAttributor(IsModulePass);
967 OMPInfoCache.recollectUses();
969 Changed |= deleteParallelRegions();
972 Changed |= hideMemTransfersLatency();
973 Changed |= deduplicateRuntimeCalls();
975 if (mergeParallelRegions()) {
976 deduplicateRuntimeCalls();
982 if (OMPInfoCache.OpenMPPostLink)
983 Changed |= removeRuntimeSymbols();
990 void printICVs()
const {
995 for (
auto ICV : ICVs) {
996 auto ICVInfo = OMPInfoCache.ICVs[ICV];
998 return ORA <<
"OpenMP ICV " <<
ore::NV(
"OpenMPICV", ICVInfo.Name)
1000 << (ICVInfo.InitValue
1001 ?
toString(ICVInfo.InitValue->getValue(), 10,
true)
1002 :
"IMPLEMENTATION_DEFINED");
1005 emitRemark<OptimizationRemarkAnalysis>(
F,
"OpenMPICVTracker",
Remark);
1011 void printKernels()
const {
1017 return ORA <<
"OpenMP GPU kernel "
1018 <<
ore::NV(
"OpenMPGPUKernel",
F->getName()) <<
"\n";
1021 emitRemark<OptimizationRemarkAnalysis>(
F,
"OpenMPGPU",
Remark);
1027 static CallInst *getCallIfRegularCall(
1028 Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1029 CallInst *CI = dyn_cast<CallInst>(
U.getUser());
1039 static CallInst *getCallIfRegularCall(
1040 Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI =
nullptr) {
1041 CallInst *CI = dyn_cast<CallInst>(&V);
1051 bool mergeParallelRegions() {
1052 const unsigned CallbackCalleeOperand = 2;
1053 const unsigned CallbackFirstArgOperand = 3;
1057 OMPInformationCache::RuntimeFunctionInfo &RFI =
1058 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1060 if (!RFI.Declaration)
1064 OMPInformationCache::RuntimeFunctionInfo UnmergableCallsInfo[] = {
1065 OMPInfoCache.RFIs[OMPRTL___kmpc_push_proc_bind],
1066 OMPInfoCache.RFIs[OMPRTL___kmpc_push_num_threads],
1069 bool Changed =
false;
1075 BasicBlock *StartBB =
nullptr, *EndBB =
nullptr;
1076 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1077 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1079 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1080 assert(StartBB !=
nullptr &&
"StartBB should not be null");
1082 assert(EndBB !=
nullptr &&
"EndBB should not be null");
1083 EndBB->getTerminator()->setSuccessor(0, CGEndBB);
1086 auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
Value &,
1087 Value &Inner,
Value *&ReplacementValue) -> InsertPointTy {
1088 ReplacementValue = &Inner;
1092 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
1096 auto CreateSequentialRegion = [&](
Function *OuterFn,
1104 SplitBlock(ParentBB, SeqEndI->getNextNode(), DT, LI);
1108 SplitBlock(ParentBB, SeqStartI, DT, LI,
nullptr,
"seq.par.merged");
1111 "Expected a different CFG");
1115 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
1116 BasicBlock *CGStartBB = CodeGenIP.getBlock();
1118 SplitBlock(CGStartBB, &*CodeGenIP.getPoint(), DT, LI);
1119 assert(SeqStartBB !=
nullptr &&
"SeqStartBB should not be null");
1121 assert(SeqEndBB !=
nullptr &&
"SeqEndBB should not be null");
1124 auto FiniCB = [&](InsertPointTy CodeGenIP) {};
1130 for (
User *Usr :
I.users()) {
1138 OutsideUsers.
insert(&UsrI);
1141 if (OutsideUsers.
empty())
1148 I.getType(),
DL.getAllocaAddrSpace(),
nullptr,
1149 I.getName() +
".seq.output.alloc", OuterFn->
front().
begin());
1153 new StoreInst(&
I, AllocaI, SeqStartBB->getTerminator()->getIterator());
1159 I.getName() +
".seq.output.load",
1166 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
1167 InsertPointTy SeqAfterIP =
1168 OMPInfoCache.OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
1170 OMPInfoCache.OMPBuilder.createBarrier(SeqAfterIP, OMPD_parallel);
1189 assert(MergableCIs.
size() > 1 &&
"Assumed multiple mergable CIs");
1192 OR <<
"Parallel region merged with parallel region"
1193 << (MergableCIs.
size() > 2 ?
"s" :
"") <<
" at ";
1196 if (CI != MergableCIs.
back())
1202 emitRemark<OptimizationRemark>(MergableCIs.
front(),
"OMP150",
Remark);
1206 <<
" parallel regions in " << OriginalFn->
getName()
1210 EndBB =
SplitBlock(BB, MergableCIs.
back()->getNextNode(), DT, LI);
1212 SplitBlock(EndBB, &*EndBB->getFirstInsertionPt(), DT, LI);
1216 assert(BB->getUniqueSuccessor() == StartBB &&
"Expected a different CFG");
1217 const DebugLoc DL = BB->getTerminator()->getDebugLoc();
1222 for (
auto *It = MergableCIs.
begin(), *
End = MergableCIs.
end() - 1;
1231 CreateSequentialRegion(OriginalFn, BB, ForkCI->
getNextNode(),
1242 InsertPointTy AfterIP = OMPInfoCache.OMPBuilder.createParallel(
1243 Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB,
nullptr,
nullptr,
1244 OMP_PROC_BIND_default,
false);
1248 OMPInfoCache.OMPBuilder.finalize(OriginalFn);
1255 for (
auto *CI : MergableCIs) {
1257 FunctionType *FT = OMPInfoCache.OMPBuilder.ParallelTask;
1261 for (
unsigned U = CallbackFirstArgOperand, E = CI->
arg_size(); U < E;
1271 for (
unsigned U = CallbackFirstArgOperand, E = CI->
arg_size(); U < E;
1275 U - (CallbackFirstArgOperand - CallbackCalleeOperand), A);
1278 if (CI != MergableCIs.back()) {
1281 OMPInfoCache.OMPBuilder.createBarrier(
1290 assert(OutlinedFn != OriginalFn &&
"Outlining failed");
1291 CGUpdater.registerOutlinedFunction(*OriginalFn, *OutlinedFn);
1292 CGUpdater.reanalyzeFunction(*OriginalFn);
1294 NumOpenMPParallelRegionsMerged += MergableCIs.size();
1302 CallInst *CI = getCallIfRegularCall(U, &RFI);
1309 RFI.foreachUse(SCC, DetectPRsCB);
1315 for (
auto &It : BB2PRMap) {
1316 auto &CIs = It.getSecond();
1331 auto IsMergable = [&](
Instruction &
I,
bool IsBeforeMergableRegion) {
1334 if (
I.isTerminator())
1337 if (!isa<CallInst>(&
I))
1341 if (IsBeforeMergableRegion) {
1343 if (!CalledFunction)
1350 for (
const auto &RFI : UnmergableCallsInfo) {
1351 if (CalledFunction == RFI.Declaration)
1359 if (!isa<IntrinsicInst>(CI))
1370 if (CIs.count(&
I)) {
1376 if (IsMergable(
I, MergableCIs.
empty()))
1381 for (; It !=
End; ++It) {
1383 if (CIs.count(&SkipI)) {
1385 <<
" due to " <<
I <<
"\n");
1392 if (MergableCIs.
size() > 1) {
1393 MergableCIsVector.
push_back(MergableCIs);
1395 <<
" parallel regions in block " << BB->
getName()
1400 MergableCIs.
clear();
1403 if (!MergableCIsVector.
empty()) {
1406 for (
auto &MergableCIs : MergableCIsVector)
1407 Merge(MergableCIs, BB);
1408 MergableCIsVector.clear();
1415 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_fork_call);
1416 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_barrier);
1417 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_master);
1418 OMPInfoCache.recollectUsesForFunction(OMPRTL___kmpc_end_master);
1425 bool deleteParallelRegions() {
1426 const unsigned CallbackCalleeOperand = 2;
1428 OMPInformationCache::RuntimeFunctionInfo &RFI =
1429 OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
1431 if (!RFI.Declaration)
1434 bool Changed =
false;
1436 CallInst *CI = getCallIfRegularCall(U);
1439 auto *Fn = dyn_cast<Function>(
1443 if (!Fn->onlyReadsMemory())
1445 if (!Fn->hasFnAttribute(Attribute::WillReturn))
1452 return OR <<
"Removing parallel region with no side-effects.";
1454 emitRemark<OptimizationRemark>(CI,
"OMP160",
Remark);
1456 CGUpdater.removeCallSite(*CI);
1459 ++NumOpenMPParallelRegionsDeleted;
1463 RFI.foreachUse(SCC, DeleteCallCB);
1469 bool deduplicateRuntimeCalls() {
1470 bool Changed =
false;
1473 OMPRTL_omp_get_num_threads,
1474 OMPRTL_omp_in_parallel,
1475 OMPRTL_omp_get_cancellation,
1476 OMPRTL_omp_get_supported_active_levels,
1477 OMPRTL_omp_get_level,
1478 OMPRTL_omp_get_ancestor_thread_num,
1479 OMPRTL_omp_get_team_size,
1480 OMPRTL_omp_get_active_level,
1481 OMPRTL_omp_in_final,
1482 OMPRTL_omp_get_proc_bind,
1483 OMPRTL_omp_get_num_places,
1484 OMPRTL_omp_get_num_procs,
1485 OMPRTL_omp_get_place_num,
1486 OMPRTL_omp_get_partition_num_places,
1487 OMPRTL_omp_get_partition_place_nums};
1491 collectGlobalThreadIdArguments(GTIdArgs);
1493 <<
" global thread ID arguments\n");
1496 for (
auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
1497 Changed |= deduplicateRuntimeCalls(
1498 *
F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
1502 Value *GTIdArg =
nullptr;
1504 if (GTIdArgs.
count(&Arg)) {
1508 Changed |= deduplicateRuntimeCalls(
1509 *
F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
1516 bool removeRuntimeSymbols() {
1522 if (!
GV->getType()->isPointerTy())
1530 GlobalVariable *Client = dyn_cast<GlobalVariable>(
C->stripPointerCasts());
1539 GV->eraseFromParent();
1552 bool hideMemTransfersLatency() {
1553 auto &RFI = OMPInfoCache.RFIs[OMPRTL___tgt_target_data_begin_mapper];
1554 bool Changed =
false;
1556 auto *RTCall = getCallIfRegularCall(U, &RFI);
1560 OffloadArray OffloadArrays[3];
1561 if (!getValuesInOffloadArrays(*RTCall, OffloadArrays))
1564 LLVM_DEBUG(dumpValuesInOffloadArrays(OffloadArrays));
1567 bool WasSplit =
false;
1568 Instruction *WaitMovementPoint = canBeMovedDownwards(*RTCall);
1569 if (WaitMovementPoint)
1570 WasSplit = splitTargetDataBeginRTC(*RTCall, *WaitMovementPoint);
1572 Changed |= WasSplit;
1575 if (OMPInfoCache.runtimeFnsAvailable(
1576 {OMPRTL___tgt_target_data_begin_mapper_issue,
1577 OMPRTL___tgt_target_data_begin_mapper_wait}))
1578 RFI.foreachUse(SCC, SplitMemTransfers);
1583 void analysisGlobalization() {
1584 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
1586 auto CheckGlobalization = [&](
Use &
U,
Function &Decl) {
1587 if (
CallInst *CI = getCallIfRegularCall(U, &RFI)) {
1590 <<
"Found thread data sharing on the GPU. "
1591 <<
"Expect degraded performance due to data globalization.";
1593 emitRemark<OptimizationRemarkMissed>(CI,
"OMP112",
Remark);
1599 RFI.foreachUse(SCC, CheckGlobalization);
1604 bool getValuesInOffloadArrays(
CallInst &RuntimeCall,
1606 assert(OAs.
size() == 3 &&
"Need space for three offload arrays!");
1616 Value *BasePtrsArg =
1625 if (!isa<AllocaInst>(V))
1627 auto *BasePtrsArray = cast<AllocaInst>(V);
1628 if (!OAs[0].
initialize(*BasePtrsArray, RuntimeCall))
1633 if (!isa<AllocaInst>(V))
1635 auto *PtrsArray = cast<AllocaInst>(V);
1636 if (!OAs[1].
initialize(*PtrsArray, RuntimeCall))
1642 if (isa<GlobalValue>(V))
1643 return isa<Constant>(V);
1644 if (!isa<AllocaInst>(V))
1647 auto *SizesArray = cast<AllocaInst>(V);
1648 if (!OAs[2].
initialize(*SizesArray, RuntimeCall))
1659 assert(OAs.
size() == 3 &&
"There are three offload arrays to debug!");
1662 std::string ValuesStr;
1664 std::string Separator =
" --- ";
1666 for (
auto *BP : OAs[0].StoredValues) {
1673 for (
auto *
P : OAs[1].StoredValues) {
1680 for (
auto *S : OAs[2].StoredValues) {
1694 bool IsWorthIt =
false;
1717 bool splitTargetDataBeginRTC(
CallInst &RuntimeCall,
1722 auto &
IRBuilder = OMPInfoCache.OMPBuilder;
1726 Entry.getFirstNonPHIOrDbgOrAlloca());
1728 IRBuilder.AsyncInfo,
nullptr,
"handle");
1736 M, OMPRTL___tgt_target_data_begin_mapper_issue);
1740 for (
auto &Arg : RuntimeCall.
args())
1741 Args.push_back(Arg.get());
1742 Args.push_back(Handle);
1746 OMPInfoCache.setCallingConvention(IssueDecl, IssueCallsite);
1752 M, OMPRTL___tgt_target_data_begin_mapper_wait);
1754 Value *WaitParams[2] = {
1756 OffloadArray::DeviceIDArgNum),
1760 WaitDecl, WaitParams,
"", WaitMovementPoint.
getIterator());
1761 OMPInfoCache.setCallingConvention(WaitDecl, WaitCallsite);
1766 static Value *combinedIdentStruct(
Value *CurrentIdent,
Value *NextIdent,
1767 bool GlobalOnly,
bool &SingleChoice) {
1768 if (CurrentIdent == NextIdent)
1769 return CurrentIdent;
1773 if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
1774 SingleChoice = !CurrentIdent;
1786 getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
1788 bool SingleChoice =
true;
1789 Value *Ident =
nullptr;
1791 CallInst *CI = getCallIfRegularCall(U, &RFI);
1792 if (!CI || &
F != &Caller)
1795 true, SingleChoice);
1798 RFI.foreachUse(SCC, CombineIdentStruct);
1800 if (!Ident || !SingleChoice) {
1803 if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
1805 &
F.getEntryBlock(),
F.getEntryBlock().begin()));
1810 OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1811 Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc, SrcLocStrSize);
1818 bool deduplicateRuntimeCalls(
Function &
F,
1819 OMPInformationCache::RuntimeFunctionInfo &RFI,
1820 Value *ReplVal =
nullptr) {
1821 auto *UV = RFI.getUseVector(
F);
1822 if (!UV || UV->size() + (ReplVal !=
nullptr) < 2)
1826 dbgs() <<
TAG <<
"Deduplicate " << UV->size() <<
" uses of " << RFI.Name
1827 << (ReplVal ?
" with an existing value\n" :
"\n") <<
"\n");
1829 assert((!ReplVal || (isa<Argument>(ReplVal) &&
1830 cast<Argument>(ReplVal)->
getParent() == &
F)) &&
1831 "Unexpected replacement value!");
1834 auto CanBeMoved = [
this](
CallBase &CB) {
1835 unsigned NumArgs = CB.arg_size();
1838 if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
1840 for (
unsigned U = 1;
U < NumArgs; ++
U)
1841 if (isa<Instruction>(CB.getArgOperand(U)))
1852 for (
Use *U : *UV) {
1853 if (
CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
1858 if (!CanBeMoved(*CI))
1866 assert(IP &&
"Expected insertion point!");
1867 cast<Instruction>(ReplVal)->moveBefore(IP);
1873 if (
CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
1876 Value *Ident = getCombinedIdentFromCallUsesIn(RFI,
F,
1882 bool Changed =
false;
1884 CallInst *CI = getCallIfRegularCall(U, &RFI);
1885 if (!CI || CI == ReplVal || &
F != &Caller)
1890 return OR <<
"OpenMP runtime call "
1891 <<
ore::NV(
"OpenMPOptRuntime", RFI.Name) <<
" deduplicated.";
1894 emitRemark<OptimizationRemark>(CI,
"OMP170",
Remark);
1896 emitRemark<OptimizationRemark>(&
F,
"OMP170",
Remark);
1898 CGUpdater.removeCallSite(*CI);
1901 ++NumOpenMPRuntimeCallsDeduplicated;
1905 RFI.foreachUse(SCC, ReplaceAndDeleteCB);
1919 if (!
F.hasLocalLinkage())
1921 for (
Use &U :
F.uses()) {
1922 if (
CallInst *CI = getCallIfRegularCall(U)) {
1924 if (CI == &RefCI || GTIdArgs.
count(ArgOp) ||
1925 getCallIfRegularCall(
1926 *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
1935 auto AddUserArgs = [&](
Value >Id) {
1936 for (
Use &U : GTId.uses())
1937 if (
CallInst *CI = dyn_cast<CallInst>(
U.getUser()))
1940 if (CallArgOpIsGTId(*Callee,
U.getOperandNo(), *CI))
1945 OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
1946 OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
1948 GlobThreadNumRFI.foreachUse(SCC, [&](
Use &U,
Function &
F) {
1949 if (
CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
1957 for (
unsigned U = 0;
U < GTIdArgs.
size(); ++
U)
1958 AddUserArgs(*GTIdArgs[U]);
1973 return getUniqueKernelFor(*
I.getFunction());
1978 bool rewriteDeviceCodeStateMachine();
1994 template <
typename RemarkKind,
typename RemarkCallBack>
1996 RemarkCallBack &&RemarkCB)
const {
1998 auto &ORE = OREGetter(
F);
2002 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I))
2003 <<
" [" << RemarkName <<
"]";
2007 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
I)); });
2011 template <
typename RemarkKind,
typename RemarkCallBack>
2013 RemarkCallBack &&RemarkCB)
const {
2014 auto &ORE = OREGetter(
F);
2018 return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F))
2019 <<
" [" << RemarkName <<
"]";
2023 [&]() {
return RemarkCB(RemarkKind(
DEBUG_TYPE, RemarkName,
F)); });
2037 OptimizationRemarkGetter OREGetter;
2040 OMPInformationCache &OMPInfoCache;
2046 bool runAttributor(
bool IsModulePass) {
2050 registerAAs(IsModulePass);
2055 <<
" functions, result: " << Changed <<
".\n");
2057 if (Changed == ChangeStatus::CHANGED)
2058 OMPInfoCache.invalidateAnalyses();
2060 return Changed == ChangeStatus::CHANGED;
2067 void registerAAs(
bool IsModulePass);
2076 if (OMPInfoCache.CGSCC && !OMPInfoCache.CGSCC->empty() &&
2077 !OMPInfoCache.CGSCC->contains(&
F))
2082 std::optional<Kernel> &CachedKernel = UniqueKernelMap[&
F];
2084 return *CachedKernel;
2091 return *CachedKernel;
2094 CachedKernel =
nullptr;
2095 if (!
F.hasLocalLinkage()) {
2099 return ORA <<
"Potentially unknown OpenMP target region caller.";
2101 emitRemark<OptimizationRemarkAnalysis>(&
F,
"OMP100",
Remark);
2107 auto GetUniqueKernelForUse = [&](
const Use &
U) ->
Kernel {
2108 if (
auto *Cmp = dyn_cast<ICmpInst>(
U.getUser())) {
2110 if (
Cmp->isEquality())
2111 return getUniqueKernelFor(*Cmp);
2114 if (
auto *CB = dyn_cast<CallBase>(
U.getUser())) {
2116 if (CB->isCallee(&U))
2117 return getUniqueKernelFor(*CB);
2119 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2120 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2122 if (OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI))
2123 return getUniqueKernelFor(*CB);
2132 OMPInformationCache::foreachUse(
F, [&](
const Use &U) {
2133 PotentialKernels.
insert(GetUniqueKernelForUse(U));
2137 if (PotentialKernels.
size() == 1)
2138 K = *PotentialKernels.
begin();
2141 UniqueKernelMap[&
F] =
K;
2146bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
2147 OMPInformationCache::RuntimeFunctionInfo &KernelParallelRFI =
2148 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
2150 bool Changed =
false;
2151 if (!KernelParallelRFI)
2162 bool UnknownUse =
false;
2163 bool KernelParallelUse =
false;
2164 unsigned NumDirectCalls = 0;
2167 OMPInformationCache::foreachUse(*
F, [&](
Use &U) {
2168 if (
auto *CB = dyn_cast<CallBase>(
U.getUser()))
2169 if (CB->isCallee(&U)) {
2174 if (isa<ICmpInst>(
U.getUser())) {
2175 ToBeReplacedStateMachineUses.push_back(&U);
2181 OpenMPOpt::getCallIfRegularCall(*
U.getUser(), &KernelParallelRFI);
2182 const unsigned int WrapperFunctionArgNo = 6;
2183 if (!KernelParallelUse && CI &&
2185 KernelParallelUse = true;
2186 ToBeReplacedStateMachineUses.push_back(&U);
2194 if (!KernelParallelUse)
2200 if (UnknownUse || NumDirectCalls != 1 ||
2201 ToBeReplacedStateMachineUses.
size() > 2) {
2203 return ORA <<
"Parallel region is used in "
2204 << (UnknownUse ?
"unknown" :
"unexpected")
2205 <<
" ways. Will not attempt to rewrite the state machine.";
2207 emitRemark<OptimizationRemarkAnalysis>(
F,
"OMP101",
Remark);
2216 return ORA <<
"Parallel region is not called from a unique kernel. "
2217 "Will not attempt to rewrite the state machine.";
2219 emitRemark<OptimizationRemarkAnalysis>(
F,
"OMP102",
Remark);
2235 for (
Use *U : ToBeReplacedStateMachineUses)
2237 ID,
U->get()->getType()));
2239 ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
2248struct AAICVTracker :
public StateWrapper<BooleanState, AbstractAttribute> {
  /// Returns true if ICV information is assumed to be tracked.
  bool isAssumedTracked() const { return getAssumed(); }

  /// Returns true if ICV information is known to be tracked.
  /// NOTE(review): this also returns the *assumed* bit rather than a separate
  /// known bit — confirm that is intentional for this attribute.
  bool isKnownTracked() const { return getAssumed(); }
2265 return std::nullopt;
2271 virtual std::optional<Value *>
  /// See AbstractAttribute::getName().
  const std::string getName() const override { return "AAICVTracker"; }

  /// See AbstractAttribute::getIdAddr(); the address of ID uniquely
  /// identifies this attribute class.
  const char *getIdAddr() const override { return &ID; }
2289 static const char ID;
// Function-position ICV tracker: records, per ICV, the value written by each
// runtime setter call so getter calls can later be folded.
// NOTE(review): many original lines are elided by the extraction; comments
// describe only what the visible fragments establish.
2292struct AAICVTrackerFunction :
public AAICVTracker {
2294 : AAICVTracker(IRP,
A) {}
2297 const std::string getAsStr(
Attributor *)
const override {
2298 return "ICVTrackerFunction";
2302 void trackStatistics()
const override {}
2306 return ChangeStatus::UNCHANGED;
// One replacement-value map per ICV (indexed up to ICV___last).
2311 InternalControlVar::ICV___last>
2312 ICVReplacementValuesMap;
2319 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
// For each setter runtime function: record the stored value (operand 0
// of the setter call) as the ICV value at that call.
2322 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2324 auto &ValuesMap = ICVReplacementValuesMap[ICV];
2326 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
2332 if (ValuesMap.insert(std::make_pair(CI, CI->
getArgOperand(0))).second)
2333 HasChanged = ChangeStatus::CHANGED;
// Also record values implied by arbitrary calls (see getValueForCall).
2339 std::optional<Value *> ReplVal = getValueForCall(
A,
I, ICV);
2340 if (ReplVal && ValuesMap.insert(std::make_pair(&
I, *ReplVal)).second)
2341 HasChanged = ChangeStatus::CHANGED;
2347 SetterRFI.foreachUse(TrackValues,
F);
2349 bool UsedAssumedInformation =
false;
2350 A.checkForAllInstructions(CallCheck, *
this, {Instruction::Call},
2351 UsedAssumedInformation,
// Mark the function entry with a nullptr sentinel once anything changed.
2357 if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
2358 ValuesMap.insert(std::make_pair(Entry,
nullptr));
// getValueForCall: what value (if any) does a call instruction imply for
// the ICV? Non-calls and calls marked "no_openmp"/"no_openmp_routines"
// imply nothing (std::nullopt); getter calls likewise leave it unchanged.
2369 const auto *CB = dyn_cast<CallBase>(&
I);
2370 if (!CB || CB->hasFnAttr(
"no_openmp") ||
2371 CB->hasFnAttr(
"no_openmp_routines"))
2372 return std::nullopt;
2374 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2375 auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
2376 auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
2377 Function *CalledFunction = CB->getCalledFunction();
// Indirect calls (no static callee) handled in elided lines below.
2380 if (CalledFunction ==
nullptr)
2382 if (CalledFunction == GetterRFI.Declaration)
2383 return std::nullopt;
// A setter call yields the value previously recorded for it, if any.
2384 if (CalledFunction == SetterRFI.Declaration) {
2385 if (ICVReplacementValuesMap[ICV].
count(&
I))
2386 return ICVReplacementValuesMap[ICV].
lookup(&
I);
// Otherwise consult the callee's own ICV tracker for a unique value.
2395 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2398 if (ICVTrackingAA->isAssumedTracked()) {
2399 std::optional<Value *> URV =
2400 ICVTrackingAA->getUniqueReplacementValue(ICV);
// getReplacementValue: walk backwards from instruction I, merging
// recorded ICV values; distinct values along paths invalidate ReplVal.
2411 std::optional<Value *>
2413 return std::nullopt;
2420 const auto &ValuesMap = ICVReplacementValuesMap[ICV];
2421 if (ValuesMap.count(
I))
2422 return ValuesMap.lookup(
I);
2428 std::optional<Value *> ReplVal;
// Worklist walk with a Visited set to avoid reprocessing instructions.
2430 while (!Worklist.
empty()) {
2432 if (!Visited.
insert(CurrInst).second)
2440 if (ValuesMap.count(CurrInst)) {
2441 std::optional<Value *> NewReplVal = ValuesMap.lookup(CurrInst);
2444 ReplVal = NewReplVal;
// Conflicting values on different paths — bail (elided branch).
2450 if (ReplVal != NewReplVal)
2456 std::optional<Value *> NewReplVal = getValueForCall(
A, *CurrInst, ICV);
2462 ReplVal = NewReplVal;
2468 if (ReplVal != NewReplVal)
// Reaching I's own block with a value terminates this path.
2473 if (CurrBB ==
I->getParent() && ReplVal)
// Continue the walk through predecessor terminators.
2478 if (
const Instruction *Terminator = Pred->getTerminator())
// Returned-position ICV tracker: computes, per ICV, the unique value the
// function returns it with (if all return sites agree), by querying the
// function-position tracker at each return instruction.
2486struct AAICVTrackerFunctionReturned : AAICVTracker {
2488 : AAICVTracker(IRP,
A) {}
2491 const std::string getAsStr(
Attributor *)
const override {
2492 return "ICVTrackerFunctionReturned";
2496 void trackStatistics()
const override {}
2500 return ChangeStatus::UNCHANGED;
// Per-ICV unique replacement value at function return.
2505 InternalControlVar::ICV___last>
2506 ICVReplacementValuesMap;
2509 std::optional<Value *>
2511 return ICVReplacementValuesMap[ICV];
// Depend on the function-position tracker; pessimize if it gave up.
2516 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2519 if (!ICVTrackingAA->isAssumedTracked())
2520 return indicatePessimisticFixpoint();
2523 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2524 std::optional<Value *> UniqueICVValue;
// For each return instruction, the ICV value must be the same.
2527 std::optional<Value *> NewReplVal =
2528 ICVTrackingAA->getReplacementValue(ICV, &
I,
A);
2531 if (UniqueICVValue && UniqueICVValue != NewReplVal)
2534 UniqueICVValue = NewReplVal;
2539 bool UsedAssumedInformation =
false;
// If not all returns could be checked, fall back to "unknown" (nullptr).
2540 if (!
A.checkForAllInstructions(CheckReturnInst, *
this, {Instruction::Ret},
2541 UsedAssumedInformation,
2543 UniqueICVValue =
nullptr;
2545 if (UniqueICVValue == ReplVal)
2548 ReplVal = UniqueICVValue;
2549 Changed = ChangeStatus::CHANGED;
// Call-site ICV tracker: for a call to an ICV getter, determines the value
// the getter would return; if a unique value is known, the getter call is
// deleted during manifest and replaced (replacement handled elsewhere).
2556struct AAICVTrackerCallSite : AAICVTracker {
2558 : AAICVTracker(IRP,
A) {}
2561 assert(getAnchorScope() &&
"Expected anchor function");
2565 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
// Identify which ICV this call site's callee is the getter of.
2567 auto ICVInfo = OMPInfoCache.ICVs[ICV];
2568 auto &Getter = OMPInfoCache.RFIs[ICVInfo.Getter];
2569 if (Getter.Declaration == getAssociatedFunction()) {
2570 AssociatedICV = ICVInfo.Kind;
// Not a recognized getter: give up on this call site.
2576 indicatePessimisticFixpoint();
// Manifest: only act when a concrete (non-null) value is known.
2580 if (!ReplVal || !*ReplVal)
2581 return ChangeStatus::UNCHANGED;
// The getter call itself becomes dead and is scheduled for deletion.
2584 A.deleteAfterManifest(*getCtxI());
2586 return ChangeStatus::CHANGED;
2590 const std::string getAsStr(
Attributor *)
const override {
2591 return "ICVTrackerCallSite";
2595 void trackStatistics()
const override {}
2598 std::optional<Value *> ReplVal;
// Update: query the enclosing function's tracker at this call site.
2601 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2605 if (!ICVTrackingAA->isAssumedTracked())
2606 return indicatePessimisticFixpoint();
2608 std::optional<Value *> NewReplVal =
2609 ICVTrackingAA->getReplacementValue(AssociatedICV, getCtxI(),
A);
2611 if (ReplVal == NewReplVal)
2612 return ChangeStatus::UNCHANGED;
2614 ReplVal = NewReplVal;
2615 return ChangeStatus::CHANGED;
2620 std::optional<Value *>
// Call-site-returned ICV tracker: propagates the callee's unique per-ICV
// replacement value to the value returned by this call site.
2626struct AAICVTrackerCallSiteReturned : AAICVTracker {
2628 : AAICVTracker(IRP,
A) {}
2631 const std::string getAsStr(
Attributor *)
const override {
2632 return "ICVTrackerCallSiteReturned";
2636 void trackStatistics()
const override {}
2640 return ChangeStatus::UNCHANGED;
// Per-ICV unique value as observed at this returned position.
2645 InternalControlVar::ICV___last>
2646 ICVReplacementValuesMap;
2650 std::optional<Value *>
2652 return ICVReplacementValuesMap[ICV];
// Update: require the callee's returned-position tracker to be valid.
2657 const auto *ICVTrackingAA =
A.getAAFor<AAICVTracker>(
2659 DepClassTy::REQUIRED);
2662 if (!ICVTrackingAA->isAssumedTracked())
2663 return indicatePessimisticFixpoint();
2666 std::optional<Value *> &ReplVal = ICVReplacementValuesMap[ICV];
2667 std::optional<Value *> NewReplVal =
2668 ICVTrackingAA->getUniqueReplacementValue(ICV);
2670 if (ReplVal == NewReplVal)
2673 ReplVal = NewReplVal;
2674 Changed = ChangeStatus::CHANGED;
// Returns whether BB's (unique-)successor chain ends at the function end,
// recursing through the successor. The base cases and the successor
// computation (original lines 2683-2687) are elided by the extraction.
2682static bool hasFunctionEndAsUniqueSuccessor(
const BasicBlock *BB) {
2688 return hasFunctionEndAsUniqueSuccessor(
Successor);
2695 ~AAExecutionDomainFunction() {
delete RPOT; }
2699 assert(
F &&
"Expected anchor function");
2704 unsigned TotalBlocks = 0, InitialThreadBlocks = 0, AlignedBlocks = 0;
2705 for (
auto &It : BEDMap) {
2709 InitialThreadBlocks += It.getSecond().IsExecutedByInitialThreadOnly;
2710 AlignedBlocks += It.getSecond().IsReachedFromAlignedBarrierOnly &&
2711 It.getSecond().IsReachingAlignedBarrierOnly;
2713 return "[AAExecutionDomain] " + std::to_string(InitialThreadBlocks) +
"/" +
2714 std::to_string(AlignedBlocks) +
" of " +
2715 std::to_string(TotalBlocks) +
2716 " executed by initial thread / aligned";
2728 << BB.
getName() <<
" is executed by a single thread.\n";
2738 auto HandleAlignedBarrier = [&](
CallBase *CB) {
2739 const ExecutionDomainTy &ED = CB ? CEDMap[{CB, PRE}] : BEDMap[
nullptr];
2740 if (!ED.IsReachedFromAlignedBarrierOnly ||
2741 ED.EncounteredNonLocalSideEffect)
2743 if (!ED.EncounteredAssumes.empty() && !
A.isModulePass())
2754 DeletedBarriers.
insert(CB);
2755 A.deleteAfterManifest(*CB);
2756 ++NumBarriersEliminated;
2758 }
else if (!ED.AlignedBarriers.empty()) {
2761 ED.AlignedBarriers.end());
2763 while (!Worklist.
empty()) {
2765 if (!Visited.
insert(LastCB))
2769 if (!hasFunctionEndAsUniqueSuccessor(LastCB->
getParent()))
2771 if (!DeletedBarriers.
count(LastCB)) {
2772 ++NumBarriersEliminated;
2773 A.deleteAfterManifest(*LastCB);
2779 const ExecutionDomainTy &LastED = CEDMap[{LastCB, PRE}];
2780 Worklist.
append(LastED.AlignedBarriers.begin(),
2781 LastED.AlignedBarriers.end());
2787 if (!ED.EncounteredAssumes.empty() && (CB || !ED.AlignedBarriers.empty()))
2788 for (
auto *AssumeCB : ED.EncounteredAssumes)
2789 A.deleteAfterManifest(*AssumeCB);
2792 for (
auto *CB : AlignedBarriers)
2793 HandleAlignedBarrier(CB);
2797 HandleAlignedBarrier(
nullptr);
2809 mergeInPredecessorBarriersAndAssumptions(
Attributor &
A, ExecutionDomainTy &ED,
2810 const ExecutionDomainTy &PredED);
2815 bool mergeInPredecessor(
Attributor &
A, ExecutionDomainTy &ED,
2816 const ExecutionDomainTy &PredED,
2817 bool InitialEdgeOnly =
false);
2820 bool handleCallees(
Attributor &
A, ExecutionDomainTy &EntryBBED);
2830 assert(BB.
getParent() == getAnchorScope() &&
"Block is out of scope!");
2831 return BEDMap.lookup(&BB).IsExecutedByInitialThreadOnly;
2836 assert(
I.getFunction() == getAnchorScope() &&
2837 "Instruction is out of scope!");
2841 bool ForwardIsOk =
true;
2847 auto *CB = dyn_cast<CallBase>(CurI);
2850 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2852 const auto &It = CEDMap.find({CB, PRE});
2853 if (It == CEDMap.end())
2855 if (!It->getSecond().IsReachingAlignedBarrierOnly)
2856 ForwardIsOk =
false;
2860 if (!CurI && !BEDMap.lookup(
I.getParent()).IsReachingAlignedBarrierOnly)
2861 ForwardIsOk =
false;
2866 auto *CB = dyn_cast<CallBase>(CurI);
2869 if (CB != &
I && AlignedBarriers.contains(
const_cast<CallBase *
>(CB)))
2871 const auto &It = CEDMap.find({CB, POST});
2872 if (It == CEDMap.end())
2874 if (It->getSecond().IsReachedFromAlignedBarrierOnly)
2887 return BEDMap.lookup(
nullptr).IsReachedFromAlignedBarrierOnly;
2889 return BEDMap.lookup(PredBB).IsReachedFromAlignedBarrierOnly;
2901 "No request should be made against an invalid state!");
2902 return BEDMap.lookup(&BB);
2904 std::pair<ExecutionDomainTy, ExecutionDomainTy>
2907 "No request should be made against an invalid state!");
2908 return {CEDMap.lookup({&CB, PRE}), CEDMap.lookup({&CB, POST})};
2912 "No request should be made against an invalid state!");
2913 return InterProceduralED;
2927 if (!Cmp || !
Cmp->isTrueWhenEqual() || !
Cmp->isEquality())
2935 if (
C->isAllOnesValue()) {
2936 auto *CB = dyn_cast<CallBase>(
Cmp->getOperand(0));
2937 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
2938 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
2939 CB = CB ? OpenMPOpt::getCallIfRegularCall(*CB, &RFI) : nullptr;
2945 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
2951 if (
auto *II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2952 if (II->getIntrinsicID() == Intrinsic::nvvm_read_ptx_sreg_tid_x)
2956 if (
auto *II = dyn_cast<IntrinsicInst>(
Cmp->getOperand(0)))
2957 if (II->getIntrinsicID() == Intrinsic::amdgcn_workitem_id_x)
2965 ExecutionDomainTy InterProceduralED;
2977 static bool setAndRecord(
bool &R,
bool V) {
// Fold the predecessor's encountered assumptions and aligned barriers into
// the current execution-domain state ED.
2988void AAExecutionDomainFunction::mergeInPredecessorBarriersAndAssumptions(
2989 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED) {
// Every assume instruction seen in the predecessor is seen here too.
2990 for (
auto *EA : PredED.EncounteredAssumes)
2991 ED.addAssumeInst(
A, *EA);
// Likewise for aligned barriers.
2993 for (
auto *AB : PredED.AlignedBarriers)
2994 ED.addAlignedBarrier(
A, *AB);
// Merge a predecessor's execution-domain state into ED, returning whether
// anything changed. InitialEdgeOnly marks edges only the initial thread
// can take. Some intervening lines are elided by the extraction.
2997bool AAExecutionDomainFunction::mergeInPredecessor(
2998 Attributor &
A, ExecutionDomainTy &ED,
const ExecutionDomainTy &PredED,
2999 bool InitialEdgeOnly) {
3001 bool Changed =
false;
// Initial-thread-only execution holds on an initial-only edge, or when
// both the predecessor and the current state already have it.
3003 setAndRecord(ED.IsExecutedByInitialThreadOnly,
3004 InitialEdgeOnly || (PredED.IsExecutedByInitialThreadOnly &&
3005 ED.IsExecutedByInitialThreadOnly));
// Aligned-barrier reachability is the conjunction of both states.
3007 Changed |= setAndRecord(ED.IsReachedFromAlignedBarrierOnly,
3008 ED.IsReachedFromAlignedBarrierOnly &&
3009 PredED.IsReachedFromAlignedBarrierOnly);
// Side effects accumulate (bitwise-or of the two flags).
3010 Changed |= setAndRecord(ED.EncounteredNonLocalSideEffect,
3011 ED.EncounteredNonLocalSideEffect |
3012 PredED.EncounteredNonLocalSideEffect);
// Only propagate barriers/assumptions while still aligned-reached;
// otherwise drop them from ED.
3014 if (ED.IsReachedFromAlignedBarrierOnly)
3015 mergeInPredecessorBarriersAndAssumptions(
A, ED, PredED);
3017 ED.clearAssumeInstAndAlignedBarriers();
// Seed the entry-block execution domain from this function's call sites.
// If all call sites are known, their pre/post states are merged in;
// otherwise conservative defaults are assumed (elided branch selection).
3021bool AAExecutionDomainFunction::handleCallees(
Attributor &
A,
3022 ExecutionDomainTy &EntryBBED) {
// Collect each call site's (pre, post) execution domain via its AA.
3027 DepClassTy::OPTIONAL);
3028 if (!EDAA || !EDAA->getState().isValidState())
3031 EDAA->getExecutionDomain(*cast<CallBase>(ACS.getInstruction())));
3035 ExecutionDomainTy ExitED;
3036 bool AllCallSitesKnown;
3037 if (
A.checkForAllCallSites(PredForCallSite, *
this,
3039 AllCallSitesKnown)) {
// All call sites known: entry state is the merge of their pre-states,
// and exit alignment requires every post-state to be aligned.
3040 for (
const auto &[CSInED, CSOutED] : CallSiteEDs) {
3041 mergeInPredecessor(
A, EntryBBED, CSInED);
3042 ExitED.IsReachingAlignedBarrierOnly &=
3043 CSOutED.IsReachingAlignedBarrierOnly;
// One conservative default set (condition elided by extraction):
3050 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3051 EntryBBED.IsReachedFromAlignedBarrierOnly =
true;
3052 EntryBBED.EncounteredNonLocalSideEffect =
false;
3053 ExitED.IsReachingAlignedBarrierOnly =
false;
// Fully pessimistic defaults for the remaining case:
3055 EntryBBED.IsExecutedByInitialThreadOnly =
false;
3056 EntryBBED.IsReachedFromAlignedBarrierOnly =
false;
3057 EntryBBED.EncounteredNonLocalSideEffect =
true;
3058 ExitED.IsReachingAlignedBarrierOnly =
false;
3062 bool Changed =
false;
// BEDMap[nullptr] holds the whole-function summary state.
3063 auto &FnED = BEDMap[
nullptr];
3064 Changed |= setAndRecord(FnED.IsReachedFromAlignedBarrierOnly,
3065 FnED.IsReachedFromAlignedBarrierOnly &
3066 EntryBBED.IsReachedFromAlignedBarrierOnly);
3067 Changed |= setAndRecord(FnED.IsReachingAlignedBarrierOnly,
3068 FnED.IsReachingAlignedBarrierOnly &
3069 ExitED.IsReachingAlignedBarrierOnly);
3070 Changed |= setAndRecord(FnED.IsExecutedByInitialThreadOnly,
3071 EntryBBED.IsExecutedByInitialThreadOnly);
3077 bool Changed =
false;
3082 auto HandleAlignedBarrier = [&](
CallBase &CB, ExecutionDomainTy &ED) {
3083 Changed |= AlignedBarriers.insert(&CB);
3085 auto &CallInED = CEDMap[{&CB, PRE}];
3086 Changed |= mergeInPredecessor(
A, CallInED, ED);
3087 CallInED.IsReachingAlignedBarrierOnly =
true;
3089 ED.EncounteredNonLocalSideEffect =
false;
3090 ED.IsReachedFromAlignedBarrierOnly =
true;
3092 ED.clearAssumeInstAndAlignedBarriers();
3093 ED.addAlignedBarrier(
A, CB);
3094 auto &CallOutED = CEDMap[{&CB, POST}];
3095 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3099 A.getAAFor<
AAIsDead>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
3106 for (
auto &RIt : *RPOT) {
3109 bool IsEntryBB = &BB == &EntryBB;
3112 bool AlignedBarrierLastInBlock = IsEntryBB && IsKernel;
3113 bool IsExplicitlyAligned = IsEntryBB && IsKernel;
3114 ExecutionDomainTy ED;
3117 Changed |= handleCallees(
A, ED);
3121 if (LivenessAA && LivenessAA->isAssumedDead(&BB))
3125 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, &BB))
3127 bool InitialEdgeOnly = isInitialThreadOnlyEdge(
3128 A, dyn_cast<BranchInst>(PredBB->getTerminator()), BB);
3129 mergeInPredecessor(
A, ED, BEDMap[PredBB], InitialEdgeOnly);
3136 bool UsedAssumedInformation;
3137 if (
A.isAssumedDead(
I, *
this, LivenessAA, UsedAssumedInformation,
3138 false, DepClassTy::OPTIONAL,
3144 if (
auto *II = dyn_cast<IntrinsicInst>(&
I)) {
3145 if (
auto *AI = dyn_cast_or_null<AssumeInst>(II)) {
3146 ED.addAssumeInst(
A, *AI);
3150 if (II->isAssumeLikeIntrinsic())
3154 if (
auto *FI = dyn_cast<FenceInst>(&
I)) {
3155 if (!ED.EncounteredNonLocalSideEffect) {
3157 if (ED.IsReachedFromAlignedBarrierOnly)
3162 case AtomicOrdering::NotAtomic:
3164 case AtomicOrdering::Unordered:
3166 case AtomicOrdering::Monotonic:
3168 case AtomicOrdering::Acquire:
3170 case AtomicOrdering::Release:
3172 case AtomicOrdering::AcquireRelease:
3174 case AtomicOrdering::SequentiallyConsistent:
3178 NonNoOpFences.insert(FI);
3181 auto *CB = dyn_cast<CallBase>(&
I);
3183 bool IsAlignedBarrier =
3187 AlignedBarrierLastInBlock &= IsNoSync;
3188 IsExplicitlyAligned &= IsNoSync;
3194 if (IsAlignedBarrier) {
3195 HandleAlignedBarrier(*CB, ED);
3196 AlignedBarrierLastInBlock =
true;
3197 IsExplicitlyAligned =
true;
3202 if (isa<MemIntrinsic>(&
I)) {
3203 if (!ED.EncounteredNonLocalSideEffect &&
3205 ED.EncounteredNonLocalSideEffect =
true;
3207 ED.IsReachedFromAlignedBarrierOnly =
false;
3215 auto &CallInED = CEDMap[{CB, PRE}];
3216 Changed |= mergeInPredecessor(
A, CallInED, ED);
3222 if (!IsNoSync && Callee && !
Callee->isDeclaration()) {
3225 if (EDAA && EDAA->getState().isValidState()) {
3228 CalleeED.IsReachedFromAlignedBarrierOnly;
3229 AlignedBarrierLastInBlock = ED.IsReachedFromAlignedBarrierOnly;
3230 if (IsNoSync || !CalleeED.IsReachedFromAlignedBarrierOnly)
3231 ED.EncounteredNonLocalSideEffect |=
3232 CalleeED.EncounteredNonLocalSideEffect;
3234 ED.EncounteredNonLocalSideEffect =
3235 CalleeED.EncounteredNonLocalSideEffect;
3236 if (!CalleeED.IsReachingAlignedBarrierOnly) {
3238 setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3241 if (CalleeED.IsReachedFromAlignedBarrierOnly)
3242 mergeInPredecessorBarriersAndAssumptions(
A, ED, CalleeED);
3243 auto &CallOutED = CEDMap[{CB, POST}];
3244 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3249 ED.IsReachedFromAlignedBarrierOnly =
false;
3250 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3253 AlignedBarrierLastInBlock &= ED.IsReachedFromAlignedBarrierOnly;
3255 auto &CallOutED = CEDMap[{CB, POST}];
3256 Changed |= mergeInPredecessor(
A, CallOutED, ED);
3259 if (!
I.mayHaveSideEffects() && !
I.mayReadFromMemory())
3273 if (MemAA && MemAA->getState().isValidState() &&
3274 MemAA->checkForAllAccessesToMemoryKind(
3279 auto &InfoCache =
A.getInfoCache();
3280 if (!
I.mayHaveSideEffects() && InfoCache.isOnlyUsedByAssume(
I))
3283 if (
auto *LI = dyn_cast<LoadInst>(&
I))
3284 if (LI->hasMetadata(LLVMContext::MD_invariant_load))
3287 if (!ED.EncounteredNonLocalSideEffect &&
3289 ED.EncounteredNonLocalSideEffect =
true;
3292 bool IsEndAndNotReachingAlignedBarriersOnly =
false;
3293 if (!isa<UnreachableInst>(BB.getTerminator()) &&
3294 !BB.getTerminator()->getNumSuccessors()) {
3296 Changed |= mergeInPredecessor(
A, InterProceduralED, ED);
3298 auto &FnED = BEDMap[
nullptr];
3299 if (IsKernel && !IsExplicitlyAligned)
3300 FnED.IsReachingAlignedBarrierOnly =
false;
3301 Changed |= mergeInPredecessor(
A, FnED, ED);
3303 if (!FnED.IsReachingAlignedBarrierOnly) {
3304 IsEndAndNotReachingAlignedBarriersOnly =
true;
3305 SyncInstWorklist.
push_back(BB.getTerminator());
3306 auto &BBED = BEDMap[&BB];
3307 Changed |= setAndRecord(BBED.IsReachingAlignedBarrierOnly,
false);
3311 ExecutionDomainTy &StoredED = BEDMap[&BB];
3312 ED.IsReachingAlignedBarrierOnly = StoredED.IsReachingAlignedBarrierOnly &
3313 !IsEndAndNotReachingAlignedBarriersOnly;
3319 if (ED.IsExecutedByInitialThreadOnly !=
3320 StoredED.IsExecutedByInitialThreadOnly ||
3321 ED.IsReachedFromAlignedBarrierOnly !=
3322 StoredED.IsReachedFromAlignedBarrierOnly ||
3323 ED.EncounteredNonLocalSideEffect !=
3324 StoredED.EncounteredNonLocalSideEffect)
3328 StoredED = std::move(ED);
3334 while (!SyncInstWorklist.
empty()) {
3337 bool HitAlignedBarrierOrKnownEnd =
false;
3339 auto *CB = dyn_cast<CallBase>(CurInst);
3342 auto &CallOutED = CEDMap[{CB, POST}];
3343 Changed |= setAndRecord(CallOutED.IsReachingAlignedBarrierOnly,
false);
3344 auto &CallInED = CEDMap[{CB, PRE}];
3345 HitAlignedBarrierOrKnownEnd =
3346 AlignedBarriers.count(CB) || !CallInED.IsReachingAlignedBarrierOnly;
3347 if (HitAlignedBarrierOrKnownEnd)
3349 Changed |= setAndRecord(CallInED.IsReachingAlignedBarrierOnly,
false);
3351 if (HitAlignedBarrierOrKnownEnd)
3355 if (LivenessAA && LivenessAA->isEdgeDead(PredBB, SyncBB))
3357 if (!Visited.
insert(PredBB))
3359 auto &PredED = BEDMap[PredBB];
3360 if (setAndRecord(PredED.IsReachingAlignedBarrierOnly,
false)) {
3362 SyncInstWorklist.
push_back(PredBB->getTerminator());
3365 if (SyncBB != &EntryBB)
3368 setAndRecord(InterProceduralED.IsReachingAlignedBarrierOnly,
false);
3371 return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
// Abstract attribute that moves device "globalized" heap allocations
// (__kmpc_alloc_shared) into GPU shared memory when safe.
3376struct AAHeapToShared :
public StateWrapper<BooleanState, AbstractAttribute> {
3381 static AAHeapToShared &createForPosition(
const IRPosition &IRP,
// Whether the given allocation call is assumed convertible to shared mem.
3385 virtual bool isAssumedHeapToShared(
CallBase &CB)
const = 0;
// Whether the given call is a free that would be removed by the rewrite.
3389 virtual bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const = 0;
3392 const std::string
getName()
const override {
return "AAHeapToShared"; }
// Address of the static ID, used for Attributor attribute identification.
3395 const char *getIdAddr()
const override {
return &
ID; }
// Unique ID tag for this attribute class.
3404 static const char ID;
// Function-position implementation of AAHeapToShared: collects
// __kmpc_alloc_shared calls, filters out the unsafe ones during update,
// and during manifest replaces each survivor (and its single paired
// __kmpc_free_shared) with a static shared-memory array.
// NOTE(review): many original lines are elided by the extraction.
3407struct AAHeapToSharedFunction :
public AAHeapToShared {
3409 : AAHeapToShared(IRP,
A) {}
3411 const std::string getAsStr(
Attributor *)
const override {
3412 return "[AAHeapToShared] " + std::to_string(MallocCalls.size()) +
3413 " malloc calls eligible.";
3417 void trackStatistics()
const override {}
// Record frees that would disappear with the rewrite: an allocation is
// only considered when it has exactly one matching free call.
3421 void findPotentialRemovedFreeCalls(
Attributor &
A) {
3422 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3423 auto &FreeRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3425 PotentialRemovedFreeCalls.clear();
3429 for (
auto *U : CB->
users()) {
3431 if (
C &&
C->getCalledFunction() == FreeRFI.Declaration)
// Require exactly one free per allocation.
3435 if (FreeCalls.
size() != 1)
3438 PotentialRemovedFreeCalls.insert(FreeCalls.
front());
// Initialization: seed MallocCalls from all __kmpc_alloc_shared users.
3444 indicatePessimisticFixpoint();
3448 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3449 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3450 if (!RFI.Declaration)
// Simplification callback that hides the alloc's value from others.
3455 bool &) -> std::optional<Value *> {
return nullptr; };
3458 for (
User *U : RFI.Declaration->
users())
3459 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3462 MallocCalls.insert(CB);
3467 findPotentialRemovedFreeCalls(
A);
3470 bool isAssumedHeapToShared(
CallBase &CB)
const override {
3471 return isValidState() && MallocCalls.count(&CB);
3474 bool isAssumedHeapToSharedRemovedFree(
CallBase &CB)
const override {
3475 return isValidState() && PotentialRemovedFreeCalls.count(&CB);
// Manifest: perform the rewrite for each surviving allocation.
3479 if (MallocCalls.empty())
3480 return ChangeStatus::UNCHANGED;
3482 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3483 auto &FreeCall = OMPInfoCache.RFIs[OMPRTL___kmpc_free_shared];
3487 DepClassTy::OPTIONAL);
// Skip allocations AAHeapToStack already claims.
3492 if (HS &&
HS->isAssumedHeapToStack(*CB))
3497 for (
auto *U : CB->
users()) {
3499 if (
C &&
C->getCalledFunction() == FreeCall.Declaration)
3502 if (FreeCalls.
size() != 1)
// Remarks about the shared-memory budget (limit check elided).
3509 <<
" with shared memory."
3510 <<
" Shared memory usage is limited to "
3516 <<
" with " << AllocSize->getZExtValue()
3517 <<
" bytes of shared memory\n");
// Build a [AllocSize x i8] global in the Shared address space.
3523 Type *Int8ArrTy = ArrayType::get(Int8Ty, AllocSize->getZExtValue());
3528 static_cast<unsigned>(AddressSpace::Shared));
3533 return OR <<
"Replaced globalized variable with "
3534 <<
ore::NV(
"SharedMemory", AllocSize->getZExtValue())
3535 << (AllocSize->isOne() ?
" byte " :
" bytes ")
3536 <<
"of shared memory.";
// The allocation's alignment attribute carries over to the global.
3542 "HeapToShared on allocation without alignment attribute");
3543 SharedMem->setAlignment(*Alignment);
// Both the alloc call and its unique free are now dead.
3546 A.deleteAfterManifest(*CB);
3547 A.deleteAfterManifest(*FreeCalls.
front());
3549 SharedMemoryUsed += AllocSize->getZExtValue();
3550 NumBytesMovedToSharedMemory = SharedMemoryUsed;
3551 Changed = ChangeStatus::CHANGED;
// Update: drop allocations that are non-constant-sized, belong to other
// callers, or are not executed by the initial thread only.
3558 if (MallocCalls.empty())
3559 return indicatePessimisticFixpoint();
3560 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3561 auto &RFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
3562 if (!RFI.Declaration)
3563 return ChangeStatus::UNCHANGED;
3567 auto NumMallocCalls = MallocCalls.size();
3570 for (
User *U : RFI.Declaration->
users()) {
3571 if (
CallBase *CB = dyn_cast<CallBase>(U)) {
3572 if (CB->getCaller() !=
F)
3574 if (!MallocCalls.count(CB))
// Only constant allocation sizes can become static shared arrays.
3576 if (!isa<ConstantInt>(CB->getArgOperand(0))) {
3577 MallocCalls.remove(CB);
3582 if (!ED || !ED->isExecutedByInitialThreadOnly(*CB))
3583 MallocCalls.remove(CB);
3587 findPotentialRemovedFreeCalls(
A);
3589 if (NumMallocCalls != MallocCalls.size())
3590 return ChangeStatus::CHANGED;
3592 return ChangeStatus::UNCHANGED;
// Running total of shared-memory bytes consumed by this rewrite.
3600 unsigned SharedMemoryUsed = 0;
// Abstract attribute summarizing OpenMP kernel properties (SPMD
// compatibility, reached parallel regions, reaching kernels, nesting).
3603struct AAKernelInfo :
public StateWrapper<KernelInfoState, AbstractAttribute> {
// Call-site AAs of this kind can be created without a known callee.
3609 static bool requiresCalleeForCallBase() {
return false; }
3612 void trackStatistics()
const override {}
// Human-readable state dump: SPMD/generic mode, fixpoint marker, and the
// sizes of the tracked parallel-region / kernel sets.
3615 const std::string getAsStr(
Attributor *)
const override {
3616 if (!isValidState())
3618 return std::string(SPMDCompatibilityTracker.isAssumed() ?
"SPMD"
3620 std::string(SPMDCompatibilityTracker.isAtFixpoint() ?
" [FIX]"
3622 std::string(
" #PRs: ") +
3623 (ReachedKnownParallelRegions.isValidState()
3624 ? std::to_string(ReachedKnownParallelRegions.size())
3626 ", #Unknown PRs: " +
3627 (ReachedUnknownParallelRegions.isValidState()
3630 ", #Reaching Kernels: " +
3631 (ReachingKernelEntries.isValidState()
3635 (ParallelLevels.isValidState()
3638 ", NestedPar: " + (NestedParallelism ?
"yes" :
"no");
3645 const std::string
getName()
const override {
return "AAKernelInfo"; }
// Address of the static ID, used for Attributor attribute identification.
3648 const char *getIdAddr()
const override {
return &
ID; }
// Unique ID tag for this attribute class.
3655 static const char ID;
3660struct AAKernelInfoFunction : AAKernelInfo {
3662 : AAKernelInfo(IRP,
A) {}
3667 return GuardedInstructions;
3670 void setConfigurationOfKernelEnvironment(
ConstantStruct *ConfigC) {
3672 KernelEnvC, ConfigC, {KernelInfo::ConfigurationIdx});
3673 assert(NewKernelEnvC &&
"Failed to create new kernel environment");
3674 KernelEnvC = cast<ConstantStruct>(NewKernelEnvC);
3677#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER) \
3678 void set##MEMBER##OfKernelEnvironment(ConstantInt *NewVal) { \
3679 ConstantStruct *ConfigC = \
3680 KernelInfo::getConfigurationFromKernelEnvironment(KernelEnvC); \
3681 Constant *NewConfigC = ConstantFoldInsertValueInstruction( \
3682 ConfigC, NewVal, {KernelInfo::MEMBER##Idx}); \
3683 assert(NewConfigC && "Failed to create new configuration environment"); \
3684 setConfigurationOfKernelEnvironment(cast<ConstantStruct>(NewConfigC)); \
3695#undef KERNEL_ENVIRONMENT_CONFIGURATION_SETTER
3702 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3706 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
3707 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
3708 OMPInformationCache::RuntimeFunctionInfo &DeinitRFI =
3709 OMPInfoCache.RFIs[OMPRTL___kmpc_target_deinit];
3713 auto StoreCallBase = [](
Use &U,
3714 OMPInformationCache::RuntimeFunctionInfo &RFI,
3716 CallBase *CB = OpenMPOpt::getCallIfRegularCall(U, &RFI);
3718 "Unexpected use of __kmpc_target_init or __kmpc_target_deinit!");
3720 "Multiple uses of __kmpc_target_init or __kmpc_target_deinit!");
3726 StoreCallBase(U, InitRFI, KernelInitCB);
3730 DeinitRFI.foreachUse(
3732 StoreCallBase(U, DeinitRFI, KernelDeinitCB);
3738 if (!KernelInitCB || !KernelDeinitCB)
3742 ReachingKernelEntries.insert(Fn);
3743 IsKernelEntry =
true;
3751 KernelConfigurationSimplifyCB =
3753 bool &UsedAssumedInformation) -> std::optional<Constant *> {
3754 if (!isAtFixpoint()) {
3757 UsedAssumedInformation =
true;
3758 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
3763 A.registerGlobalVariableSimplificationCallback(
3764 *KernelEnvGV, KernelConfigurationSimplifyCB);
3768 KernelInfo::getExecModeFromKernelEnvironment(KernelEnvC);
3773 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
3777 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
3779 setExecModeOfKernelEnvironment(AssumedExecModeC);
3786 setMinThreadsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MinThreads));
3789 auto [MinTeams, MaxTeams] =
3792 setMinTeamsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MinTeams));
3794 setMaxTeamsOfKernelEnvironment(ConstantInt::get(
Int32Ty, MaxTeams));
3797 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(KernelEnvC);
3798 ConstantInt *AssumedMayUseNestedParallelismC = ConstantInt::get(
3800 setMayUseNestedParallelismOfKernelEnvironment(
3801 AssumedMayUseNestedParallelismC);
3805 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3808 ConstantInt::get(UseGenericStateMachineC->
getIntegerType(),
false);
3809 setUseGenericStateMachineOfKernelEnvironment(
3810 AssumedUseGenericStateMachineC);
3816 if (!OMPInfoCache.RFIs[RFKind].Declaration)
3818 A.registerVirtualUseCallback(*OMPInfoCache.RFIs[RFKind].Declaration, CB);
3822 auto AddDependence = [](
Attributor &
A,
const AAKernelInfo *KI,
3825 A.recordDependence(*KI, *QueryingAA, DepClassTy::OPTIONAL);
3839 if (SPMDCompatibilityTracker.isValidState())
3840 return AddDependence(
A,
this, QueryingAA);
3842 if (!ReachedKnownParallelRegions.isValidState())
3843 return AddDependence(
A,
this, QueryingAA);
3849 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_num_threads_in_block,
3850 CustomStateMachineUseCB);
3851 RegisterVirtualUse(OMPRTL___kmpc_get_warp_size, CustomStateMachineUseCB);
3852 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_generic,
3853 CustomStateMachineUseCB);
3854 RegisterVirtualUse(OMPRTL___kmpc_kernel_parallel,
3855 CustomStateMachineUseCB);
3856 RegisterVirtualUse(OMPRTL___kmpc_kernel_end_parallel,
3857 CustomStateMachineUseCB);
3861 if (SPMDCompatibilityTracker.isAtFixpoint())
3868 if (!SPMDCompatibilityTracker.isValidState())
3869 return AddDependence(
A,
this, QueryingAA);
3872 RegisterVirtualUse(OMPRTL___kmpc_get_hardware_thread_id_in_block,
3881 if (!SPMDCompatibilityTracker.isValidState())
3882 return AddDependence(
A,
this, QueryingAA);
3883 if (SPMDCompatibilityTracker.empty())
3884 return AddDependence(
A,
this, QueryingAA);
3885 if (!mayContainParallelRegion())
3886 return AddDependence(
A,
this, QueryingAA);
3889 RegisterVirtualUse(OMPRTL___kmpc_barrier_simple_spmd, SPMDBarrierUseCB);
3893 static std::string sanitizeForGlobalName(std::string S) {
3897 return !((C >=
'a' && C <=
'z') || (C >=
'A' && C <=
'Z') ||
3898 (C >=
'0' && C <=
'9') || C ==
'_');
3909 if (!KernelInitCB || !KernelDeinitCB)
3910 return ChangeStatus::UNCHANGED;
3914 bool HasBuiltStateMachine =
true;
3915 if (!changeToSPMDMode(
A, Changed)) {
3917 HasBuiltStateMachine = buildCustomStateMachine(
A, Changed);
3919 HasBuiltStateMachine =
false;
3926 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
3927 ExistingKernelEnvC);
3928 if (!HasBuiltStateMachine)
3929 setUseGenericStateMachineOfKernelEnvironment(
3930 OldUseGenericStateMachineVal);
3937 Changed = ChangeStatus::CHANGED;
3943 void insertInstructionGuardsHelper(
Attributor &
A) {
3944 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
3946 auto CreateGuardedRegion = [&](
Instruction *RegionStartI,
3980 DT, LI, MSU,
"region.guarded.end");
3983 MSU,
"region.barrier");
3986 DT, LI, MSU,
"region.exit");
3988 SplitBlock(ParentBB, RegionStartI, DT, LI, MSU,
"region.guarded");
3991 "Expected a different CFG");
3994 ParentBB, ParentBB->
getTerminator(), DT, LI, MSU,
"region.check.tid");
3997 A.registerManifestAddedBasicBlock(*RegionEndBB);
3998 A.registerManifestAddedBasicBlock(*RegionBarrierBB);
3999 A.registerManifestAddedBasicBlock(*RegionExitBB);
4000 A.registerManifestAddedBasicBlock(*RegionStartBB);
4001 A.registerManifestAddedBasicBlock(*RegionCheckTidBB);
4003 bool HasBroadcastValues =
false;
4008 for (
Use &U :
I.uses()) {
4014 if (OutsideUses.
empty())
4017 HasBroadcastValues =
true;
4022 M,
I.getType(),
false,
4024 sanitizeForGlobalName(
4025 (
I.getName() +
".guarded.output.alloc").str()),
4027 static_cast<unsigned>(AddressSpace::Shared));
4034 I.getType(), SharedMem,
I.getName() +
".guarded.output.load",
4038 for (
Use *U : OutsideUses)
4039 A.changeUseAfterManifest(*U, *LoadI);
4042 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4048 InsertPointTy(ParentBB, ParentBB->
end()),
DL);
4049 OMPInfoCache.OMPBuilder.updateToLocation(Loc);
4052 OMPInfoCache.OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4054 OMPInfoCache.OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4060 InsertPointTy(RegionCheckTidBB, RegionCheckTidBB->
end()),
DL);
4061 OMPInfoCache.OMPBuilder.updateToLocation(LocRegionCheckTid);
4063 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4064 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4066 OMPInfoCache.OMPBuilder.Builder.CreateCall(HardwareTidFn, {});
4068 OMPInfoCache.setCallingConvention(HardwareTidFn, Tid);
4069 Value *TidCheck = OMPInfoCache.OMPBuilder.Builder.CreateIsNull(Tid);
4070 OMPInfoCache.OMPBuilder.Builder
4071 .CreateCondBr(TidCheck, RegionStartBB, RegionBarrierBB)
4077 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4078 M, OMPRTL___kmpc_barrier_simple_spmd);
4079 OMPInfoCache.OMPBuilder.updateToLocation(InsertPointTy(
4082 OMPInfoCache.OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, Tid});
4084 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4087 if (HasBroadcastValues) {
4092 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4096 auto &AllocSharedRFI = OMPInfoCache.RFIs[OMPRTL___kmpc_alloc_shared];
4098 for (
Instruction *GuardedI : SPMDCompatibilityTracker) {
4100 if (!Visited.
insert(BB).second)
4106 while (++IP != IPEnd) {
4107 if (!IP->mayHaveSideEffects() && !IP->mayReadFromMemory())
4110 if (OpenMPOpt::getCallIfRegularCall(*
I, &AllocSharedRFI))
4112 if (!
I->user_empty() || !SPMDCompatibilityTracker.contains(
I)) {
4113 LastEffect =
nullptr;
4120 for (
auto &Reorder : Reorders)
4126 for (
Instruction *GuardedI : SPMDCompatibilityTracker) {
4128 auto *CalleeAA =
A.lookupAAFor<AAKernelInfo>(
4131 assert(CalleeAA !=
nullptr &&
"Expected Callee AAKernelInfo");
4132 auto &CalleeAAFunction = *cast<AAKernelInfoFunction>(CalleeAA);
4134 if (CalleeAAFunction.getGuardedInstructions().contains(GuardedI))
4137 Instruction *GuardedRegionStart =
nullptr, *GuardedRegionEnd =
nullptr;
4141 if (SPMDCompatibilityTracker.contains(&
I)) {
4142 CalleeAAFunction.getGuardedInstructions().insert(&
I);
4143 if (GuardedRegionStart)
4144 GuardedRegionEnd = &
I;
4146 GuardedRegionStart = GuardedRegionEnd = &
I;
4153 if (GuardedRegionStart) {
4155 std::make_pair(GuardedRegionStart, GuardedRegionEnd));
4156 GuardedRegionStart =
nullptr;
4157 GuardedRegionEnd =
nullptr;
4162 for (
auto &GR : GuardedRegions)
4163 CreateGuardedRegion(GR.first, GR.second);
4166 void forceSingleThreadPerWorkgroupHelper(
Attributor &
A) {
4175 auto &Ctx = getAnchorValue().getContext();
4182 KernelInitCB->
getNextNode(),
"main.thread.user_code");
4187 A.registerManifestAddedBasicBlock(*InitBB);
4188 A.registerManifestAddedBasicBlock(*UserCodeBB);
4189 A.registerManifestAddedBasicBlock(*ReturnBB);
4198 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4200 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4201 M, OMPRTL___kmpc_get_hardware_thread_id_in_block);
4206 OMPInfoCache.setCallingConvention(ThreadIdInBlockFn, ThreadIdInBlock);
4212 ConstantInt::get(ThreadIdInBlock->
getType(), 0),
4213 "thread.is_main", InitBB);
4219 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4222 if (!OMPInfoCache.runtimeFnsAvailable(
4223 {OMPRTL___kmpc_get_hardware_thread_id_in_block,
4224 OMPRTL___kmpc_barrier_simple_spmd}))
4227 if (!SPMDCompatibilityTracker.isAssumed()) {
4228 for (
Instruction *NonCompatibleI : SPMDCompatibilityTracker) {
4229 if (!NonCompatibleI)
4233 if (
auto *CB = dyn_cast<CallBase>(NonCompatibleI))
4234 if (OMPInfoCache.RTLFunctions.contains(CB->getCalledFunction()))
4238 ORA <<
"Value has potential side effects preventing SPMD-mode "
4240 if (isa<CallBase>(NonCompatibleI)) {
4241 ORA <<
". Add `__attribute__((assume(\"ompx_spmd_amenable\")))` to "
4242 "the called function to override";
4250 << *NonCompatibleI <<
"\n");
4262 Kernel = CB->getCaller();
4270 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4276 Changed = ChangeStatus::CHANGED;
4280 if (mayContainParallelRegion())
4281 insertInstructionGuardsHelper(
A);
4283 forceSingleThreadPerWorkgroupHelper(
A);
4288 "Initially non-SPMD kernel has SPMD exec mode!");
4289 setExecModeOfKernelEnvironment(
4293 ++NumOpenMPTargetRegionKernelsSPMD;
4296 return OR <<
"Transformed generic-mode kernel to SPMD-mode.";
4308 if (!ReachedKnownParallelRegions.isValidState())
4311 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4312 if (!OMPInfoCache.runtimeFnsAvailable(
4313 {OMPRTL___kmpc_get_hardware_num_threads_in_block,
4314 OMPRTL___kmpc_get_warp_size, OMPRTL___kmpc_barrier_simple_generic,
4315 OMPRTL___kmpc_kernel_parallel, OMPRTL___kmpc_kernel_end_parallel}))
4326 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4327 ExistingKernelEnvC);
4329 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC);
4334 if (UseStateMachineC->
isZero() ||
4338 Changed = ChangeStatus::CHANGED;
4341 setUseGenericStateMachineOfKernelEnvironment(
4348 if (!mayContainParallelRegion()) {
4349 ++NumOpenMPTargetRegionKernelsWithoutStateMachine;
4352 return OR <<
"Removing unused state machine from generic-mode kernel.";
4360 if (ReachedUnknownParallelRegions.empty()) {
4361 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithoutFallback;
4364 return OR <<
"Rewriting generic-mode kernel with a customized state "
4369 ++NumOpenMPTargetRegionKernelsCustomStateMachineWithFallback;
4372 return OR <<
"Generic-mode kernel is executed with a customized state "
4373 "machine that requires a fallback.";
4378 for (
CallBase *UnknownParallelRegionCB : ReachedUnknownParallelRegions) {
4379 if (!UnknownParallelRegionCB)
4382 return ORA <<
"Call may contain unknown parallel regions. Use "
4383 <<
"`__attribute__((assume(\"omp_no_parallelism\")))` to "
4421 auto &Ctx = getAnchorValue().getContext();
4425 BasicBlock *InitBB = KernelInitCB->getParent();
4427 KernelInitCB->getNextNode(),
"thread.user_code.check");
4431 Ctx,
"worker_state_machine.begin",
Kernel, UserCodeEntryBB);
4433 Ctx,
"worker_state_machine.finished",
Kernel, UserCodeEntryBB);
4435 Ctx,
"worker_state_machine.is_active.check",
Kernel, UserCodeEntryBB);
4438 Kernel, UserCodeEntryBB);
4441 Kernel, UserCodeEntryBB);
4443 Ctx,
"worker_state_machine.done.barrier",
Kernel, UserCodeEntryBB);
4444 A.registerManifestAddedBasicBlock(*InitBB);
4445 A.registerManifestAddedBasicBlock(*UserCodeEntryBB);
4446 A.registerManifestAddedBasicBlock(*IsWorkerCheckBB);
4447 A.registerManifestAddedBasicBlock(*StateMachineBeginBB);
4448 A.registerManifestAddedBasicBlock(*StateMachineFinishedBB);
4449 A.registerManifestAddedBasicBlock(*StateMachineIsActiveCheckBB);
4450 A.registerManifestAddedBasicBlock(*StateMachineIfCascadeCurrentBB);
4451 A.registerManifestAddedBasicBlock(*StateMachineEndParallelBB);
4452 A.registerManifestAddedBasicBlock(*StateMachineDoneBarrierBB);
4454 const DebugLoc &DLoc = KernelInitCB->getDebugLoc();
4460 ConstantInt::get(KernelInitCB->getType(), -1),
4461 "thread.is_worker", InitBB);
4467 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4468 M, OMPRTL___kmpc_get_hardware_num_threads_in_block);
4470 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4471 M, OMPRTL___kmpc_get_warp_size);
4474 OMPInfoCache.setCallingConvention(BlockHwSizeFn, BlockHwSize);
4478 OMPInfoCache.setCallingConvention(WarpSizeFn, WarpSize);
4481 BlockHwSize, WarpSize,
"block.size", IsWorkerCheckBB);
4485 "thread.is_main_or_worker", IsWorkerCheckBB);
4488 IsMainOrWorker, IsWorkerCheckBB);
4492 Type *VoidPtrTy = PointerType::getUnqual(Ctx);
4494 new AllocaInst(VoidPtrTy,
DL.getAllocaAddrSpace(),
nullptr,
4498 OMPInfoCache.OMPBuilder.updateToLocation(
4501 StateMachineBeginBB->
end()),
4504 Value *Ident = KernelInfo::getIdentFromKernelEnvironment(KernelEnvC);
4505 Value *GTid = KernelInitCB;
4508 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4509 M, OMPRTL___kmpc_barrier_simple_generic);
4512 OMPInfoCache.setCallingConvention(BarrierFn, Barrier);
4516 (
unsigned int)AddressSpace::Generic) {
4518 WorkFnAI, PointerType::get(Ctx, (
unsigned int)AddressSpace::Generic),
4519 WorkFnAI->
getName() +
".generic", StateMachineBeginBB);
4524 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4525 M, OMPRTL___kmpc_kernel_parallel);
4527 KernelParallelFn, {WorkFnAI},
"worker.is_active", StateMachineBeginBB);
4528 OMPInfoCache.setCallingConvention(KernelParallelFn, IsActiveWorker);
4531 StateMachineBeginBB);
4541 StateMachineBeginBB);
4542 IsDone->setDebugLoc(DLoc);
4544 IsDone, StateMachineBeginBB)
4548 StateMachineDoneBarrierBB, IsActiveWorker,
4549 StateMachineIsActiveCheckBB)
4555 const unsigned int WrapperFunctionArgNo = 6;
4560 for (
int I = 0, E = ReachedKnownParallelRegions.size();
I < E; ++
I) {
4561 auto *CB = ReachedKnownParallelRegions[
I];
4562 auto *ParallelRegion = dyn_cast<Function>(
4563 CB->getArgOperand(WrapperFunctionArgNo)->stripPointerCasts());
4565 Ctx,
"worker_state_machine.parallel_region.execute",
Kernel,
4566 StateMachineEndParallelBB);
4568 ->setDebugLoc(DLoc);
4574 Kernel, StateMachineEndParallelBB);
4575 A.registerManifestAddedBasicBlock(*PRExecuteBB);
4576 A.registerManifestAddedBasicBlock(*PRNextBB);
4581 if (
I + 1 < E || !ReachedUnknownParallelRegions.empty()) {
4584 "worker.check_parallel_region", StateMachineIfCascadeCurrentBB);
4592 StateMachineIfCascadeCurrentBB)
4594 StateMachineIfCascadeCurrentBB = PRNextBB;
4600 if (!ReachedUnknownParallelRegions.empty()) {
4601 StateMachineIfCascadeCurrentBB->
setName(
4602 "worker_state_machine.parallel_region.fallback.execute");
4604 StateMachineIfCascadeCurrentBB)
4605 ->setDebugLoc(DLoc);
4608 StateMachineIfCascadeCurrentBB)
4612 OMPInfoCache.OMPBuilder.getOrCreateRuntimeFunction(
4613 M, OMPRTL___kmpc_kernel_end_parallel);
4616 OMPInfoCache.setCallingConvention(EndParallelFn, EndParallel);
4622 ->setDebugLoc(DLoc);
4632 KernelInfoState StateBefore = getState();
4638 struct UpdateKernelEnvCRAII {
4639 AAKernelInfoFunction &AA;
4641 UpdateKernelEnvCRAII(AAKernelInfoFunction &AA) : AA(AA) {}
4643 ~UpdateKernelEnvCRAII() {
4650 if (!AA.isValidState()) {
4651 AA.KernelEnvC = ExistingKernelEnvC;
4655 if (!AA.ReachedKnownParallelRegions.isValidState())
4656 AA.setUseGenericStateMachineOfKernelEnvironment(
4657 KernelInfo::getUseGenericStateMachineFromKernelEnvironment(
4658 ExistingKernelEnvC));
4660 if (!AA.SPMDCompatibilityTracker.isValidState())
4661 AA.setExecModeOfKernelEnvironment(
4662 KernelInfo::getExecModeFromKernelEnvironment(ExistingKernelEnvC));
4665 KernelInfo::getMayUseNestedParallelismFromKernelEnvironment(
4667 ConstantInt *NewMayUseNestedParallelismC = ConstantInt::get(
4668 MayUseNestedParallelismC->
getIntegerType(), AA.NestedParallelism);
4669 AA.setMayUseNestedParallelismOfKernelEnvironment(
4670 NewMayUseNestedParallelismC);
4677 if (isa<CallBase>(
I))
4680 if (!
I.mayWriteToMemory())
4682 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
4685 DepClassTy::OPTIONAL);
4688 DepClassTy::OPTIONAL);
4689 if (UnderlyingObjsAA &&
4690 UnderlyingObjsAA->forallUnderlyingObjects([&](
Value &Obj) {
4691 if (AA::isAssumedThreadLocalObject(A, Obj, *this))
4695 auto *CB = dyn_cast<CallBase>(&Obj);
4696 return CB && HS && HS->isAssumedHeapToStack(*CB);
4702 SPMDCompatibilityTracker.insert(&
I);
4706 bool UsedAssumedInformationInCheckRWInst =
false;
4707 if (!SPMDCompatibilityTracker.isAtFixpoint())
4708 if (!
A.checkForAllReadWriteInstructions(
4709 CheckRWInst, *
this, UsedAssumedInformationInCheckRWInst))
4712 bool UsedAssumedInformationFromReachingKernels =
false;
4713 if (!IsKernelEntry) {
4714 updateParallelLevels(
A);
4716 bool AllReachingKernelsKnown =
true;
4717 updateReachingKernelEntries(
A, AllReachingKernelsKnown);
4718 UsedAssumedInformationFromReachingKernels = !AllReachingKernelsKnown;
4720 if (!SPMDCompatibilityTracker.empty()) {
4721 if (!ParallelLevels.isValidState())
4722 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4723 else if (!ReachingKernelEntries.isValidState())
4724 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4730 for (
auto *
Kernel : ReachingKernelEntries) {
4731 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4733 if (CBAA && CBAA->SPMDCompatibilityTracker.isValidState() &&
4734 CBAA->SPMDCompatibilityTracker.isAssumed())
4738 if (!CBAA || !CBAA->SPMDCompatibilityTracker.isAtFixpoint())
4739 UsedAssumedInformationFromReachingKernels =
true;
4741 if (SPMD != 0 &&
Generic != 0)
4742 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4748 bool AllParallelRegionStatesWereFixed =
true;
4749 bool AllSPMDStatesWereFixed =
true;
4751 auto &CB = cast<CallBase>(
I);
4752 auto *CBAA =
A.getAAFor<AAKernelInfo>(
4756 getState() ^= CBAA->getState();
4757 AllSPMDStatesWereFixed &= CBAA->SPMDCompatibilityTracker.isAtFixpoint();
4758 AllParallelRegionStatesWereFixed &=
4759 CBAA->ReachedKnownParallelRegions.isAtFixpoint();
4760 AllParallelRegionStatesWereFixed &=
4761 CBAA->ReachedUnknownParallelRegions.isAtFixpoint();
4765 bool UsedAssumedInformationInCheckCallInst =
false;
4766 if (!
A.checkForAllCallLikeInstructions(
4767 CheckCallInst, *
this, UsedAssumedInformationInCheckCallInst)) {
4769 <<
"Failed to visit all call-like instructions!\n";);
4770 return indicatePessimisticFixpoint();
4775 if (!UsedAssumedInformationInCheckCallInst &&
4776 AllParallelRegionStatesWereFixed) {
4777 ReachedKnownParallelRegions.indicateOptimisticFixpoint();
4778 ReachedUnknownParallelRegions.indicateOptimisticFixpoint();
4783 if (!UsedAssumedInformationInCheckRWInst &&
4784 !UsedAssumedInformationInCheckCallInst &&
4785 !UsedAssumedInformationFromReachingKernels && AllSPMDStatesWereFixed)
4786 SPMDCompatibilityTracker.indicateOptimisticFixpoint();
4788 return StateBefore == getState() ? ChangeStatus::UNCHANGED
4789 : ChangeStatus::CHANGED;
4795 bool &AllReachingKernelsKnown) {
4799 assert(Caller &&
"Caller is nullptr");
4801 auto *CAA =
A.getOrCreateAAFor<AAKernelInfo>(
4803 if (CAA && CAA->ReachingKernelEntries.isValidState()) {
4804 ReachingKernelEntries ^= CAA->ReachingKernelEntries;
4810 ReachingKernelEntries.indicatePessimisticFixpoint();
4815 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4817 AllReachingKernelsKnown))
4818 ReachingKernelEntries.indicatePessimisticFixpoint();
4823 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4824 OMPInformationCache::RuntimeFunctionInfo &Parallel51RFI =
4825 OMPInfoCache.RFIs[OMPRTL___kmpc_parallel_51];
4830 assert(Caller &&
"Caller is nullptr");
4834 if (CAA && CAA->ParallelLevels.isValidState()) {
4840 if (Caller == Parallel51RFI.Declaration) {
4841 ParallelLevels.indicatePessimisticFixpoint();
4845 ParallelLevels ^= CAA->ParallelLevels;
4852 ParallelLevels.indicatePessimisticFixpoint();
4857 bool AllCallSitesKnown =
true;
4858 if (!
A.checkForAllCallSites(PredCallSite, *
this,
4861 ParallelLevels.indicatePessimisticFixpoint();
4868struct AAKernelInfoCallSite : AAKernelInfo {
4870 : AAKernelInfo(IRP,
A) {}
4874 AAKernelInfo::initialize(
A);
4876 CallBase &CB = cast<CallBase>(getAssociatedValue());
4881 if (AssumptionAA && AssumptionAA->hasAssumption(
"ompx_spmd_amenable")) {
4882 indicateOptimisticFixpoint();
4890 indicateOptimisticFixpoint();
4899 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
4900 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
4901 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
4903 if (!Callee || !
A.isFunctionIPOAmendable(*Callee)) {
4907 if (!AssumptionAA ||
4908 !(AssumptionAA->hasAssumption(
"omp_no_openmp") ||
4909 AssumptionAA->hasAssumption(
"omp_no_parallelism")))
4910 ReachedUnknownParallelRegions.insert(&CB);
4914 if (!SPMDCompatibilityTracker.isAtFixpoint()) {
4915 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4916 SPMDCompatibilityTracker.insert(&CB);
4921 indicateOptimisticFixpoint();
4927 if (NumCallees > 1) {
4928 indicatePessimisticFixpoint();
4935 case OMPRTL___kmpc_is_spmd_exec_mode:
4936 case OMPRTL___kmpc_distribute_static_fini:
4937 case OMPRTL___kmpc_for_static_fini:
4938 case OMPRTL___kmpc_global_thread_num:
4939 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
4940 case OMPRTL___kmpc_get_hardware_num_blocks:
4941 case OMPRTL___kmpc_single:
4942 case OMPRTL___kmpc_end_single:
4943 case OMPRTL___kmpc_master:
4944 case OMPRTL___kmpc_end_master:
4945 case OMPRTL___kmpc_barrier:
4946 case OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2:
4947 case OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2:
4948 case OMPRTL___kmpc_error:
4949 case OMPRTL___kmpc_flush:
4950 case OMPRTL___kmpc_get_hardware_thread_id_in_block:
4951 case OMPRTL___kmpc_get_warp_size:
4952 case OMPRTL_omp_get_thread_num:
4953 case OMPRTL_omp_get_num_threads:
4954 case OMPRTL_omp_get_max_threads:
4955 case OMPRTL_omp_in_parallel:
4956 case OMPRTL_omp_get_dynamic:
4957 case OMPRTL_omp_get_cancellation:
4958 case OMPRTL_omp_get_nested:
4959 case OMPRTL_omp_get_schedule:
4960 case OMPRTL_omp_get_thread_limit:
4961 case OMPRTL_omp_get_supported_active_levels:
4962 case OMPRTL_omp_get_max_active_levels:
4963 case OMPRTL_omp_get_level:
4964 case OMPRTL_omp_get_ancestor_thread_num:
4965 case OMPRTL_omp_get_team_size:
4966 case OMPRTL_omp_get_active_level:
4967 case OMPRTL_omp_in_final:
4968 case OMPRTL_omp_get_proc_bind:
4969 case OMPRTL_omp_get_num_places:
4970 case OMPRTL_omp_get_num_procs:
4971 case OMPRTL_omp_get_place_proc_ids:
4972 case OMPRTL_omp_get_place_num:
4973 case OMPRTL_omp_get_partition_num_places:
4974 case OMPRTL_omp_get_partition_place_nums:
4975 case OMPRTL_omp_get_wtime:
4977 case OMPRTL___kmpc_distribute_static_init_4:
4978 case OMPRTL___kmpc_distribute_static_init_4u:
4979 case OMPRTL___kmpc_distribute_static_init_8:
4980 case OMPRTL___kmpc_distribute_static_init_8u:
4981 case OMPRTL___kmpc_for_static_init_4:
4982 case OMPRTL___kmpc_for_static_init_4u:
4983 case OMPRTL___kmpc_for_static_init_8:
4984 case OMPRTL___kmpc_for_static_init_8u: {
4986 unsigned ScheduleArgOpNo = 2;
4987 auto *ScheduleTypeCI =
4989 unsigned ScheduleTypeVal =
4990 ScheduleTypeCI ? ScheduleTypeCI->getZExtValue() : 0;
4992 case OMPScheduleType::UnorderedStatic:
4993 case OMPScheduleType::UnorderedStaticChunked:
4994 case OMPScheduleType::OrderedDistribute:
4995 case OMPScheduleType::OrderedDistributeChunked:
4998 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
4999 SPMDCompatibilityTracker.insert(&CB);
5003 case OMPRTL___kmpc_target_init:
5006 case OMPRTL___kmpc_target_deinit:
5007 KernelDeinitCB = &CB;
5009 case OMPRTL___kmpc_parallel_51:
5010 if (!handleParallel51(
A, CB))
5011 indicatePessimisticFixpoint();
5013 case OMPRTL___kmpc_omp_task:
5015 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5016 SPMDCompatibilityTracker.insert(&CB);
5017 ReachedUnknownParallelRegions.insert(&CB);
5019 case OMPRTL___kmpc_alloc_shared:
5020 case OMPRTL___kmpc_free_shared:
5026 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5027 SPMDCompatibilityTracker.insert(&CB);
5033 indicateOptimisticFixpoint();
5037 A.getAAFor<
AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5038 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5039 CheckCallee(getAssociatedFunction(), 1);
5042 const auto &OptimisticEdges = AACE->getOptimisticEdges();
5043 for (
auto *Callee : OptimisticEdges) {
5044 CheckCallee(Callee, OptimisticEdges.size());
5055 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5056 KernelInfoState StateBefore = getState();
5058 auto CheckCallee = [&](
Function *
F,
int NumCallees) {
5059 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(
F);
5063 if (It == OMPInfoCache.RuntimeFunctionIDMap.end()) {
5066 A.getAAFor<AAKernelInfo>(*
this, FnPos, DepClassTy::REQUIRED);
5068 return indicatePessimisticFixpoint();
5069 if (getState() == FnAA->getState())
5070 return ChangeStatus::UNCHANGED;
5071 getState() = FnAA->getState();
5072 return ChangeStatus::CHANGED;
5075 return indicatePessimisticFixpoint();
5077 CallBase &CB = cast<CallBase>(getAssociatedValue());
5078 if (It->getSecond() == OMPRTL___kmpc_parallel_51) {
5079 if (!handleParallel51(
A, CB))
5080 return indicatePessimisticFixpoint();
5081 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5082 : ChangeStatus::CHANGED;
5088 (It->getSecond() == OMPRTL___kmpc_alloc_shared ||
5089 It->getSecond() == OMPRTL___kmpc_free_shared) &&
5090 "Expected a __kmpc_alloc_shared or __kmpc_free_shared runtime call");
5094 auto *HeapToSharedAA =
A.getAAFor<AAHeapToShared>(
5102 case OMPRTL___kmpc_alloc_shared:
5103 if ((!HeapToStackAA || !HeapToStackAA->isAssumedHeapToStack(CB)) &&
5104 (!HeapToSharedAA || !HeapToSharedAA->isAssumedHeapToShared(CB)))
5105 SPMDCompatibilityTracker.insert(&CB);
5107 case OMPRTL___kmpc_free_shared:
5108 if ((!HeapToStackAA ||
5109 !HeapToStackAA->isAssumedHeapToStackRemovedFree(CB)) &&
5111 !HeapToSharedAA->isAssumedHeapToSharedRemovedFree(CB)))
5112 SPMDCompatibilityTracker.insert(&CB);
5115 SPMDCompatibilityTracker.indicatePessimisticFixpoint();
5116 SPMDCompatibilityTracker.insert(&CB);
5118 return ChangeStatus::CHANGED;
5122 A.getAAFor<
AACallEdges>(*
this, getIRPosition(), DepClassTy::OPTIONAL);
5123 if (!AACE || !AACE->getState().isValidState() || AACE->hasUnknownCallee()) {
5124 if (
Function *
F = getAssociatedFunction())
5128 for (
auto *Callee : OptimisticEdges) {
5129 CheckCallee(Callee, OptimisticEdges.size());
5135 return StateBefore == getState() ? ChangeStatus::UNCHANGED
5136 : ChangeStatus::CHANGED;
5142 const unsigned int NonWrapperFunctionArgNo = 5;
5143 const unsigned int WrapperFunctionArgNo = 6;
5144 auto ParallelRegionOpArgNo = SPMDCompatibilityTracker.isAssumed()
5145 ? NonWrapperFunctionArgNo
5146 : WrapperFunctionArgNo;
5148 auto *ParallelRegion = dyn_cast<Function>(
5150 if (!ParallelRegion)
5153 ReachedKnownParallelRegions.insert(&CB);
5155 auto *FnAA =
A.getAAFor<AAKernelInfo>(
5157 NestedParallelism |= !FnAA || !FnAA->getState().isValidState() ||
5158 !FnAA->ReachedKnownParallelRegions.empty() ||
5159 !FnAA->ReachedKnownParallelRegions.isValidState() ||
5160 !FnAA->ReachedUnknownParallelRegions.isValidState() ||
5161 !FnAA->ReachedUnknownParallelRegions.empty();
5166struct AAFoldRuntimeCall
5167 :
public StateWrapper<BooleanState, AbstractAttribute> {
5173 void trackStatistics()
const override {}
5176 static AAFoldRuntimeCall &createForPosition(
const IRPosition &IRP,
5180 const std::string
getName()
const override {
return "AAFoldRuntimeCall"; }
5183 const char *getIdAddr()
const override {
return &
ID; }
5191 static const char ID;
5194struct AAFoldRuntimeCallCallSiteReturned : AAFoldRuntimeCall {
5196 : AAFoldRuntimeCall(IRP,
A) {}
5199 const std::string getAsStr(
Attributor *)
const override {
5200 if (!isValidState())
5203 std::string Str(
"simplified value: ");
5205 if (!SimplifiedValue)
5206 return Str + std::string(
"none");
5208 if (!*SimplifiedValue)
5209 return Str + std::string(
"nullptr");
5211 if (
ConstantInt *CI = dyn_cast<ConstantInt>(*SimplifiedValue))
5212 return Str + std::to_string(CI->getSExtValue());
5214 return Str + std::string(
"unknown");
5219 indicatePessimisticFixpoint();
5223 auto &OMPInfoCache =
static_cast<OMPInformationCache &
>(
A.getInfoCache());
5224 const auto &It = OMPInfoCache.RuntimeFunctionIDMap.find(Callee);
5225 assert(It != OMPInfoCache.RuntimeFunctionIDMap.end() &&
5226 "Expected a known OpenMP runtime function");
5228 RFKind = It->getSecond();
5230 CallBase &CB = cast<CallBase>(getAssociatedValue());
5231 A.registerSimplificationCallback(
5234 bool &UsedAssumedInformation) -> std::optional<Value *> {
5235 assert((isValidState() ||
5236 (SimplifiedValue && *SimplifiedValue ==
nullptr)) &&
5237 "Unexpected invalid state!");
5239 if (!isAtFixpoint()) {
5240 UsedAssumedInformation =
true;
5242 A.recordDependence(*
this, *AA, DepClassTy::OPTIONAL);
5244 return SimplifiedValue;
5251 case OMPRTL___kmpc_is_spmd_exec_mode:
5252 Changed |= foldIsSPMDExecMode(
A);
5254 case OMPRTL___kmpc_parallel_level:
5255 Changed |= foldParallelLevel(
A);
5257 case OMPRTL___kmpc_get_hardware_num_threads_in_block:
5258 Changed = Changed | foldKernelFnAttribute(
A,
"omp_target_thread_limit");
5260 case OMPRTL___kmpc_get_hardware_num_blocks:
5261 Changed = Changed | foldKernelFnAttribute(
A,
"omp_target_num_teams");
5273 if (SimplifiedValue && *SimplifiedValue) {
5276 A.deleteAfterManifest(
I);
5280 if (
auto *
C = dyn_cast<ConstantInt>(*SimplifiedValue))
5281 return OR <<
"Replacing OpenMP runtime call "
5283 <<
ore::NV(
"FoldedValue",
C->getZExtValue()) <<
".";
5284 return OR <<
"Replacing OpenMP runtime call "
5292 << **SimplifiedValue <<
"\n");
5294 Changed = ChangeStatus::CHANGED;
5301 SimplifiedValue =
nullptr;
5302 return AAFoldRuntimeCall::indicatePessimisticFixpoint();
5308 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5310 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5311 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5312 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5315 if (!CallerKernelInfoAA ||
5316 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5317 return indicatePessimisticFixpoint();
5319 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5321 DepClassTy::REQUIRED);
5323 if (!AA || !AA->isValidState()) {
5324 SimplifiedValue =
nullptr;
5325 return indicatePessimisticFixpoint();
5328 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5329 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5334 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5335 ++KnownNonSPMDCount;
5337 ++AssumedNonSPMDCount;
5341 if ((AssumedSPMDCount + KnownSPMDCount) &&
5342 (AssumedNonSPMDCount + KnownNonSPMDCount))
5343 return indicatePessimisticFixpoint();
5345 auto &Ctx = getAnchorValue().getContext();
5346 if (KnownSPMDCount || AssumedSPMDCount) {
5347 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5348 "Expected only SPMD kernels!");
5352 }
else if (KnownNonSPMDCount || AssumedNonSPMDCount) {
5353 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5354 "Expected only non-SPMD kernels!");
5362 assert(!SimplifiedValue &&
"SimplifiedValue should be none");
5365 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5366 : ChangeStatus::CHANGED;
5371 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5373 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5376 if (!CallerKernelInfoAA ||
5377 !CallerKernelInfoAA->ParallelLevels.isValidState())
5378 return indicatePessimisticFixpoint();
5380 if (!CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5381 return indicatePessimisticFixpoint();
5383 if (CallerKernelInfoAA->ReachingKernelEntries.empty()) {
5384 assert(!SimplifiedValue &&
5385 "SimplifiedValue should keep none at this point");
5386 return ChangeStatus::UNCHANGED;
5389 unsigned AssumedSPMDCount = 0, KnownSPMDCount = 0;
5390 unsigned AssumedNonSPMDCount = 0, KnownNonSPMDCount = 0;
5391 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5393 DepClassTy::REQUIRED);
5394 if (!AA || !AA->SPMDCompatibilityTracker.isValidState())
5395 return indicatePessimisticFixpoint();
5397 if (AA->SPMDCompatibilityTracker.isAssumed()) {
5398 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5403 if (AA->SPMDCompatibilityTracker.isAtFixpoint())
5404 ++KnownNonSPMDCount;
5406 ++AssumedNonSPMDCount;
5410 if ((AssumedSPMDCount + KnownSPMDCount) &&
5411 (AssumedNonSPMDCount + KnownNonSPMDCount))
5412 return indicatePessimisticFixpoint();
5414 auto &Ctx = getAnchorValue().getContext();
5418 if (AssumedSPMDCount || KnownSPMDCount) {
5419 assert(KnownNonSPMDCount == 0 && AssumedNonSPMDCount == 0 &&
5420 "Expected only SPMD kernels!");
5423 assert(KnownSPMDCount == 0 && AssumedSPMDCount == 0 &&
5424 "Expected only non-SPMD kernels!");
5427 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5428 : ChangeStatus::CHANGED;
5433 int32_t CurrentAttrValue = -1;
5434 std::optional<Value *> SimplifiedValueBefore = SimplifiedValue;
5436 auto *CallerKernelInfoAA =
A.getAAFor<AAKernelInfo>(
5439 if (!CallerKernelInfoAA ||
5440 !CallerKernelInfoAA->ReachingKernelEntries.isValidState())
5441 return indicatePessimisticFixpoint();
5444 for (
Kernel K : CallerKernelInfoAA->ReachingKernelEntries) {
5445 int32_t NextAttrVal =
K->getFnAttributeAsParsedInteger(Attr, -1);
5447 if (NextAttrVal == -1 ||
5448 (CurrentAttrValue != -1 && CurrentAttrValue != NextAttrVal))
5449 return indicatePessimisticFixpoint();
5450 CurrentAttrValue = NextAttrVal;
5453 if (CurrentAttrValue != -1) {
5454 auto &Ctx = getAnchorValue().getContext();
5458 return SimplifiedValue == SimplifiedValueBefore ? ChangeStatus::UNCHANGED
5459 : ChangeStatus::CHANGED;
5465 std::optional<Value *> SimplifiedValue;
5475 auto &RFI = OMPInfoCache.RFIs[RF];
5477 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &RFI);
5480 A.getOrCreateAAFor<AAFoldRuntimeCall>(
5482 DepClassTy::NONE,
false,
5488void OpenMPOpt::registerAAs(
bool IsModulePass) {
5498 A.getOrCreateAAFor<AAKernelInfo>(
5500 DepClassTy::NONE,
false,
5504 OMPInformationCache::RuntimeFunctionInfo &InitRFI =
5505 OMPInfoCache.RFIs[OMPRTL___kmpc_target_init];
5506 InitRFI.foreachUse(SCC, CreateKernelInfoCB);
5508 registerFoldRuntimeCall(OMPRTL___kmpc_is_spmd_exec_mode);
5509 registerFoldRuntimeCall(OMPRTL___kmpc_parallel_level);
5510 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_threads_in_block);
5511 registerFoldRuntimeCall(OMPRTL___kmpc_get_hardware_num_blocks);
5516 for (
int Idx = 0;
Idx < OMPInfoCache.ICVs.size() - 1; ++
Idx) {
5519 auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
5522 CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
5526 auto &CB = cast<CallBase>(*CI);
5529 A.getOrCreateAAFor<AAICVTracker>(CBPos);
5533 GetterRFI.foreachUse(SCC, CreateAA);
5542 for (
auto *
F : SCC) {
5543 if (
F->isDeclaration())
5549 if (
F->hasLocalLinkage()) {
5551 const auto *CB = dyn_cast<CallBase>(U.getUser());
5552 return CB && CB->isCallee(&U) &&
5553 A.isRunOn(const_cast<Function *>(CB->getCaller()));
5557 registerAAsForFunction(
A, *
F);
5567 if (
F.hasFnAttribute(Attribute::Convergent))
5571 if (
auto *LI = dyn_cast<LoadInst>(&
I)) {
5572 bool UsedAssumedInformation =
false;
5577 if (
auto *CI = dyn_cast<CallBase>(&
I)) {
5582 if (
auto *SI = dyn_cast<StoreInst>(&
I)) {
5586 if (
auto *FI = dyn_cast<FenceInst>(&
I)) {
5590 if (
auto *II = dyn_cast<IntrinsicInst>(&
I)) {
5591 if (II->getIntrinsicID() == Intrinsic::assume) {
5600const char AAICVTracker::ID = 0;
5601const char AAKernelInfo::ID = 0;
5603const char AAHeapToShared::ID = 0;
5604const char AAFoldRuntimeCall::ID = 0;
5606AAICVTracker &AAICVTracker::createForPosition(
const IRPosition &IRP,
5608 AAICVTracker *AA =
nullptr;
5616 AA =
new (
A.Allocator) AAICVTrackerFunctionReturned(IRP,
A);
5619 AA =
new (
A.Allocator) AAICVTrackerCallSiteReturned(IRP,
A);
5622 AA =
new (
A.Allocator) AAICVTrackerCallSite(IRP,
A);
5625 AA =
new (
A.Allocator) AAICVTrackerFunction(IRP,
A);
5634 AAExecutionDomainFunction *AA =
nullptr;
5644 "AAExecutionDomain can only be created for function position!");
5646 AA =
new (
A.Allocator) AAExecutionDomainFunction(IRP,
A);
5653AAHeapToShared &AAHeapToShared::createForPosition(
const IRPosition &IRP,
5655 AAHeapToSharedFunction *AA =
nullptr;
5665 "AAHeapToShared can only be created for function position!");
5667 AA =
new (
A.Allocator) AAHeapToSharedFunction(IRP,
A);
5674AAKernelInfo &AAKernelInfo::createForPosition(
const IRPosition &IRP,
5676 AAKernelInfo *AA =
nullptr;
5686 AA =
new (
A.Allocator) AAKernelInfoCallSite(IRP,
A);
5689 AA =
new (
A.Allocator) AAKernelInfoFunction(IRP,
A);
5696AAFoldRuntimeCall &AAFoldRuntimeCall::createForPosition(
const IRPosition &IRP,
5698 AAFoldRuntimeCall *AA =
nullptr;
5707 llvm_unreachable(
"KernelInfo can only be created for call site position!");
5709 AA =
new (
A.Allocator) AAFoldRuntimeCallCallSiteReturned(IRP,
A);
5730 if (Kernels.contains(&
F))
5732 for (
const User *U :
F.users())
5733 if (!isa<BlockAddress>(U))
5742 return ORA <<
"Could not internalize function. "
5743 <<
"Some optimizations may not be possible. [OMP140]";
5747 bool Changed =
false;
5755 if (!
F.isDeclaration() && !Kernels.contains(&
F) && IsCalled(
F) &&
5759 }
else if (!
F.hasLocalLinkage() && !
F.hasFnAttribute(Attribute::Cold)) {
5772 if (!
F.isDeclaration() && !InternalizedMap.
lookup(&
F)) {
5791 OMPInformationCache InfoCache(M, AG,
Allocator,
nullptr, PostLink);
5793 unsigned MaxFixpointIterations =
5805 return F.hasFnAttribute(
"kernel");
5810 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5811 Changed |= OMPOpt.run(
true);
5816 if (!
F.isDeclaration() && !Kernels.contains(&
F) &&
5817 !
F.hasFnAttribute(Attribute::NoInline))
5818 F.addFnAttr(Attribute::AlwaysInline);
5848 Module &M = *
C.begin()->getFunction().getParent();
5871 OMPInformationCache InfoCache(*(Functions.
back()->getParent()), AG,
Allocator,
5872 &Functions, PostLink);
5874 unsigned MaxFixpointIterations =
5888 OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache,
A);
5889 bool Changed = OMPOpt.run(
false);
5906 NamedMDNode *MD = M.getNamedMetadata(
"nvvm.annotations");
5915 MDString *KindID = dyn_cast<MDString>(
Op->getOperand(1));
5916 if (!KindID || KindID->
getString() !=
"kernel")
5920 mdconst::dyn_extract_or_null<Function>(
Op->getOperand(0));
5927 ++NumOpenMPTargetRegionKernels;
5928 Kernels.insert(KernelFn);
5930 ++NumNonOpenMPTargetRegionKernels;
5937 Metadata *MD = M.getModuleFlag(
"openmp");
5945 Metadata *MD = M.getModuleFlag(
"openmp-device");
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Expand Atomic instructions
static cl::opt< unsigned > SetFixpointIterations("attributor-max-iterations", cl::Hidden, cl::desc("Maximal number of fixpoint iterations."), cl::init(32))
static const Function * getParent(const Value *V)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
This file provides interfaces used to manipulate a call graph, regardless if it is a "old style" Call...
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
dxil pretty DXIL Metadata Pretty Printer
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file defines an array type that can be indexed using scoped enum values.
static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset, uint64_t BaseAddr, uint64_t Addr, SourceLocations &SrcLocs, llvm::Error &Err)
A Lookup helper functions.
This file provides utility analysis objects describing memory locations.
This file defines constans and helpers used when dealing with OpenMP.
This file defines constans that will be used by both host and device compilation.
static constexpr auto TAG
static cl::opt< bool > HideMemoryTransferLatency("openmp-hide-memory-transfer-latency", cl::desc("[WIP] Tries to hide the latency of host to device memory" " transfers"), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptStateMachineRewrite("openmp-opt-disable-state-machine-rewrite", cl::desc("Disable OpenMP optimizations that replace the state machine."), cl::Hidden, cl::init(false))
static cl::opt< bool > EnableParallelRegionMerging("openmp-opt-enable-merging", cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden, cl::init(false))
static cl::opt< bool > AlwaysInlineDeviceFunctions("openmp-opt-inline-device", cl::desc("Inline all applicible functions on the device."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleAfterOptimizations("openmp-opt-print-module-after", cl::desc("Print the current module after OpenMP optimizations."), cl::Hidden, cl::init(false))
#define KERNEL_ENVIRONMENT_CONFIGURATION_GETTER(MEMBER)
#define KERNEL_ENVIRONMENT_CONFIGURATION_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_CONFIGURATION_SETTER(MEMBER)
static cl::opt< bool > PrintOpenMPKernels("openmp-print-gpu-kernels", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptFolding("openmp-opt-disable-folding", cl::desc("Disable OpenMP optimizations involving folding."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintModuleBeforeOptimizations("openmp-opt-print-module-before", cl::desc("Print the current module before OpenMP optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SetFixpointIterations("openmp-opt-max-iterations", cl::Hidden, cl::desc("Maximal number of attributor iterations."), cl::init(256))
static cl::opt< bool > DisableInternalization("openmp-opt-disable-internalization", cl::desc("Disable function internalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > PrintICVValues("openmp-print-icv-values", cl::init(false), cl::Hidden)
static cl::opt< bool > DisableOpenMPOptimizations("openmp-opt-disable", cl::desc("Disable OpenMP specific optimizations."), cl::Hidden, cl::init(false))
static cl::opt< unsigned > SharedMemoryLimit("openmp-opt-shared-limit", cl::Hidden, cl::desc("Maximum amount of shared memory to use."), cl::init(std::numeric_limits< unsigned >::max()))
static cl::opt< bool > EnableVerboseRemarks("openmp-opt-verbose-remarks", cl::desc("Enables more verbose remarks."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptDeglobalization("openmp-opt-disable-deglobalization", cl::desc("Disable OpenMP optimizations involving deglobalization."), cl::Hidden, cl::init(false))
static cl::opt< bool > DisableOpenMPOptBarrierElimination("openmp-opt-disable-barrier-elimination", cl::desc("Disable OpenMP optimizations that eliminate barriers."), cl::Hidden, cl::init(false))
static cl::opt< bool > DeduceICVValues("openmp-deduce-icv-values", cl::init(false), cl::Hidden)
#define KERNEL_ENVIRONMENT_IDX(MEMBER, IDX)
#define KERNEL_ENVIRONMENT_GETTER(MEMBER, RETURNTYPE)
static cl::opt< bool > DisableOpenMPOptSPMDization("openmp-opt-disable-spmdization", cl::desc("Disable OpenMP optimizations involving SPMD-ization."), cl::Hidden, cl::init(false))
FunctionAnalysisManager FAM
This file builds on the ADT/GraphTraits.h file to build a generic graph post order iterator.
static StringRef getName(Value *V)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
This file implements a set that has insertion order iteration characteristics.
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static const int BlockSize
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef< StringLiteral > StandardNames)
Initialize the set of available library functions based on the specified target triple.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
A container for analyses that lazily runs them and caches their results.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
This class represents an incoming formal argument to a Function.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
size - Get the array size.
AttributeSet getParamAttrs(unsigned ArgNo) const
The attributes for the argument or parameter at the given index are returned.
LLVM Basic Block Representation.
iterator begin()
Instruction iterator methods.
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
reverse_iterator rbegin()
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
InstListType::reverse_iterator reverse_iterator
const Function * getParent() const
Return the enclosing method, or null if none.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, BasicBlock::iterator InsertBefore)
bool isConditional() const
BasicBlock * getSuccessor(unsigned i) const
Value * getCondition() const
Allocate memory in an ever growing pool, as if by bump-pointer.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
void setCallingConv(CallingConv::ID CC)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool doesNotAccessMemory(unsigned OpNo) const
bool isIndirectCall() const
Return true if the callsite is an indirect call.
bool isCallee(Value::const_user_iterator UI) const
Determine whether the passed iterator points to the callee operand's Use.
Value * getArgOperand(unsigned i) const
void setArgOperand(unsigned i, Value *v)
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
unsigned arg_size() const
AttributeList getAttributes() const
Return the parameter attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
bool isArgOperand(const Use *U) const
bool hasOperandBundles() const
Return true if this User has any operand bundles.
Function * getCaller()
Helper to get the caller (the parent function).
Wrapper to unify "old style" CallGraph and "new style" LazyCallGraph.
void initialize(CallGraph &CG, CallGraphSCC &SCC)
Initializers for usage outside of a CGSCC pass, inside a CGSCC pass in the old and new pass manager (...
This class represents a function call, abstracting a target machine's calling convention.
static CallInst * Create(FunctionType *Ty, Value *F, const Twine &NameStr, BasicBlock::iterator InsertBefore)
@ ICMP_SLT
signed less than
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
This is the shared class of boolean and integer constants.
IntegerType * getIntegerType() const
Variant of the getType() method to always return an IntegerType, which reduces the amount of casting ...
static ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
This is an important base class in LLVM.
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
ValueT lookup(const_arg_type_t< KeyT > Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
std::pair< iterator, bool > insert(const std::pair< KeyT, ValueT > &KV)
Implements a dense probed hash-table based set.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Instruction * findNearestCommonDominator(Instruction *I1, Instruction *I2) const
Find the nearest instruction I that dominates both I1 and I2, in the sense that a result produced bef...
An instruction for ordering other memory operations.
AtomicOrdering getOrdering() const
Returns the ordering constraint of this fence instruction.
A proxy from a FunctionAnalysisManager to an SCC.
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
const BasicBlock & getEntryBlock() const
const BasicBlock & front() const
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Argument * getArg(unsigned i) const
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
bool hasLocalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
PointerType * getType() const
Global values are always pointers.
@ PrivateLinkage
Like Internal, but omit from symbol table.
@ InternalLinkage
Rename collisions when linking (static functions).
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
InsertPoint - A saved insertion point.
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
An analysis over an "outer" IR unit that provides access to an analysis manager over an "inner" IR un...
bool isLifetimeStartOrEnd() const LLVM_READONLY
Return true if the instruction is a llvm.lifetime.start or llvm.lifetime.end marker.
bool mayWriteToMemory() const LLVM_READONLY
Return true if this instruction may modify memory.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
const Instruction * getPrevNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the previous non-debug instruction in the same basic block as 'this',...
const BasicBlock * getParent() const
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
const Function * getFunction() const
Return the function this instruction belongs to.
bool mayHaveSideEffects() const LLVM_READONLY
Return true if the instruction may have side effects.
bool mayReadFromMemory() const LLVM_READONLY
Return true if this instruction may read memory.
const Instruction * getNextNonDebugInstruction(bool SkipPseudoOp=false) const
Return a pointer to the next non-debug instruction in the same basic block as 'this',...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
void setSuccessor(unsigned Idx, BasicBlock *BB)
Update the specified successor to point at the provided block.
void moveBefore(Instruction *MovePos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
A node in the call graph.
An SCC of the call graph.
A lazily constructed view of the call graph of a module.
An instruction for reading from memory.
StringRef getString() const
void eraseFromParent()
This method unlinks 'this' from the containing function and deletes it.
A Module instance is used to store all the information related to an LLVM module.
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
iterator_range< op_iterator > operands()
An interface to create LLVM-IR for OpenMP directives.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM, LazyCallGraph &CG, CGSCCUpdateResult &UR)
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
static PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
static ReturnInst * Create(LLVMContext &C, Value *retVal, BasicBlock::iterator InsertBefore)
A vector that has set insertion semantics.
size_type size() const
Determine the number of elements in the SetVector.
const value_type & back() const
Return the last element of the SetVector.
iterator end()
Get an iterator to the end of the SetVector.
size_type count(const key_type &key) const
Count the number of elements of a given key in the SetVector.
iterator begin()
Get an iterator to the beginning of the SetVector.
bool insert(const value_type &X)
Insert a new element into the SetVector.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
bool starts_with(StringRef Prefix) const
Check if this string starts with the given Prefix.
Triple - Helper class for working with autoconf configuration names.
The instances of the Type class are immutable: once they are created, they are never changed.
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static Type * getVoidTy(LLVMContext &C)
static IntegerType * getInt16Ty(LLVMContext &C)
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
bool replaceUsesOfWith(Value *From, Value *To)
Replace uses of one Value with another.
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
void setName(const Twine &Name)
Change the name of the value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
iterator_range< user_iterator > users()
const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
unsigned getNumUses() const
This method computes the number of uses of this Value.
StringRef getName() const
Return a constant reference to the value's name.
An efficient, type-erasing, non-owning reference to a callable.
self_iterator getIterator()
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
A raw_ostream that writes to an std::string.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
GlobalVariable * getKernelEnvironementGVFromKernelInitCB(CallBase *KernelInitCB)
ConstantStruct * getKernelEnvironementFromKernelInitCB(CallBase *KernelInitCB)
bool isValidAtPosition(const ValueAndContext &VAC, InformationCache &InfoCache)
Return true if the value of VAC is a valid at the position of VAC, that is a constant,...
bool isPotentiallyAffectedByBarrier(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is potentially affected by a barrier.
bool isNoSyncInst(Attributor &A, const Instruction &I, const AbstractAttribute &QueryingAA)
Return true if I is a nosync instruction.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
E & operator^=(E &LHS, E RHS)
@ C
The default llvm calling convention, compatible with C.
initializer< Ty > init(const Ty &Val)
std::optional< const char * > toString(const std::optional< DWARFFormValue > &V)
Take an optional DWARFFormValue and try to extract a string value from it.
PointerTypeMap run(const Module &M)
Compute the PointerTypeMap for the module M.
constexpr uint64_t PointerSize
aarch64 pointer size.
bool isOpenMPDevice(Module &M)
Helper to determine if M is a OpenMP target offloading device module.
bool containsOpenMP(Module &M)
Helper to determine if M contains OpenMP.
InternalControlVar
IDs for all Internal Control Variables (ICVs).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
KernelSet getDeviceKernels(Module &M)
Get OpenMP device kernels in M.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
@ OMP_TGT_EXEC_MODE_GENERIC_SPMD
@ OMP_TGT_EXEC_MODE_GENERIC
bool isOpenMPKernel(Function &Fn)
Return true iff Fn is an OpenMP GPU kernel; Fn has the "kernel" attribute.
DiagnosticInfoOptimizationBase::Argument NV
const_iterator begin(StringRef path, Style style=Style::native)
Get begin iterator over path.
const_iterator end(StringRef path)
Get end iterator over path.
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
bool succ_empty(const Instruction *I)
std::string to_string(const T &Value)
bool operator!=(uint64_t V1, const APInt &V2)
Value * GetPointerBaseWithConstantOffset(Value *Ptr, int64_t &Offset, const DataLayout &DL, bool AllowNonInbounds=true)
Analyze the specified pointer to see if it can be expressed as a base pointer plus a constant offset.
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
@ FullLTOPostLink
Full LTO postlink (backend compile) phase.
@ ThinLTOPreLink
ThinLTO prelink (summary) phase.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool operator&=(SparseBitVector< ElementSize > *LHS, const SparseBitVector< ElementSize > &RHS)
auto count(R &&Range, const E &Element)
Wrapper function around std::count to count the number of times an element Element occurs in the give...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
Implement std::hash so that hash_code can be used in STL containers.
An abstract attribute for getting assumption information.
An abstract state for querying live call edges.
virtual const SetVector< Function * > & getOptimisticEdges() const =0
Get the optimistic edges.
bool IsReachedFromAlignedBarrierOnly
bool isExecutedByInitialThreadOnly(const Instruction &I) const
Check if an instruction is executed only by the initial thread.
static AAExecutionDomain & createForPosition(const IRPosition &IRP, Attributor &A)
Create an abstract attribute view for the position IRP.
virtual ExecutionDomainTy getFunctionExecutionDomain() const =0
virtual ExecutionDomainTy getExecutionDomain(const BasicBlock &) const =0
virtual bool isExecutedInAlignedRegion(Attributor &A, const Instruction &I) const =0
Check if the instruction I is executed in an aligned region, that is, the synchronizing effects befor...
virtual bool isNoOpFence(const FenceInst &FI) const =0
Helper function to determine if FI is a no-op given the information about its execution from ExecDoma...
static const char ID
Unique ID (due to the unique address)
An abstract interface for indirect call information interference.
An abstract interface for liveness abstract attribute.
An abstract interface for all memory location attributes (readnone/argmemonly/inaccessiblememonly/ina...
AccessKind
Simple enum to distinguish read/write/read-write accesses.
StateType::base_t MemoryLocationsKind
static bool isAlignedBarrier(const CallBase &CB, bool ExecutedAligned)
Helper function to determine if CB is an aligned (GPU) barrier.
An abstract Attribute for determining the necessity of the convergent attribute.
An abstract attribute for getting all assumption underlying objects.
Base struct for all "concrete attribute" deductions.
virtual ChangeStatus manifest(Attributor &A)
Hook for the Attributor to trigger the manifestation of the information represented by the abstract a...
virtual void initialize(Attributor &A)
Initialize the state with the information in the Attributor A.
virtual const std::string getAsStr(Attributor *A) const =0
This function should return the "summarized" assumed state as string.
virtual ChangeStatus updateImpl(Attributor &A)=0
The actual update/transfer function which has to be implemented by the derived classes.
virtual void trackStatistics() const =0
Hook to enable custom statistic tracking, called after manifest that resulted in a change if statisti...
virtual const char * getIdAddr() const =0
This function should return the address of the ID of the AbstractAttribute.
An interface to query the internal state of an abstract attribute.
virtual ChangeStatus indicatePessimisticFixpoint()=0
Indicate that the abstract state should converge to the pessimistic state.
virtual bool isAtFixpoint() const =0
Return if this abstract state is fixed, thus does not need to be updated if information changes as it...
virtual bool isValidState() const =0
Return if this abstract state is in a valid state.
virtual ChangeStatus indicateOptimisticFixpoint()=0
Indicate that the abstract state should converge to the optimistic state.
Wrapper for FunctionAnalysisManager.
Configuration for the Attributor.
std::function< void(Attributor &A, const Function &F)> InitializationCallback
Callback function to be invoked on internal functions marked live.
std::optional< unsigned > MaxFixpointIterations
Maximum number of iterations to run until fixpoint.
bool RewriteSignatures
Flag to determine if we rewrite function signatures.
OptimizationRemarkGetter OREGetter
IPOAmendableCBTy IPOAmendableCB
bool IsModulePass
Is the user of the Attributor a module pass or not.
bool DefaultInitializeLiveInternals
Flag to determine if we want to initialize all default AAs for an internal function marked live.
The fixpoint analysis framework that orchestrates the attribute deduction.
static bool isInternalizable(Function &F)
Returns true if the function F can be internalized.
std::function< std::optional< Constant * >(const GlobalVariable &, const AbstractAttribute *, bool &)> GlobalVariableSimplifictionCallbackTy
Register CB as a simplification callback.
std::function< bool(Attributor &, const AbstractAttribute *)> VirtualUseCallbackTy
static bool internalizeFunctions(SmallPtrSetImpl< Function * > &FnSet, DenseMap< Function *, Function * > &FnMap)
Make copies of each function in the set FnSet such that the copied version has internal linkage after...
std::function< std::optional< Value * >(const IRPosition &, const AbstractAttribute *, bool &)> SimplifictionCallbackTy
Register CB as a simplification callback.
Simple wrapper for a single bit (boolean) state.
Support structure for SCC passes to communicate updates the call graph back to the CGSCC pass manager...
Helper to describe and deal with positions in the LLVM-IR.
static const IRPosition callsite_returned(const CallBase &CB)
Create a position describing the returned value of CB.
static const IRPosition returned(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the returned value of F.
static const IRPosition value(const Value &V, const CallBaseContext *CBContext=nullptr)
Create a position describing the value of V.
static const IRPosition inst(const Instruction &I, const CallBaseContext *CBContext=nullptr)
Create a position describing the instruction I.
@ IRP_ARGUMENT
An attribute for a function argument.
@ IRP_RETURNED
An attribute for the function return value.
@ IRP_CALL_SITE
An attribute for a call site (function scope).
@ IRP_CALL_SITE_RETURNED
An attribute for a call site return value.
@ IRP_FUNCTION
An attribute for a function (scope).
@ IRP_FLOAT
A position that is not associated with a spot suitable for attributes.
@ IRP_CALL_SITE_ARGUMENT
An attribute for a call site argument.
@ IRP_INVALID
An invalid position.
static const IRPosition function(const Function &F, const CallBaseContext *CBContext=nullptr)
Create a position describing the function scope of F.
Kind getPositionKind() const
Return the associated position kind.
static const IRPosition callsite_function(const CallBase &CB)
Create a position describing the function scope of CB.
Function * getAnchorScope() const
Return the Function surrounding the anchor value.
bool isValidState() const override
See AbstractState::isValidState() NOTE: For now we simply pretend that the worst possible state is in...
ChangeStatus indicatePessimisticFixpoint() override
See AbstractState::indicatePessimisticFixpoint(...)
bool operator==(const IntegerStateBase< base_t, BestState, WorstState > &R) const
Equality for IntegerStateBase.
Direction
An enum for the direction of the loop.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Helper to tie a abstract state implementation to an abstract attribute.
StateType & getState() override
See AbstractAttribute::getState(...).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...