#define DEBUG_TYPE "openmp-ir-builder"

    cl::desc("Use optimistic attributes describing "
             "'as-if' properties of runtime calls."),

    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
                                                 bool HasSimdModifier) {
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
                                                     bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;

  if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
      (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
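  // Rough illustration of how the helpers above compose (a sketch, not a
  // quotation of the original source): for `schedule(nonmonotonic: dynamic)`
  // on a loop without an `ordered` clause, the effective sched_type is built
  // as BaseDynamicChunked (base kind), OR'd with ModifierUnordered (ordering
  // step), OR'd with ModifierNonmonotonic (monotonicity step), i.e. the
  // UnorderedDynamicChunked enumerator plus the nonmonotonic modifier bit.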
                         bool HasSimdModifier, bool HasMonotonicModifier,
                         bool HasNonmonotonicModifier, bool HasOrderedClause) {
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    Br->setSuccessor(0, Target);
  NewBr->setDebugLoc(DL);
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());
  New->replaceSuccessorsPhiUsesWith(Old, New);
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
enum OpenMPOffloadingRequiresDirFlags {
  OMP_REQ_UNDEFINED = 0x000,
  OMP_REQ_NONE = 0x001,
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
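  // Each `requires` clause corresponds to one bit and the bits combine with
  // bitwise OR. As an illustration using the values above, a translation unit
  // with `requires unified_shared_memory dynamic_allocators` would end up
  // with RequiresFlags == 0x008 | 0x010 == 0x018.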
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;

  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;

  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;

  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;

                             : static_cast<int64_t>(OMP_REQ_NONE);

    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;

    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;

    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;

    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
  Value *NumThreads3D =
  ArgsVector = {Version,
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)

                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
               TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, false);                                   \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
#include "llvm/Frontend/OpenMP/OMPKinds.def"
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
    Fn = M.getFunction(Str);                                                   \
#include "llvm/Frontend/OpenMP/OMPKinds.def"

#define OMP_RTL(Enum, Str, ...)                                                \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
            LLVMContext::MD_callback,
                                              2, {-1, -1}, true)}));

  assert(Fn && "Failed to create OpenMP runtime function");

  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
    ParallelRegionBlockSet.clear();

    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

                      << " Exit: " << OI.ExitBB->getName() << "\n");
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)

    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
           "OpenMP outlined functions should not return a value!");

      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
             "Expected instructions to add in the outlined region entry");
        if (I.isTerminator())
        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());

      OI.EntryBB->moveBefore(&ArtificialEntry);

    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);

    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
                                                unsigned Reserve2Flags) {
  LocFlags |= OMP_IDENT_FLAG_KMPC;

      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)

        M, OpenMPIRBuilder::Ident,

  SrcLocStrSize = LocStr.size();

      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)

                                                  unsigned Line, unsigned Column,
  Buffer.append(FunctionName);
  Buffer.append(std::to_string(Line));
  Buffer.append(std::to_string(Column));

  StringRef UnknownLoc = ";unknown;unknown;0;0;;";

    if (DIFile *DIF = DIL->getFile())
      if (std::optional<StringRef> Source = DIF->getSource())
                             DIL->getColumn(), SrcLocStrSize);
813 "omp_global_thread_num");
818 bool ForceSimpleCall,
bool CheckCancelFlag) {
826 bool ForceSimpleCall,
bool CheckCancelFlag) {
833 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
836 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
839 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
842 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
845 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
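  // The ident flag records which construct requested the barrier (worksharing
  // loop, sections, single, an explicit `barrier`, or a generic implicit one),
  // so the runtime and tools can attribute it correctly; the flag is selected
  // from the requesting directive as shown above.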
  bool UseCancelBarrier =

                              UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                               : OMPRTL___kmpc_barrier),

  if (UseCancelBarrier && CheckCancelFlag)

                                                omp::Directive CanceledDirective) {

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
                    omp::Directive::OMPD_unknown, false,

  UI->eraseFromParent();
938 ".omp_offloading.entry_name");
953 M, OpenMPIRBuilder::OffloadEntry,
960 Entry->setAlignment(
Align(1));
971 auto *KernelArgsPtr =
984 NumThreads, HostPtr, KernelArgsPtr};
1012 assert(OutlinedFnID &&
"Invalid outlined function ID!");
1016 Value *Return =
nullptr;
1036 Args.NumTeams, Args.NumThreads,
1037 OutlinedFnID, ArgsVector));
1050 emitBlock(OffloadContBlock, CurFn,
true);
                                               omp::Directive CanceledDirective,
         "Unexpected cancellation!");
    omp::ProcBindKind ProcBind, bool IsCancellable) {

  if (ProcBind != OMP_PROC_BIND_default) {

    if (IP.getBlock()->end() == IP.getPoint()) {
      assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
             IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
             "Unexpected insertion point for finalization call!");
  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  assert(BodyGenCB && "Expected body generation callback!");

  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
  if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
          llvm::LLVMContext::MD_callback,

    OutlinedFn.addParamAttr(0, Attribute::NoAlias);
    OutlinedFn.addParamAttr(1, Attribute::NoAlias);
    OutlinedFn.addFnAttr(Attribute::NoUnwind);
    OutlinedFn.addFnAttr(Attribute::NoRecurse);
    assert(OutlinedFn.arg_size() >= 2 &&
           "Expected at least tid and bounded tid as arguments");
    unsigned NumCapturedVars = OutlinedFn.arg_size() - 2;

    CallInst *CI = cast<CallInst>(OutlinedFn.user_back());

    Value *ForkCallArgs[] = {
    RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));

    if (IfCondition && NumCapturedVars == 0) {
    if (IfCondition && RealArgs.back()->getType() != PtrTy)

    I->eraseFromParent();
  assert(FiniInfo.DK == OMPD_parallel &&
         "Unexpected finalization stack state!");

  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
  auto PrivHelper = [&](Value &V) {
    if (&V == TIDAddr || &V == ZeroAddr) {

    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))

    if (!V.getType()->isPointerTy()) {

    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
      ReplacementValue = PrivTID;
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)

      UPtr->set(ReplacementValue);
  for (Value *Input : Inputs) {

  for (Value *Output : Outputs)
         "OpenMP outlining should not produce live-out values!");

  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
    dbgs() << " PBR: " << BB->getName() << "\n";

  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
  UI->eraseFromParent();
                    Dependencies](Function &OutlinedFn) {

    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());

    bool HasShareds = StaleCI->arg_size() > 0;

    assert(ArgStructAlloca &&
           "Unable to find the alloca instruction corresponding to arguments "
           "for extracted function");
    assert(ArgStructType && "Unable to find struct type corresponding to "
                            "arguments for extracted function");

    WrapperArgTys.push_back(OutlinedFn.getArg(0)->getType());
        (Twine(OutlinedFn.getName()) + ".wrapper").str(),

        TaskAllocFn, {Ident, ThreadID, Flags,
                      TaskSize, SharedsSize,

    Value *DepArrayPtr = nullptr;
    if (Dependencies.size()) {

            static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
            static_cast<unsigned int>(RTLDependInfoFields::Len));
            static_cast<unsigned int>(RTLDependInfoFields::Flags));
                             static_cast<unsigned int>(Dep.DepKind)),

    Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;

    if (Dependencies.size()) {

  BodyGenCB(TaskAllocaIP, TaskBodyIP);
    if (IP.getBlock()->end() != IP.getPoint())

    auto *CaseBB = IP.getBlock()->getSinglePredecessor();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);

    unsigned CaseNumber = 0;
    for (auto SectionCB : SectionCBs) {
          M.getContext(), "omp_section_loop.body.case", CurFn, Continue);

      Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");

  applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);

  assert(FiniInfo.DK == OMPD_sections &&
         "Unexpected finalization stack state!");

  AfterIP = {FiniBB, FiniBB->begin()};
    if (IP.getBlock()->end() != IP.getPoint())

  Directive OMPD = Directive::OMPD_sections;

  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,

      M.getDataLayout().getDefaultGlobalsAddressSpace(),
      ".omp.reduction.func", &M);
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert(RI.ReductionGen && "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");

  unsigned NumReductions = ReductionInfos.size();
  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
                           "private.red.var." + Twine(Index) + ".casted");

  Value *RedArrayPtr =

  bool CanGenerateAtomic =
                                  ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE

  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);

  Value *Lock = getOMPCriticalRegionLock(".reduction");
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
      {Ident, ThreadId, NumVariables, RedArraySize,
       RedArrayPtr, ReductionFunc, Lock},
  for (auto En : enumerate(ReductionInfos)) {
                                 "red.value." + Twine(En.index()));
    Value *PrivateRedValue =
                                 "red.private.value." + Twine(En.index()));

      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);

  if (CanGenerateAtomic) {

  for (auto En : enumerate(ReductionInfos)) {
        RedArrayTy, LHSArrayPtr, 0, En.index());
        RedArrayTy, RHSArrayPtr, 0, En.index());
  Directive OMPD = Directive::OMPD_master;

  Value *Args[] = {Ident, ThreadId};

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,

  Directive OMPD = Directive::OMPD_masked;

  Value *ArgsEnd[] = {Ident, ThreadId};

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,

                                   "omp_" + Name + ".next", true);
  CL->Header = Header;

                                         NextBB, NextBB, Name);
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,

  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  Value *CountIfLooping;
  if (InclusiveStop) {
                                     "omp_" + Name + ".tripcount");
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);

                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
         "Require dedicated allocate IP");

  Type *IVTy = IV->getType();

      I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));

                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Zero});
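  // The __kmpc_for_static_init_* argument order above roughly mirrors the
  // runtime entry point: source location, global thread id, sched_type, then
  // the addresses the runtime fills in (last-iteration flag, lower bound,
  // upper bound, stride), followed by the loop increment (One) and chunk size
  // (Zero, i.e. "let the runtime pick" for an unchunked static schedule).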
  CLI->setTripCount(TripCount);

                  omp::Directive::OMPD_for, false,

                                   bool NeedsBarrier, Value *ChunkSize) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(ChunkSize && "Chunk size is required");

  Type *IVTy = IV->getType();
         "Max supported tripcount bitwidth is 64 bits");
                                         : Type::getInt64Ty(Ctx);
  Value *PLowerBound =
  Value *PUpperBound =

  Value *CastedChunkSize =
  Value *CastedTripCount =

      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));

                      SchedulingType, PLastIter,
                      PLowerBound, PUpperBound,

  Value *FirstChunkStart =
  Value *FirstChunkStop =
  Value *NextChunkStride =

  Value *DispatchCounter;
      FirstChunkStart, CastedTripCount, NextChunkStride,

  Value *IsLastChunk =
  Value *CountUntilOrigTripCount =
      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
  Value *BackcastedChunkTC =
  CLI->setTripCount(BackcastedChunkTC);

  Value *BackcastedDispatchCounter =
    bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind,
    llvm::Value *ChunkSize, bool HasSimdModifier, bool HasMonotonicModifier,
    bool HasNonmonotonicModifier, bool HasOrderedClause) {
      SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                   OMPScheduleType::ModifierOrdered;
  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
  case OMPScheduleType::BaseStatic:
    assert(!ChunkSize && "No chunk size with static-chunked schedule");
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);

  case OMPScheduleType::BaseStaticChunked:
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,

  case OMPScheduleType::BaseRuntime:
  case OMPScheduleType::BaseAuto:
  case OMPScheduleType::BaseGreedy:
  case OMPScheduleType::BaseBalanced:
  case OMPScheduleType::BaseSteal:
  case OMPScheduleType::BaseGuidedSimd:
  case OMPScheduleType::BaseRuntimeSimd:
           "schedule type does not support user-defined chunk sizes");
  case OMPScheduleType::BaseDynamicChunked:
  case OMPScheduleType::BaseGuidedChunked:
  case OMPScheduleType::BaseGuidedIterativeChunked:
  case OMPScheduleType::BaseGuidedAnalyticalChunked:
  case OMPScheduleType::BaseStaticBalancedChunked:
    return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                     NeedsBarrier, ChunkSize);
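  // Summary of the dispatch above: unordered static schedules lower to the
  // __kmpc_for_static_init_* family (plain or chunked), while ordered loops
  // and dynamically scheduled kinds (dynamic/guided/runtime/auto and their
  // chunked variants) go through the __kmpc_dispatch_init_* /
  // __kmpc_dispatch_next_* path, with the effective sched_type carrying the
  // ordering and monotonicity modifier bits.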
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);

        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);

        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
        M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);

  assert(CLI->isValid() && "Requires a valid canonical loop");
         "Require dedicated allocate IP");
         "Require valid schedule type");

  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  Type *IVTy = IV->getType();

                     {SrcLoc, ThreadNum, SchedulingType, One,
                      UpperBound, One, Chunk});

                      PLowerBound, PUpperBound, PStride});

  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);

  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);
                  omp::Directive::OMPD_for, false,

  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    for (Use &U : BB->uses()) {
      auto *UseInst = dyn_cast<Instruction>(U.getUser());
      if (BBsToErase.count(UseInst->getParent()))

  bool Changed = false;
    if (HasRemainingUses(BB)) {
      BBsToErase.erase(BB);
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

    return Loops.front();

    Loop->collectControlBlocks(OldControlBBs);

  if (ComputeIP.isSet())

  Value *CollapsedTripCount = nullptr;
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;

      OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  Value *Leftover = Result->getIndVar();
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();
    NewIndVars[i] = NewIndVar;

  NewIndVars[0] = Leftover;

  BasicBlock *ContinueBlock = Result->getBody();
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
    ContinueBlock = nullptr;
    ContinuePred = NextSrc;

  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  ContinueWith(Result->getLatch(), nullptr);

  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
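  // The collapsed loop runs for the product of the original trip counts, and
  // each original induction variable is recomputed from the collapsed one
  // (conceptually a mixed-radix decomposition: repeated remainder/quotient by
  // the inner trip counts, with the final quotient, Leftover, feeding the
  // outermost loop). The replaceAllUsesWith above rewires the original bodies
  // to these recomputed values.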
std::vector<CanonicalLoopInfo *>
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

    Loop->collectControlBlocks(OldControlBBs);

    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());

  for (int i = 0; i < NumLoops - 1; ++i) {

  for (int i = 0; i < NumLoops; ++i) {
    Value *OrigTripCount = OrigTripCounts[i];

    Value *FloorTripOverflow =
                                       "omp_floor" + Twine(i) + ".tripcount", true);

  std::vector<CanonicalLoopInfo *> Result;
  Result.reserve(NumLoops * 2);

  auto EmbeddNewLoop =
      [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
            DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);

    Enter = EmbeddedLoop->getBody();
    Continue = EmbeddedLoop->getLatch();
    OutroInsertBefore = EmbeddedLoop->getLatch();
    return EmbeddedLoop;

                            const Twine &NameBase) {
          EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
      Result.push_back(EmbeddedLoop);

  EmbeddNewLoops(FloorCount, "floor");

  for (int i = 0; i < NumLoops; ++i) {
    Value *FloorIsEpilogue =
    Value *TileTripCount =

  EmbeddNewLoops(TileCounts, "tile");

  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
      BodyEnter = nullptr;
      BodyEntered = ExitBB;

  for (int i = 0; i < NumLoops; ++i) {
    Value *OrigIndVar = OrigIndVars[i];
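    // Each original induction variable is then rebuilt from the two loops
    // that now surround the body: conceptually OrigIndVar = FloorIndVar *
    // TileSize + TileIndVar, with the last (partial) tile bounded by the
    // trip-count computation above so the rebuilt value never exceeds the
    // original iteration range.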
  if (Properties.empty())

  assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");

  assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");

    if (I.mayReadOrWriteMemory()) {
      I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
                               const Twine &NamePrefix) {
    SplitBefore = dyn_cast<Instruction>(IfCond);

    VMap[Block] = NewBB;

  if (TargetTriple.isX86()) {
    if (Features.lookup("avx512f"))
    else if (Features.lookup("avx"))

  if (TargetTriple.isPPC())
  if (TargetTriple.isWasm())
                              Value *IfCond, OrderKind Order,

  if (AlignedVars.size()) {
    for (auto &AlignedItem : AlignedVars) {
      Value *AlignedPtr = AlignedItem.first;
      Value *Alignment = AlignedItem.second;
                                  AlignedPtr, Alignment);

    createIfVersion(CanonicalLoop, IfCond, VMap, "simd");

         "Cannot find value which corresponds to original loop latch");
  assert(isa<BasicBlock>(MappedLatch) &&
         "Cannot cast mapped latch block value to BasicBlock");
  BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);

  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
        Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));

        Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));

  if (Simdlen || Safelen) {
    ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
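    // When both clauses are present, simdlen takes precedence over safelen;
    // the chosen width is then attached as loop metadata (the vectorize-width
    // hint, following the llvm.loop.vectorize.enable pattern used above) so
    // the loop vectorizer can honor it.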
static std::unique_ptr<TargetMachine>
  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const std::string &Triple = M->getTargetTriple();

      std::nullopt, OptLevel));

      [&](const Function &F) { return TM->getTargetTransformInfo(F); });

  assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");

                              nullptr, ORE, static_cast<int>(OptLevel),

             << " Threshold=" << UP.Threshold << "\n"
             << " PartialOptSizeThreshold="
    if (auto *Load = dyn_cast<LoadInst>(&I)) {
      Ptr = Load->getPointerOperand();
    } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
      Ptr = Store->getPointerOperand();

    Ptr = Ptr->stripPointerCasts();

    if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
      if (Alloca->getParent() == &F->getEntryBlock())

  unsigned NumInlineCandidates;
  bool NotDuplicatable;

  LLVM_DEBUG(dbgs() << "Estimated loop size is " << LoopSizeIC << "\n");

  if (NotDuplicatable || Convergent || !LoopSizeIC.isValid()) {
  unsigned LoopSize = *LoopSizeIC.getValue();

  int MaxTripCount = 0;
  bool MaxOrZero = false;
  unsigned TripMultiple = 0;

  bool UseUpperBound = false;
      MaxTripCount, MaxOrZero, TripMultiple, LoopSize, UP, PP,
  unsigned Factor = UP.Count;
  LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");

  assert(Factor >= 0 && "Unroll factor must not be negative");

        Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
    *UnrolledCLI = Loop;

         "unrolling only makes sense with a factor of 2 or larger");
  Type *IndVarTy = Loop->getIndVarType();

  std::vector<CanonicalLoopInfo *> LoopNest =

       Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
  (*UnrolledCLI)->assertOK();

  Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};

  Directive OMPD = Directive::OMPD_single;

  Value *Args[] = {Ident, ThreadId};

  EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,

                omp::Directive::OMPD_unknown, false,

  Directive OMPD = Directive::OMPD_critical;

  Value *LockVar = getOMPCriticalRegionLock(CriticalName);
  Value *Args[] = {Ident, ThreadId, LockVar};

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                                 const Twine &Name, bool IsDependSource) {
                     [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
         "OpenMP runtime requires depend vec with i64 type");

  for (unsigned I = 0; I < NumLoops; ++I) {

  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  Directive OMPD = Directive::OMPD_ordered;

  Value *Args[] = {Ident, ThreadId};

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  if (!isa_and_nonnull<BranchInst>(SplitPos))

  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

         "Unexpected control flow graph state!!");
  emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
         "Unexpected Control Flow State!");

         "Unexpected Insertion point location!");

  auto InsertBB = merged ? ExitPredBB : ExitBB;
  if (!isa_and_nonnull<BranchInst>(SplitPos))

  if (!Conditional || !EntryCall)

    UI->eraseFromParent();

    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,

         "Unexpected finalization stack state!");
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

  if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
                                        "copyin.not.master.end");
    Value *DependenceAddress, bool HaveNowaitClause) {

  if (Device == nullptr)
  if (NumDependences == nullptr) {

      Ident, ThreadId, InteropVar, InteropTypeVal,
      Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};

                                               Value *NumDependences,
                                               Value *DependenceAddress,
                                               bool HaveNowaitClause) {
  if (Device == nullptr)
  if (NumDependences == nullptr) {

      Ident, ThreadId, InteropVar, Device,
      NumDependences, DependenceAddress, HaveNowaitClauseVal};

                                              Value *NumDependences,
                                              Value *DependenceAddress,
                                              bool HaveNowaitClause) {
  if (Device == nullptr)
  if (NumDependences == nullptr) {

      Ident, ThreadId, InteropVar, Device,
      NumDependences, DependenceAddress, HaveNowaitClauseVal};
  const std::string DebugPrefix = "_debug__";
    KernelName = KernelName.drop_back(DebugPrefix.length());

      omp::RuntimeFunction::OMPRTL___kmpc_target_init);

  Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
  Constant *DynamicEnvironmentInitializer =
      DynamicEnvironmentInitializer, DynamicEnvironmentName,
      DL.getDefaultGlobalsAddressSpace());

      DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
          ? DynamicEnvironmentGV
                                     DynamicEnvironmentPtr);

      ConfigurationEnvironment, {
          UseGenericStateMachineVal,
          MayUseNestedParallelismVal,
      KernelEnvironment, {
          ConfigurationEnvironmentInitializer,
  Twine KernelEnvironmentName = KernelName + "_kernel_environment";
      KernelEnvironmentInitializer, KernelEnvironmentName,
      DL.getDefaultGlobalsAddressSpace());

      KernelEnvironmentGV->getType() == KernelEnvironmentPtr
          ? KernelEnvironmentGV
                                     KernelEnvironmentPtr);
  UI->eraseFromParent();

      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);

  if (Features.count("+wavefrontsize64"))
    return omp::getAMDGPUGridValues<64>();
  return omp::getAMDGPUGridValues<32>();
void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
    Function *OutlinedFn, int32_t NumTeams, int32_t NumThreads) {

    OutlinedFn->addFnAttr("omp_target_num_teams", std::to_string(NumTeams));

  if (NumThreads > 0) {
      OutlinedFn->addFnAttr("amdgpu-flat-work-group-size",
                            "1," + llvm::utostr(NumThreads));

    MDNode *ExistingOp = nullptr;
      auto *Kernel = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
      auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
      if (!Prop || Prop->getString() != "maxntidx")

      auto *OldVal = dyn_cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
          cast<ConstantInt>(OldVal->getValue())->getZExtValue();
              std::min(OldLimit, NumThreads))));

    OutlinedFn->addFnAttr("omp_target_thread_limit",
                          std::to_string(NumThreads));
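  // Launch bounds end up recorded in several forms: the generic
  // omp_target_num_teams / omp_target_thread_limit function attributes, the
  // AMDGPU "amdgpu-flat-work-group-size" attribute, and an NVPTX "maxntidx"
  // annotation, where an existing, tighter limit is preserved via std::min
  // rather than widened.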
  assert(OutlinedFn && "The outlined function must exist if embedded");

Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
         "Named kernel already exists?");

    int32_t NumThreads, bool IsOffloadEntry, Function *&OutlinedFn,
                             ? GenerateFunctionCallback(EntryFnName)

  if (!IsOffloadEntry)

  std::string EntryFnIDName =
          ? std::string(EntryFnName)

      EntryInfo, OutlinedFn, EntryFnName, EntryFnIDName, NumTeams, NumThreads);

                                                    int32_t NumThreads) {
  setOutlinedTargetRegionFunctionAttributes(OutlinedFn, NumTeams, NumThreads);
  auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
  auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
      EntryInfo, EntryAddr, OutlinedFnID,
  return OutlinedFnID;
  bool IsStandAlone = !BodyGenCB;

    Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,

    assert(MapperFunc && "MapperFunc missing for standalone target data");

        omp::OMPRTL___tgt_target_data_begin_mapper);

    for (auto DeviceMap : Info.DevicePtrInfoMap) {
      if (isa<AllocaInst>(DeviceMap.second.second)) {