#define DEBUG_TYPE "openmp-ir-builder"
    cl::desc("Use optimistic attributes describing "
             "'as-if' properties of runtime calls."),
75 "openmp-ir-builder-unroll-threshold-factor",
cl::Hidden,
76 cl::desc(
"Factor for the unroll threshold to account for code "
77 "simplifications still taking place"),
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal: // Spelling matches the enum.
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }
  // Must not have both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;
  if (Features.count("+wavefrontsize64"))
    return omp::getAMDGPUGridValues<64>();
  return omp::getAMDGPUGridValues<32>();
}
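// Editorial aside (background, not taken from this file): the choice hinges
// solely on the "+wavefrontsize64" target feature; e.g. GCN-generation AMDGPU
// devices run wave64 while RDNA devices default to wave32, which is why the
// feature flag is the deciding input here.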
                          bool HasSimdModifier) {
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
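// Editorial illustration (not from the original source): schedule(static)
// with no chunk maps to BaseStatic, schedule(static, 8) to BaseStaticChunked,
// and schedule(simd:runtime) to BaseRuntimeSimd.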
                             bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}
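// Editorial note: the two special cases above exist because the runtime has
// no ordered variants of the simd schedules, so an ordered clause combined
// with a simd modifier degrades to the corresponding plain ordered schedule.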
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  }
  if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
      (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
      HasOrderedClause) {
    // The runtime uses monotonic by default, so no need to set the modifier.
    return ScheduleType;
  }

  return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
}
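// Editorial summary of the fallback above, per OpenMP 5.1 2.11.4: for static
// schedules or an ordered clause the runtime's default (monotonic) behavior
// is kept, otherwise the nonmonotonic modifier is implied; e.g.
// schedule(dynamic) with no modifiers ends up nonmonotonic.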
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
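// Editorial worked example (not from the original source): for
//   #pragma omp for schedule(nonmonotonic: dynamic, 4)
// the three steps compute BaseDynamicChunked, then or in ModifierUnordered
// (no ordered clause), then ModifierNonmonotonic; the combined value is what
// isValidWorkshareLoopScheduleType() checks above.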
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    Br->setSuccessor(0, Target);

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to the new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  New->replaceSuccessorsPhiUsesWith(Old, New);
                                     const Twine &Name = "",
                                     bool AsPtr = true) {

  if (AsPtr) {
    FakeVal = FakeValAddr;
/// Bit flags recording which 'requires' clauses have been seen.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
};
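// Editorial example: a translation unit containing
//   #pragma omp requires unified_shared_memory, dynamic_allocators
// yields RequiresFlags == (OMP_REQ_UNIFIED_SHARED_MEMORY |
// OMP_REQ_DYNAMIC_ALLOCATORS), i.e. 0x018.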
  OpenMPIRBuilderConfig() : RequiresFlags(OMP_REQ_UNDEFINED) {}

  OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
                        bool OpenMPOffloadMandatory,
                        bool HasRequiresReverseOffload,
                        bool HasRequiresUnifiedAddress,
                        bool HasRequiresUnifiedSharedMemory,
                        bool HasRequiresDynamicAllocators)
      : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
        OpenMPOffloadMandatory(OpenMPOffloadMandatory),
        RequiresFlags(OMP_REQ_UNDEFINED) {
    if (HasRequiresReverseOffload)
      RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
    if (HasRequiresUnifiedAddress)
      RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
    if (HasRequiresUnifiedSharedMemory)
      RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
    if (HasRequiresDynamicAllocators)
      RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  }
  bool hasRequiresReverseOffload() const {
    return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
  }
  bool hasRequiresUnifiedAddress() const {
    return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
  }
  bool hasRequiresUnifiedSharedMemory() const {
    return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
  }
  bool hasRequiresDynamicAllocators() const {
    return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
  }
                              : static_cast<int64_t>(OMP_REQ_NONE);
  void setHasRequiresReverseOffload(bool Value) {
    if (Value)
      RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
    else
      RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
  }
  void setHasRequiresUnifiedAddress(bool Value) {
    if (Value)
      RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
    else
      RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
  }
  void setHasRequiresUnifiedSharedMemory(bool Value) {
    if (Value)
      RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
    else
      RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
  }
  void setHasRequiresDynamicAllocators(bool Value) {
    if (Value)
      RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
    else
      RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
  }
  constexpr const size_t MaxDim = 3;

  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK = TargetLibraryInfo::getExtAttrForI32Return(
                     T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      Fn->addMetadata(
          LLVMContext::MD_callback,
          *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                2, {-1, -1}, /* VarArgsArePassed */ true)}));
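      // Editorial note: createCallbackEncoding(2, {-1, -1}, true) records
      // that argument 2 of the fork call is the callback (the microtask),
      // that the callback's first two parameters are unknown at the call
      // site (-1), and that the fork call's variadic arguments are forwarded
      // to it; interprocedural passes use this !callback metadata to follow
      // the otherwise-opaque indirect call.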
  assert(Fn && "Failed to create OpenMP runtime function");

  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  for (auto Inst = Block->getReverseIterator()->begin();
       Inst != Block->getReverseIterator()->end();) {
    if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");
    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);

      assert(OI.EntryBB->getTerminator() &&
             "Expected instructions to add in the outlined region entry");

      if (I.isTerminator())
        continue;
      I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());

      OI.EntryBB->moveBefore(&ArtificialEntry);
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *IdentData[] = {I32Null,
                           ConstantInt::get(Int32, uint32_t(LocFlags)),
                           ConstantInt::get(Int32, Reserve2Flags),
                           ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          return Ident = &GV;

    auto *GV = new GlobalVariable(M, OpenMPIRBuilder::Ident,
                                  /* isConstant = */ true,
                                  GlobalValue::PrivateLinkage, Initializer, "");
  SrcLocStrSize = LocStr.size();

  for (GlobalVariable &GV : M.globals())
    if (GV.isConstant() && GV.hasInitializer() &&
        GV.getInitializer() == Initializer)
      return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;

  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
944 "omp_global_thread_num");
                               bool ForceSimpleCall, bool CheckCancelFlag) {

  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;
                              omp::Directive CanceledDirective) {

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }
    if (CanceledDirective == OMPD_parallel) {
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false);
    }

  UI->eraseFromParent();
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");

  Value *Args[] = {RTLoc, DeviceID, NumTeams,
                   NumThreads, HostPtr, KernelArgsPtr};
  assert(OutlinedFnID && "Invalid outlined function ID!");

  Value *Return = nullptr;
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfo(CanceledDirective) &&
         "Unexpected cancellation!");
  // Add some known attributes.
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");
  Type *PtrTy = OMPIRBuilder->VoidPtr;

  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();

  Value *Args = ArgsAlloca;
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
  Value *Parallel51CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  I->eraseFromParent();
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {

      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  I->eraseFromParent();
                              omp::ProcBindKind ProcBind, bool IsCancellable) {

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    TIDAddr = new AddrSpaceCastInst(TIDAddrAlloca,
                                    PointerType::get(M.getContext(), 0),
                                    "tid.addr.ascast");

    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    if (IP.getBlock()->end() == IP.getPoint()) {

    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  assert(BodyGenCB && "Expected body generation callback!");
  if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
    return Err;

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);
1557 ".omp_par", ArgsInZeroAddressSpace);
1562 Extractor.
findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
    if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
      return GV->getValueType() == OpenMPIRBuilder::Ident;
  LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
    if (&V == TIDAddr || &V == ZeroAddr) {

    for (Use &U : V.uses())
      if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
        if (ParallelRegionBlockSet.count(UserI->getParent()))
          Uses.insert(&U);
    if (!V.getType()->isPointerTy()) {
    Value *ReplacementValue = nullptr;
    CallInst *CI = dyn_cast<CallInst>(&V);
    if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
      ReplacementValue = PrivTID;
    } else {
      InsertPointOrErrorTy AfterIP =
          PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);

      assert(ReplacementValue &&
             "Expected copy/create callback to set replacement value!");
      if (ReplacementValue == &V)
        return Error::success();
    }

    for (Use *UPtr : Uses)
      UPtr->set(ReplacementValue);
  for (Value *Input : Inputs) {
    if (Error Err = PrivHelper(*Input))
      return Err;
  }

  LLVM_DEBUG({
    for (Value *Output : Outputs)
      dbgs() << "Captured output: " << *Output << "\n";
  });
  assert(Outputs.empty() &&
         "OpenMP outlining should not produce live-out values!");
  LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
  LLVM_DEBUG({
    for (auto *BB : Blocks)
      dbgs() << " PBR: " << BB->getName() << "\n";
  });
  assert(FiniInfo.DK == OMPD_parallel &&
         "Unexpected finalization stack state!");

  if (Error Err = FiniCB(PreFiniIP))
    return Err;

  InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
  UI->eraseFromParent();
  if (Dependencies.empty())
    return nullptr;
  Type *DependInfo = OMPBuilder.DependInfo;

  Value *DepArray = nullptr;

  DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
  for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
    // Store the pointer to the variable.
    Value *Addr = Builder.CreateStructGEP(
        DependInfo, Base,
        static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
    // Store the size of the variable.
    Value *Size = Builder.CreateStructGEP(
        DependInfo, Base,
        static_cast<unsigned int>(RTLDependInfoFields::Len));
    Builder.CreateStore(
        Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
        Size);
    // Store the kind of dependency.
    Value *Flags = Builder.CreateStructGEP(
        DependInfo, Base,
        static_cast<unsigned int>(RTLDependInfoFields::Flags));
    Builder.CreateStore(
        ConstantInt::get(Builder.getInt8Ty(),
                         static_cast<unsigned int>(Dep.DepKind)),
        Flags);
  if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
    return Err;

  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
  OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
                      Mergeable, EventHandle, TaskAllocaBB,
                      ToBeDeleted](Function &OutlinedFn) mutable {
    assert(OutlinedFn.getNumUses() == 1 &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());

    bool HasShareds = StaleCI->arg_size() > 1;
    assert(ArgStructAlloca &&
           "Unable to find the alloca instruction corresponding to arguments "
           "for extracted function");

    assert(ArgStructType && "Unable to find struct type corresponding to "
                            "arguments for extracted function");
        TaskAllocFn, {Ident, ThreadID, Flags,
                      TaskSize, SharedsSize,

    Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
        OMPRTL___kmpc_task_allow_completion_event);
    Value *DepArray = nullptr;
    if (Dependencies.size()) {

      Value *Addr = Builder.CreateStructGEP(
          DependInfo, Base,
          static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
      Value *Size = Builder.CreateStructGEP(
          DependInfo, Base,
          static_cast<unsigned int>(RTLDependInfoFields::Len));
      Value *Flags = Builder.CreateStructGEP(
          DependInfo, Base,
          static_cast<unsigned int>(RTLDependInfoFields::Flags));
      Builder.CreateStore(
          ConstantInt::get(Builder.getInt8Ty(),
                           static_cast<unsigned int>(Dep.DepKind)),
          Flags);
    Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;

    if (Dependencies.size()) {

    if (Dependencies.size()) {
        Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });

    I->eraseFromParent();
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
    auto *CaseBB = IP.getBlock()->getSinglePredecessor();
    auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
    auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
  unsigned CaseNumber = 0;
  for (auto SectionCB : SectionCBs) {
  Value *LB = ConstantInt::get(I32Ty, 0);
  Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
  Value *ST = ConstantInt::get(I32Ty, 1);
  Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
      Loc, LoopBodyGenCB, LB, UB, ST,
      /*IsSigned=*/true, /*InclusiveStop=*/false, AllocaIP, "section_loop");
      applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
  assert(FiniInfo.DK == OMPD_sections &&
         "Unexpected finalization stack state!");

  AfterIP = {FiniBB, FiniBB->begin()};
    if (IP.getBlock()->end() != IP.getPoint())
      return FiniCB(IP);
  Directive OMPD = Directive::OMPD_sections;

  return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
                              /*Conditional*/ false, /*hasFinalize*/ true,
                              /*IsCancellable*/ true);
                               std::vector<WeakTrackingVH> &List) {

  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;

  GV->setSection("llvm.metadata");
Value *OpenMPIRBuilder::getGPUThreadID() {
  return Builder.CreateCall(
      getOrCreateRuntimeFunction(
          M, OMPRTL___kmpc_get_hardware_thread_id_in_block),
      {});
}

Value *OpenMPIRBuilder::getGPUWarpSize() {
  return Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
}
Value *OpenMPIRBuilder::getNVPTXWarpID() {
  unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
  return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
}

Value *OpenMPIRBuilder::getNVPTXLaneID() {
  unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
  assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
  unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
  return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
                           "nvptx_lane_id");
}
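// Editorial worked example: with the usual NVPTX warp size of 32,
// LaneIDBits = log2(32) = 5 and LaneIDMask = ~0u >> (32 - 5) = 0x1f, so the
// lane ID is the low five bits of the hardware thread ID; a 64-lane wavefront
// would give a 0x3f mask instead.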
Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
                                        Type *ToType) {
  Type *FromType = From->getType();
  uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
  uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
  assert(FromSize > 0 && "From size must be greater than zero");
  assert(ToSize > 0 && "To size must be greater than zero");
  if (FromType == ToType)
    return From;
  if (FromSize == ToSize)
    return Builder.CreateBitCast(From, ToType);
Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
                                                     Value *Element,
                                                     Type *ElementType,
                                                     Value *Offset) {
  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");

  Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
  Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
      Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
                : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
  Value *WarpSizeCast =
      Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
  Value *ShuffleCall =
      Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
  return castValueToType(AllocaIP, ShuffleCall, CastTy);
}
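// Editorial note: the runtime only provides 32- and 64-bit shuffles, so a
// 4-byte element travels through __kmpc_shuffle_int32 and an 8-byte element
// through __kmpc_shuffle_int64, with castValueToType() bridging the element
// type on both sides of the call.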
void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
                                      Value *DstAddr, Type *ElemType,
                                      Value *Offset, Type *ReductionArrayTy) {
  Value *ElemPtr = DstAddr;
  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if ((Size / IntSize) > 1) {
      Value *Res = createRuntimeShuffleFunction(

      Value *LocalElemPtr =

      Value *Res = createRuntimeShuffleFunction(
void OpenMPIRBuilder::emitReductionListCopy(
    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
    CopyOptionsTy CopyOptions) {
  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;

  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *SrcElementAddr = nullptr;
    Value *DestElementAddr = nullptr;
    Value *DestElementPtrAddr = nullptr;
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest reduce list to a newly
    // created element.
    bool UpdateDestListPtr = false;

    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, SrcBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});

    DestElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, DestBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2477 ".omp.reduction.element");
2480 DestElementAddr = DestAlloca;
2483 DestElementAddr->
getName() +
".ascast");
2485 ShuffleInElement =
true;
2486 UpdateDestListPtr =
true;
2498 if (ShuffleInElement) {
2499 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2500 RemoteLaneOffset, ReductionArrayTy);
      switch (RI.EvaluationKind) {
      case EvalKind::Complex: {
        Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
        Value *SrcReal = Builder.CreateLoad(
            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
        Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
        Value *SrcImg = Builder.CreateLoad(
            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

        Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 0, ".realp");
        Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
    if (UpdateDestListPtr) {
      Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
          DestElementAddr, Builder.getPtrTy(),
          DestElementAddr->getName() + ".ascast");
2563 "_omp_reduction_inter_warp_copy_func", &
M);
2586 "__openmp_nvptx_data_transfer_temporary_storage";
2590 if (!TransferMedium) {
  Value *GPUThreadID = getGPUThreadID();
  Value *LaneID = getNVPTXLaneID();
  Value *WarpID = getNVPTXWarpID();
  AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
      Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
  Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      NumWarpsAlloca, Builder.getPtrTy(),
      NumWarpsAlloca->getName() + ".ascast");
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
      unsigned NumIters = RealTySize / TySize;
      Value *Cnt = nullptr;
      Value *CntAddr = nullptr;

          CntAddr->getName() + ".ascast");
                        omp::Directive::OMPD_unknown,
      if (!BarrierIP1)
        return BarrierIP1.takeError();

      auto *RedListArrayTy =
          ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());

          {ConstantInt::get(IndexTy, 0),
           ConstantInt::get(IndexTy, En.index())});

                        omp::Directive::OMPD_unknown,
      if (!BarrierIP2)
        return BarrierIP2.takeError();
      Value *NumWarpsVal =
          Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
      Value *IsActiveThread =
          Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");

      Value *TargetElemPtrPtr =
          Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListAddrCast,
                                    {ConstantInt::get(IndexTy, 0),
                                     ConstantInt::get(IndexTy, En.index())});
      Value *TargetElemPtrVal =
          Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
      Value *TargetElemPtr = TargetElemPtrVal;

      Value *SrcMediumValue =
          Builder.CreateLoad(CType, SrcMediumPtrVal);

      RealTySize %= TySize;
Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
    AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                   {Builder.getPtrTy(), Builder.getInt16Ty(),
                                    Builder.getInt16Ty(), Builder.getInt16Ty()},
                                   /* IsVarArg */ false);
  Function *SarFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_shuffle_and_reduce_func", &M);
  Type *ReduceListArgType = ReduceListArg->getType();
  Type *LaneIDArgType = LaneIDArg->getType();
  Type *LaneIDArgPtrType = Builder.getPtrTy();
  Value *ReduceListAlloca = Builder.CreateAlloca(
      ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
  Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
                                             LaneIDArg->getName() + ".addr");
  Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
      LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
  Value *AlgoVerAlloca = Builder.CreateAlloca(
      LaneIDArgType, nullptr, AlgoVerArg->getName() + ".addr");
  Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
      RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");

  Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAlloca, ReduceListArgType,
      ReduceListAlloca->getName() + ".ascast");
  Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
  Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RemoteLaneOffsetAlloca, LaneIDArgPtrType,
      RemoteLaneOffsetAlloca->getName() + ".ascast");
  Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
  Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReductionListAlloca, Builder.getPtrTy(),
      RemoteReductionListAlloca->getName() + ".ascast");
  Value *RemoteLaneOffset =
      Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);

  emitReductionListCopy(
      AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
      ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
  Value *RemoteOffsetComp =

      ->addFnAttr(Attribute::NoUnwind);

      ReductionInfos, RemoteListAddrCast, ReduceList);
Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
    AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *LtGCFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_list_to_global_copy_func", &M);

  Value *BufferArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, BufferArg->getName() + ".addr");
  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");

  Value *LocalReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
  Value *BufferArgVal =
      Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    auto *RedListArrayTy =
        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, LocalReduceList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());
    switch (RI.EvaluationKind) {
    case EvalKind::Complex: {
      Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, ElemPtr, 0, 0, ".realp");
      Value *SrcReal = Builder.CreateLoad(
          RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
      Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, ElemPtr, 0, 1, ".imagp");
      Value *SrcImg = Builder.CreateLoad(
          RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

      Value *GlobRealPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, GlobVal, 0, 0, ".realp");
      Value *GlobImgPtr = Builder.CreateConstInBoundsGEP2_32(
          RI.ElementType, GlobVal, 0, 1, ".imagp");
Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *LtGRFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_list_to_global_reduce_func", &M);

  Value *BufferArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, BufferArg->getName() + ".addr");
  auto *RedListArrayTy =
      ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());

  Value *LocalReduceList =
      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");

  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");
  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList, Builder.getPtrTy(),
      LocalReduceList->getName() + ".ascast");
  for (auto En : enumerate(ReductionInfos)) {
    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, LocalReduceListAddrCast,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());
  }

  Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceListArgAddrCast})
      ->addFnAttr(Attribute::NoUnwind);
Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
    AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *GtLCFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_global_to_list_copy_func", &M);

  Value *BufferArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, BufferArg->getName() + ".addr");
  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");

  Value *LocalReduceList =
      Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
  for (auto En : enumerate(ReductionInfos)) {
    auto *RedListArrayTy =
        ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
    Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, LocalReduceList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());
Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
    ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
    Type *ReductionsBufferTy, AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(
      Builder.getVoidTy(),
      {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
      /* IsVarArg */ false);
  Function *GtLRFunc =
      Function::Create(FuncTy, GlobalVariable::InternalLinkage,
                       "_omp_reduction_global_to_list_reduce_func", &M);

  Value *BufferArgAlloca = Builder.CreateAlloca(
      Builder.getPtrTy(), nullptr, BufferArg->getName() + ".addr");

  Value *LocalReduceList =
      Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");

  Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      BufferArgAlloca, Builder.getPtrTy(),
      BufferArgAlloca->getName() + ".ascast");
  Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListArgAlloca, Builder.getPtrTy(),
      ReduceListArgAlloca->getName() + ".ascast");
  Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList, Builder.getPtrTy(),
      LocalReduceList->getName() + ".ascast");
  for (auto En : enumerate(ReductionInfos)) {
    Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
        RedListArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
        ReductionsBufferTy, BufferVD, 0, En.index());
  }

  Builder.CreateCall(ReduceFn, {ReduceListArgAddrCast, LocalReduceListAddrCast})
      ->addFnAttr(Attribute::NoUnwind);
std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
  std::string Suffix =
      createPlatformSpecificName({"omp", "reduction", "reduction_func"});
  return (Name + Suffix).str();
}
    ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                   {Builder.getPtrTy(), Builder.getPtrTy()},
                                   /* IsVarArg */ false);
  std::string Name = getReductionFuncName(ReducerName);
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;

  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, RHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType(),
        RHSI8Ptr->getName() + ".ascast");
    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, LHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
      if (!AfterIP)
        return AfterIP.takeError();

  return ReductionFunc;
  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *LHSFixupPtr, *RHSFixupPtr;
    Builder.restoreIP(RI.ReductionGenClang(
        Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));

    LHSFixupPtr->replaceUsesWithIf(
        LHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
    RHSFixupPtr->replaceUsesWithIf(
        RHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
  }

  return ReductionFunc;
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert((RI.ReductionGen || RI.ReductionGenClang) &&
           "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
           "type");
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
    bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
    unsigned ReductionBufNum, Value *SrcLocInfo) {
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;

  if (GridValue.has_value())
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
  for (auto En : enumerate(ReductionInfos)) {
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
  Function *SarFunc =
      emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    ReductionTypeArgs.emplace_back(En.value().ElementType);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
    Function *LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Function *GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Value *KernelTeamsReductionPtr = Builder.CreateCall(
        RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    for (auto En : enumerate(ReductionInfos)) {
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      LHSPtr->replaceUsesWithIf(LHS, [CurFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               CurFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [CurFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               CurFunc;
      });
    }
  } else {
    assert(false && "Unhandled ReductionGenCBKind");
  }
3658 ".omp.reduction.func", &M);
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert(RI.ReductionGen && "expected non-null reduction generator callback");
    assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
           "expected variables and their private equivalents to have the same "
           "type");
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  unsigned NumReductions = ReductionInfos.size();
  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
      CanGenerateAtomic ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                        : IdentFlag(0));
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
  Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall = Builder.CreateCall(
      ReduceFunc,
      {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
       ReductionFunc, Lock},
      "reduce");
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *RedValue = nullptr;
    if (!IsByRef[En.index()]) {
      RedValue = Builder.CreateLoad(RI.ElementType, RI.Variable,
                                    "red.value." + Twine(En.index()));
    }
    Value *PrivateRedValue =
        Builder.CreateLoad(RI.ElementType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, RI.Variable);

  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *LHSPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *RHSPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RHSArrayPtr, 0, En.index());

    if (!IsByRef[En.index()])
  Directive OMPD = Directive::OMPD_master;
  Value *Args[] = {Ident, ThreadId};

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
  Directive OMPD = Directive::OMPD_masked;
  Value *ArgsEnd[] = {Ident, ThreadId};

  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);

      "omp_" + Name + ".next", /*HasNUW=*/true);

  CL->Header = Header;

      NextBB, NextBB, Name);
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,

  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  Value *CountIfLooping;
  if (InclusiveStop) {

      "omp_" + Name + ".tripcount");
  if (Bitwidth == 32)
    return getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
  if (Bitwidth == 64)
    return getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
                                          InsertPointTy AllocaIP,
                                          bool NeedsBarrier) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  Type *IVTy = IV->getType();

  Constant *One = ConstantInt::get(IVTy, 1);
  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));

  Builder.CreateCall(StaticInit,
                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Zero});
  CLI->setTripCount(TripCount);
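  // Editorial note: __kmpc_for_static_init_{4u,8u} rewrites the
  // lower/upper-bound (and stride/last-iteration) slots in place with the
  // chunk owned by the calling thread, which is why the trip count is
  // recomputed here from the adjusted inclusive bounds (ub - lb + 1).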
                      omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
    if (!BarrierIP)
      return BarrierIP.takeError();
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
                                                 CanonicalLoopInfo *CLI,
                                                 InsertPointTy AllocaIP,
                                                 bool NeedsBarrier,
                                                 Value *ChunkSize) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(ChunkSize && "Chunk size is required");

  Type *IVTy = IV->getType();
  assert(IVTy->getIntegerBitWidth() <= 64 &&
         "Max supported tripcount bitwidth is 64 bits");
  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
                                                        : Type::getInt64Ty(Ctx);
  Constant *One = ConstantInt::get(InternalIVTy, 1);

  Value *PLowerBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
  Value *PUpperBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");

  Value *CastedChunkSize =
      Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
  Value *CastedTripCount =
      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));