//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternately creating instructions will cause them to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use from the schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
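
// Worked example of the composition above (illustrative, following the three
// helpers): `schedule(dynamic, c)` with no modifiers and no ordered clause
// yields BaseDynamicChunked | ModifierUnordered | ModifierNonmonotonic, while
// plain `schedule(static)` yields BaseStatic | ModifierUnordered with no
// monotonicity bit, since monotonic is already the runtime's default for
// static schedules.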

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
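
// Typical usage of the splitBB helpers (an illustrative sketch, not code from
// this file): split everything after the current insertion point into a new
// block and keep emitting code in front of the newly created branch.
//   BasicBlock *ContBB = splitBB(Builder, /*CreateBranch=*/true, "cont");
//   // New instructions are now emitted before the branch to ContBB.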

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *
createFakeIntVal(IRBuilderBase &Builder,
                 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                 llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                 const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
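
// For example (illustrative, not code from this file): a translation unit
// containing
//   #pragma omp requires unified_shared_memory
// would be configured via setHasRequiresUnifiedSharedMemory(true), after
// which getRequiresFlags() reports OMP_REQ_UNIFIED_SHARED_MEMORY (0x008)
// instead of OMP_REQ_NONE.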

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
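
// The callback annotation added above takes roughly this shape in the emitted
// module (an illustrative sketch with hypothetical metadata slot numbers):
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}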

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }
775 // Remove work items that have been completed.
776 OutlineInfos = std::move(DeferredOutlines);
777
778 // The createTarget functions embeds user written code into
779 // the target region which may inject allocas which need to
780 // be moved to the entry block of our target or risk malformed
781 // optimisations by later passes, this is only relevant for
782 // the device pass which appears to be a little more delicate
783 // when it comes to optimisations (however, we do not block on
784 // that here, it's up to the inserter to the list to do so).
785 // This notbaly has to occur after the OutlinedInfo candidates
786 // have been extracted so we have an end product that will not
787 // be implicitly adversely affected by any raises unless
788 // intentionally appended to the list.
789 // NOTE: This only does so for ConstantData, it could be extended
790 // to ConstantExpr's with further effort, however, they should
791 // largely be folded when they get here. Extending it to runtime
792 // defined/read+writeable allocation sizes would be non-trivial
793 // (need to factor in movement of any stores to variables the
794 // allocation size depends on, as well as the usual loads,
795 // otherwise it'll yield the wrong result after movement) and
796 // likely be more suitable as an LLVM optimisation pass.
799
800 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
801 [](EmitMetadataErrorKind Kind,
802 const TargetRegionEntryInfo &EntryInfo) -> void {
803 errs() << "Error of kind: " << Kind
804 << " when emitting offload entries and metadata during "
805 "OMPIRBuilder finalization \n";
806 };
807
810
811 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
812 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
813 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
814 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
815 }
816}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
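
// An ident_t global created here looks roughly like this in the emitted IR
// (an illustrative sketch; names and the string size are hypothetical):
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 23, ptr @.str }, align 8
// where the second field carries the flags (OMP_IDENT_FLAG_KMPC == 0x02).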

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(LocStr, /* Name */ "",
                                           /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
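
// For example, FunctionName="foo", FileName="bar.c", Line=3, Column=7
// (hypothetical values) yields the location string ";bar.c;foo;3;7;;",
// following the layout built above and the unknown-location default below.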

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
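
// Illustrative IR for a plain, non-cancellable barrier (hypothetical value
// names; @0 and @1 stand for the ident globals created above):
//   %tid = call i32 @__kmpc_global_thread_num(ptr @0)
//   call void @__kmpc_barrier(ptr @1, i32 %tid)
// The cancellable variant calls i32 @__kmpc_cancel_barrier instead and feeds
// its result into emitCancelationCheckImpl.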

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective,
    FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    if (Error Err = ExitCB(Builder.saveIP()))
      return Err;
  auto &FI = FinalizationStack.back();
  if (Error Err = FI.FiniCB(Builder.saveIP()))
    return Err;

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
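
// Control-flow sketch of the emitted check (illustrative, hypothetical names):
//   BB:        %cmp = icmp eq i32 %cancel_flag, 0
//              br i1 %cmp, label %BB.cont, label %BB.cncl
//   BB.cncl:   ; finalization callbacks run here, then branch to region exit
//   BB.cont:   ; code generation continues here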

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
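
// Illustrative shape of the rewritten device call site (hypothetical names,
// assuming a single captured pointer stored into %args):
//   call void @__kmpc_parallel_51(ptr %ident, i32 %tid, i32 1, i32 -1,
//       i32 -1, ptr @foo..omp_par, ptr null, ptr %args, i64 1)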

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      // - The callback callee is argument number 2 (microtask).
      // - The first two arguments of the callback callee are unknown (-1).
      // - All variadic arguments to the __kmpc_fork_call are passed to the
      //   callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
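
// Illustrative shape of the rewritten host call site (hypothetical names,
// assuming two captured pointers %a and %b):
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)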

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
    return Err;

  LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
                              /*CollectGlobalInputs=*/true);

  Inputs.remove_if([&](Value *I) {
    if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
      return GV->getValueType() == OpenMPIRBuilder::Ident;

    return false;
  });
1573
1574 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1575
1576 FunctionCallee TIDRTLFn =
1577 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1578
1579 auto PrivHelper = [&](Value &V) -> Error {
1580 if (&V == TIDAddr || &V == ZeroAddr) {
1581 OI.ExcludeArgsFromAggregate.push_back(&V);
1582 return Error::success();
1583 }
1584
1586 for (Use &U : V.uses())
1587 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1588 if (ParallelRegionBlockSet.count(UserI->getParent()))
1589 Uses.insert(&U);
1590
1591 // __kmpc_fork_call expects extra arguments as pointers. If the input
1592 // already has a pointer type, everything is fine. Otherwise, store the
1593 // value onto stack and load it back inside the to-be-outlined region. This
1594 // will ensure only the pointer will be passed to the function.
1595 // FIXME: if there are more than 15 trailing arguments, they must be
1596 // additionally packed in a struct.
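 // As a rough illustration (a sketch, not the emitted IR itself), forwarding a
 // non-pointer input 'v' of type T works as follows:
 // \code{c}
 // /* in the caller, before branching to the to-be-outlined region: */
 // T *v_reloaded = alloca(T);
 // *v_reloaded = v;
 // /* inside the to-be-outlined region, next to its allocas: */
 // T inner = *v_reloaded; /* uses of v inside the region are rewired to inner */
 // \endcode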
1597 Value *Inner = &V;
1598 if (!V.getType()->isPointerTy()) {
1599 IRBuilder<>::InsertPointGuard Guard(Builder);
1600 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1601
1602 Builder.restoreIP(OuterAllocaIP);
1603 Value *Ptr =
1604 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1605
1606 // Store to stack at end of the block that currently branches to the entry
1607 // block of the to-be-outlined region.
1608 Builder.SetInsertPoint(InsertBB,
1609 InsertBB->getTerminator()->getIterator());
1610 Builder.CreateStore(&V, Ptr);
1611
1612 // Load back next to allocations in the to-be-outlined region.
1613 Builder.restoreIP(InnerAllocaIP);
1614 Inner = Builder.CreateLoad(V.getType(), Ptr);
1615 }
1616
1617 Value *ReplacementValue = nullptr;
1618 CallInst *CI = dyn_cast<CallInst>(&V);
1619 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1620 ReplacementValue = PrivTID;
1621 } else {
1622 InsertPointOrErrorTy AfterIP =
1623 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1624 if (!AfterIP)
1625 return AfterIP.takeError();
1626 Builder.restoreIP(*AfterIP);
1627 InnerAllocaIP = {
1628 InnerAllocaIP.getBlock(),
1629 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1630
1631 assert(ReplacementValue &&
1632 "Expected copy/create callback to set replacement value!");
1633 if (ReplacementValue == &V)
1634 return Error::success();
1635 }
1636
1637 for (Use *UPtr : Uses)
1638 UPtr->set(ReplacementValue);
1639
1640 return Error::success();
1641 };
1642
1643 // Reset the inner alloca insertion as it will be used for loading the values
1644 // wrapped into pointers before passing them into the to-be-outlined region.
1645 // Configure it to insert immediately after the fake use of zero address so
1646 // that they are available in the generated body and so that the
1647 // OpenMP-related values (thread ID and zero address pointers) remain leading
1648 // in the argument list.
1649 InnerAllocaIP = IRBuilder<>::InsertPoint(
1650 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1651
1652 // Reset the outer alloca insertion point to the entry of the relevant block
1653 // in case it was invalidated.
1654 OuterAllocaIP = IRBuilder<>::InsertPoint(
1655 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1656
1657 for (Value *Input : Inputs) {
1658 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1659 if (Error Err = PrivHelper(*Input))
1660 return Err;
1661 }
1662 LLVM_DEBUG({
1663 for (Value *Output : Outputs)
1664 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1665 });
1666 assert(Outputs.empty() &&
1667 "OpenMP outlining should not produce live-out values!");
1668
1669 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1670 LLVM_DEBUG({
1671 for (auto *BB : Blocks)
1672 dbgs() << " PBR: " << BB->getName() << "\n";
1673 });
1674
1675 // Adjust the finalization stack, verify the adjustment, and call the
1676 // finalize function a last time to finalize values between the pre-fini
1677 // block and the exit block if we left the parallel region "the normal way".
1678 auto FiniInfo = FinalizationStack.pop_back_val();
1679 (void)FiniInfo;
1680 assert(FiniInfo.DK == OMPD_parallel &&
1681 "Unexpected finalization stack state!");
1682
1683 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1684
1685 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1686 if (Error Err = FiniCB(PreFiniIP))
1687 return Err;
1688
1689 // Register the outlined info.
1690 addOutlineInfo(std::move(OI));
1691
1692 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1693 UI->eraseFromParent();
1694
1695 return AfterIP;
1696}
1697
1698void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1699 // Build call void __kmpc_flush(ident_t *loc)
1700 uint32_t SrcLocStrSize;
1701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1702 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1703
1704 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1705}
1706
1707void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1708 if (!updateToLocation(Loc))
1709 return;
1710 emitFlush(Loc);
1711}
1712
1713void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1714 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1715 // global_tid);
1716 uint32_t SrcLocStrSize;
1717 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1718 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1719 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1720
1721 // Ignore return result until untied tasks are supported.
1722 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1723 Args);
1724}
1725
1726void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1727 if (!updateToLocation(Loc))
1728 return;
1729 emitTaskwaitImpl(Loc);
1730}
1731
1732void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1733 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1734 uint32_t SrcLocStrSize;
1735 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1736 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1737 Constant *I32Null = ConstantInt::getNullValue(Int32);
1738 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1739
1740 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1741 Args);
1742}
1743
1744void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1745 if (!updateToLocation(Loc))
1746 return;
1747 emitTaskyieldImpl(Loc);
1748}
1749
1750// Processes the dependencies in Dependencies and does the following
1751// - Allocates space on the stack for an array of DependInfo objects
1752// - Populates each DependInfo object with relevant information about
1753// the corresponding dependence.
1754// - All code is inserted in the entry block of the current function.
1755static Value *emitTaskDependencies(
1756 OpenMPIRBuilder &OMPBuilder,
1757 SmallVector<OpenMPIRBuilder::DependData> &Dependencies) {
1758 // Early return if we have no dependencies to process.
1759 if (Dependencies.empty())
1760 return nullptr;
1761
1762 // Given a vector of DependData objects, in this function we create an
1763 // array on the stack that holds kmp_depend_info objects corresponding
1764 // to each dependency. This is then passed to the OpenMP runtime.
1765 // For example, if there are 'n' dependencies then the following pseudo
1766 // code is generated. Assume the first dependence is on a variable 'a'
1767 //
1768 // \code{c}
1769 // DepArray = alloc(n x sizeof(kmp_depend_info));
1770 // idx = 0;
1771 // DepArray[idx].base_addr = ptrtoint(&a);
1772 // DepArray[idx].len = 8;
1773 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1774 // ++idx;
1775 // DepArray[idx].base_addr = ...;
1776 // \endcode
1777
1778 IRBuilderBase &Builder = OMPBuilder.Builder;
1779 Type *DependInfo = OMPBuilder.DependInfo;
1780 Module &M = OMPBuilder.M;
1781
1782 Value *DepArray = nullptr;
1783 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1784 Builder.SetInsertPoint(
1785 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1786
1787 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1788 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1789
1790 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1791 Value *Base =
1792 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1793 // Store the pointer to the variable
1794 Value *Addr = Builder.CreateStructGEP(
1795 DependInfo, Base,
1796 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1797 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1798 Builder.CreateStore(DepValPtr, Addr);
1799 // Store the size of the variable
1800 Value *Size = Builder.CreateStructGEP(
1801 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1802 Builder.CreateStore(
1803 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1804 Size);
1805 // Store the dependency kind
1806 Value *Flags = Builder.CreateStructGEP(
1807 DependInfo, Base,
1808 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1809 Builder.CreateStore(
1810 ConstantInt::get(Builder.getInt8Ty(),
1811 static_cast<unsigned int>(Dep.DepKind)),
1812 Flags);
1813 }
1814 Builder.restoreIP(OldIP);
1815 return DepArray;
1816}
1817
1818OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1819 const LocationDescription &Loc, InsertPointTy AllocaIP,
1820 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1821 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle) {
1822
1823 if (!updateToLocation(Loc))
1824 return InsertPointTy();
1825
1826 uint32_t SrcLocStrSize;
1827 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1828 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1829 // The current basic block is split into four basic blocks. After outlining,
1830 // they will be mapped as follows:
1831 // ```
1832 // def current_fn() {
1833 // current_basic_block:
1834 // br label %task.exit
1835 // task.exit:
1836 // ; instructions after task
1837 // }
1838 // def outlined_fn() {
1839 // task.alloca:
1840 // br label %task.body
1841 // task.body:
1842 // ret void
1843 // }
1844 // ```
1845 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1846 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1847 BasicBlock *TaskAllocaBB =
1848 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1849
1850 InsertPointTy TaskAllocaIP =
1851 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1852 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1853 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1854 return Err;
1855
1856 OutlineInfo OI;
1857 OI.EntryBB = TaskAllocaBB;
1858 OI.OuterAllocaBB = AllocaIP.getBlock();
1859 OI.ExitBB = TaskExitBB;
1860
1861 // Add the thread ID argument.
1862 SmallVector<Instruction *, 4> ToBeDeleted;
1863 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1864 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1865
1866 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1867 Mergeable, EventHandle, TaskAllocaBB,
1868 ToBeDeleted](Function &OutlinedFn) mutable {
1869 // Replace the stale CI with an appropriate RTL function call.
1870 assert(OutlinedFn.getNumUses() == 1 &&
1871 "there must be a single user for the outlined function");
1872 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1873
1874 // HasShareds is true if any variables are captured in the outlined region,
1875 // false otherwise.
1876 bool HasShareds = StaleCI->arg_size() > 1;
1877 Builder.SetInsertPoint(StaleCI);
1878
1879 // Gather the arguments for emitting the runtime call for
1880 // @__kmpc_omp_task_alloc
1881 Function *TaskAllocFn =
1882 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1883
1884 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task
1885 // allocation call.
1886 Value *ThreadID = getOrCreateThreadID(Ident);
1887
1888 // Argument - `flags`
1889 // Task is tied iff (Flags & 1) == 1.
1890 // Task is untied iff (Flags & 1) == 0.
1891 // Task is final iff (Flags & 2) == 2.
1892 // Task is not final iff (Flags & 2) == 0.
1893 // Task is mergeable iff (Flags & 4) == 4.
1894 // Task is not mergeable iff (Flags & 4) == 0.
1895 // TODO: Handle the other flags.
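 // As a worked example of the encoding above: a tied task carrying both the
 // final and mergeable clauses would be allocated with
 // Flags == 1 | 2 | 4 == 7, while an untied task with neither clause uses
 // Flags == 0.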
1896 Value *Flags = Builder.getInt32(Tied);
1897 if (Final) {
1898 Value *FinalFlag =
1899 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1900 Flags = Builder.CreateOr(FinalFlag, Flags);
1901 }
1902
1903 if (Mergeable)
1904 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
1905
1906 // Argument - `sizeof_kmp_task_t` (TaskSize)
1907 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1908 // including any private variables accessed in the task.
1909 // TODO: add kmp_task_t_with_privates (privates)
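 // For orientation, the runtime-side descriptor this size is computed for
 // looks roughly like the following (a sketch following the libomp kmp.h
 // layout; field names are illustrative, not normative):
 // \code{c}
 // typedef struct kmp_task {
 //   void *shareds;               /* pointer to the block of shared vars */
 //   kmp_routine_entry_t routine; /* task entry point */
 //   kmp_int32 part_id;           /* part id for the task */
 //   /* ... further implementation-defined fields ... */
 // } kmp_task_t;
 // \endcode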
1910 Value *TaskSize = Builder.getInt64(
1911 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1912
1913 // Argument - `sizeof_shareds` (SharedsSize)
1914 // SharedsSize refers to the shareds array size in the kmp_task_t data
1915 // structure.
1916 Value *SharedsSize = Builder.getInt64(0);
1917 if (HasShareds) {
1918 AllocaInst *ArgStructAlloca =
1919 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1920 assert(ArgStructAlloca &&
1921 "Unable to find the alloca instruction corresponding to arguments "
1922 "for extracted function");
1923 StructType *ArgStructType =
1924 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1925 assert(ArgStructType && "Unable to find struct type corresponding to "
1926 "arguments for extracted function");
1927 SharedsSize =
1928 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
1929 }
1930 // Emit the @__kmpc_omp_task_alloc runtime call
1931 // The runtime call returns a pointer to an area where the task captured
1932 // variables must be copied before the task is run (TaskData)
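 // For reference, the runtime entry point has roughly this C prototype
 // (as declared by the OpenMP runtime; shown here as a sketch):
 // \code{c}
 // kmp_task_t *__kmpc_omp_task_alloc(ident_t *loc_ref, kmp_int32 gtid,
 //                                   kmp_int32 flags, size_t sizeof_kmp_task_t,
 //                                   size_t sizeof_shareds,
 //                                   kmp_routine_entry_t task_entry);
 // \endcode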
1933 CallInst *TaskData = Builder.CreateCall(
1934 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1935 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1936 /*task_func=*/&OutlinedFn});
1937
1938 // Emit detach clause initialization.
1939 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
1940 // task_descriptor);
1941 if (EventHandle) {
1942 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
1943 OMPRTL___kmpc_task_allow_completion_event);
1944 llvm::Value *EventVal =
1945 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
1946 llvm::Value *EventHandleAddr =
1947 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
1948 Builder.getPtrTy(0));
1949 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
1950 Builder.CreateStore(EventVal, EventHandleAddr);
1951 }
1952 // Copy the arguments for outlined function
1953 if (HasShareds) {
1954 Value *Shareds = StaleCI->getArgOperand(1);
1955 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1956 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1957 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1958 SharedsSize);
1959 }
1960
1961 Value *DepArray = nullptr;
1962 if (Dependencies.size()) {
1963 InsertPointTy OldIP = Builder.saveIP();
1964 Builder.SetInsertPoint(
1965 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1966
1967 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1968 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1969
1970 unsigned P = 0;
1971 for (const DependData &Dep : Dependencies) {
1972 Value *Base =
1973 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1974 // Store the pointer to the variable
1975 Value *Addr = Builder.CreateStructGEP(
1976 DependInfo, Base,
1977 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1978 Value *DepValPtr =
1979 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1980 Builder.CreateStore(DepValPtr, Addr);
1981 // Store the size of the variable
1982 Value *Size = Builder.CreateStructGEP(
1983 DependInfo, Base,
1984 static_cast<unsigned int>(RTLDependInfoFields::Len));
1985 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1986 Dep.DepValueType)),
1987 Size);
1988 // Store the dependency kind
1989 Value *Flags = Builder.CreateStructGEP(
1990 DependInfo, Base,
1991 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1992 Builder.CreateStore(
1993 ConstantInt::get(Builder.getInt8Ty(),
1994 static_cast<unsigned int>(Dep.DepKind)),
1995 Flags);
1996 ++P;
1997 }
1998
1999 Builder.restoreIP(OldIP);
2000 }
2001
2002 // In the presence of the `if` clause, the following IR is generated:
2003 // ...
2004 // %data = call @__kmpc_omp_task_alloc(...)
2005 // br i1 %if_condition, label %then, label %else
2006 // then:
2007 // call @__kmpc_omp_task(...)
2008 // br label %exit
2009 // else:
2010 // ;; Wait for resolution of dependencies, if any, before
2011 // ;; beginning the task
2012 // call @__kmpc_omp_wait_deps(...)
2013 // call @__kmpc_omp_task_begin_if0(...)
2014 // call @outlined_fn(...)
2015 // call @__kmpc_omp_task_complete_if0(...)
2016 // br label %exit
2017 // exit:
2018 // ...
2019 if (IfCondition) {
2020 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2021 // terminator.
2022 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2023 Instruction *IfTerminator =
2024 Builder.GetInsertPoint()->getParent()->getTerminator();
2025 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2026 Builder.SetInsertPoint(IfTerminator);
2027 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2028 &ElseTI);
2029 Builder.SetInsertPoint(ElseTI);
2030
2031 if (Dependencies.size()) {
2032 Function *TaskWaitFn =
2033 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2034 Builder.CreateCall(
2035 TaskWaitFn,
2036 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2037 ConstantInt::get(Builder.getInt32Ty(), 0),
2038 ConstantPointerNull::get(Builder.getPtrTy())});
2039 }
2040 Function *TaskBeginFn =
2041 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2042 Function *TaskCompleteFn =
2043 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2044 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2045 CallInst *CI = nullptr;
2046 if (HasShareds)
2047 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2048 else
2049 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2050 CI->setDebugLoc(StaleCI->getDebugLoc());
2051 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2052 Builder.SetInsertPoint(ThenTI);
2053 }
2054
2055 if (Dependencies.size()) {
2056 Function *TaskFn =
2057 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2058 Builder.CreateCall(
2059 TaskFn,
2060 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2061 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2062 ConstantPointerNull::get(Builder.getPtrTy())});
2063
2064 } else {
2065 // Emit the @__kmpc_omp_task runtime call to spawn the task
2066 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2067 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2068 }
2069
2070 StaleCI->eraseFromParent();
2071
2072 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2073 if (HasShareds) {
2074 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2075 OutlinedFn.getArg(1)->replaceUsesWithIf(
2076 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2077 }
2078
2079 for (Instruction *I : llvm::reverse(ToBeDeleted))
2080 I->eraseFromParent();
2081 };
2082
2083 addOutlineInfo(std::move(OI));
2084 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2085
2086 return Builder.saveIP();
2087}
2088
2089OpenMPIRBuilder::InsertPointOrErrorTy
2090OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2091 InsertPointTy AllocaIP,
2092 BodyGenCallbackTy BodyGenCB) {
2093 if (!updateToLocation(Loc))
2094 return InsertPointTy();
2095
2096 uint32_t SrcLocStrSize;
2097 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2098 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2099 Value *ThreadID = getOrCreateThreadID(Ident);
2100
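 // The overall shape of the emitted construct is simply (a sketch):
 // \code{c}
 // __kmpc_taskgroup(loc, gtid);
 // /* ... taskgroup body, including any spawned tasks ... */
 // __kmpc_end_taskgroup(loc, gtid); /* waits for child tasks to finish */
 // \endcode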
2101 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2102 Function *TaskgroupFn =
2103 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2104 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2105
2106 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2107 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2108 return Err;
2109
2110 Builder.SetInsertPoint(TaskgroupExitBB);
2111 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2112 Function *EndTaskgroupFn =
2113 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2114 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2115
2116 return Builder.saveIP();
2117}
2118
2119OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2120 const LocationDescription &Loc, InsertPointTy AllocaIP,
2121 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, ArrayRef<Value *> CPVars,
2122 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2123 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2124
2125 if (!updateToLocation(Loc))
2126 return Loc.IP;
2127
2128 auto FiniCBWrapper = [&](InsertPointTy IP) {
2129 if (IP.getBlock()->end() != IP.getPoint())
2130 return FiniCB(IP);
2131 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2132 // will fail because that function requires the Finalization Basic Block to
2133 // have a terminator, which is already removed by EmitOMPRegionBody.
2134 // IP is currently at the cancellation block.
2135 // We need to backtrack to the condition block to fetch
2136 // the exit block and create a branch from the cancellation
2137 // block to the exit block.
2138 IRBuilder<>::InsertPointGuard IPG(Builder);
2139 Builder.restoreIP(IP);
2140 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2141 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2142 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2143 Instruction *I = Builder.CreateBr(ExitBB);
2144 IP = InsertPointTy(I->getParent(), I->getIterator());
2145 return FiniCB(IP);
2146 };
2147
2148 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2149
2150 // Each section is emitted as a switch case
2151 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2152 // -> OMP.createSection() which generates the IR for each section
2153 // Iterate through all sections and emit a switch construct:
2154 // switch (IV) {
2155 // case 0:
2156 // <SectionStmt[0]>;
2157 // break;
2158 // ...
2159 // case <NumSection> - 1:
2160 // <SectionStmt[<NumSection> - 1]>;
2161 // break;
2162 // }
2163 // ...
2164 // section_loop.after:
2165 // <FiniCB>;
2166 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2167 Builder.restoreIP(CodeGenIP);
2168 BasicBlock *Continue =
2169 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2170 Function *CurFn = Continue->getParent();
2171 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2172
2173 unsigned CaseNumber = 0;
2174 for (auto SectionCB : SectionCBs) {
2175 BasicBlock *CaseBB = BasicBlock::Create(
2176 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2177 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2178 Builder.SetInsertPoint(CaseBB);
2179 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2180 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2181 CaseEndBr->getIterator()}))
2182 return Err;
2183 CaseNumber++;
2184 }
2185 // remove the existing terminator from body BB since there can be no
2186 // terminators after switch/case
2187 return Error::success();
2188 };
2189 // Loop body ends here
2190 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2191 Type *I32Ty = Type::getInt32Ty(M.getContext());
2192 Value *LB = ConstantInt::get(I32Ty, 0);
2193 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2194 Value *ST = ConstantInt::get(I32Ty, 1);
2195 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2196 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2197 if (!LoopInfo)
2198 return LoopInfo.takeError();
2199
2200 InsertPointOrErrorTy WsloopIP =
2201 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP, !IsNowait);
2202 if (!WsloopIP)
2203 return WsloopIP.takeError();
2204 InsertPointTy AfterIP = *WsloopIP;
2205
2206 // Apply the finalization callback in LoopAfterBB
2207 auto FiniInfo = FinalizationStack.pop_back_val();
2208 assert(FiniInfo.DK == OMPD_sections &&
2209 "Unexpected finalization stack state!");
2210 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2211 Builder.restoreIP(AfterIP);
2212 BasicBlock *FiniBB =
2213 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2214 if (Error Err = CB(Builder.saveIP()))
2215 return Err;
2216 AfterIP = {FiniBB, FiniBB->begin()};
2217 }
2218
2219 return AfterIP;
2220}
2221
2222OpenMPIRBuilder::InsertPointOrErrorTy
2223OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2224 BodyGenCallbackTy BodyGenCB,
2225 FinalizeCallbackTy FiniCB) {
2226 if (!updateToLocation(Loc))
2227 return Loc.IP;
2228
2229 auto FiniCBWrapper = [&](InsertPointTy IP) {
2230 if (IP.getBlock()->end() != IP.getPoint())
2231 return FiniCB(IP);
2232 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2233 // will fail because that function requires the Finalization Basic Block to
2234 // have a terminator, which is already removed by EmitOMPRegionBody.
2235 // IP is currently at the cancellation block.
2236 // We need to backtrack to the condition block to fetch
2237 // the exit block and create a branch from the cancellation
2238 // block to the exit block.
2239 IRBuilder<>::InsertPointGuard IPG(Builder);
2240 Builder.restoreIP(IP);
2241 auto *CaseBB = Loc.IP.getBlock();
2242 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2243 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2244 Instruction *I = Builder.CreateBr(ExitBB);
2245 IP = InsertPointTy(I->getParent(), I->getIterator());
2246 return FiniCB(IP);
2247 };
2248
2249 Directive OMPD = Directive::OMPD_sections;
2250 // Since we are using Finalization Callback here, HasFinalize
2251 // and IsCancellable have to be true
2252 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2253 /*Conditional*/ false, /*hasFinalize*/ true,
2254 /*IsCancellable*/ true);
2255}
2256
2257static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2258 BasicBlock::iterator IT(I);
2259 IT++;
2260 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2261}
2262
2263void OpenMPIRBuilder::emitUsed(StringRef Name,
2264 std::vector<WeakTrackingVH> &List) {
2265 if (List.empty())
2266 return;
2267
2268 // Convert List to what ConstantArray needs.
2269 SmallVector<Constant *, 8> UsedArray;
2270 UsedArray.resize(List.size());
2271 for (unsigned I = 0, E = List.size(); I != E; ++I)
2272 UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2273 cast<Constant>(&*List[I]), Builder.getPtrTy());
2274
2275 if (UsedArray.empty())
2276 return;
2277 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2278
2279 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2280 ConstantArray::get(ATy, UsedArray), Name);
2281
2282 GV->setSection("llvm.metadata");
2283}
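// For example, calling this with Name == "llvm.used" and two list entries
// @a and @b produces roughly the following IR (illustrative):
// \code
// @llvm.used = appending global [2 x ptr] [ptr @a, ptr @b],
//              section "llvm.metadata"
// \endcode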
2284
2285Value *OpenMPIRBuilder::getGPUThreadID() {
2286 return Builder.CreateCall(
2287 getOrCreateRuntimeFunction(M,
2288 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2289 {});
2290}
2291
2292Value *OpenMPIRBuilder::getGPUWarpSize() {
2293 return Builder.CreateCall(
2294 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2295}
2296
2297Value *OpenMPIRBuilder::getNVPTXWarpID() {
2298 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2299 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2300}
2301
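// Worked example of the masking below, assuming the common warp size of 32:
// LaneIDBits == log2(32) == 5 and LaneIDMask == ~0u >> (32 - 5) == 0x1f, so
// the lane id is simply the low five bits of the hardware thread id.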
2302Value *OpenMPIRBuilder::getNVPTXLaneID() {
2303 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2304 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2305 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2306 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2307 "nvptx_lane_id");
2308}
2309
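// The fallback path at the end of this function is morally equivalent to the
// following C sketch (assuming From is a 'float' and ToType is a 64-bit
// integer; illustrative only):
// \code{c}
// long long CastItem;          /* alloca of the destination type */
// *(float *)&CastItem = From;  /* store through the casted pointer */
// return CastItem;             /* load back with the destination type */
// \endcode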
2310Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2311 Type *ToType) {
2312 Type *FromType = From->getType();
2313 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2314 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2315 assert(FromSize > 0 && "From size must be greater than zero");
2316 assert(ToSize > 0 && "To size must be greater than zero");
2317 if (FromType == ToType)
2318 return From;
2319 if (FromSize == ToSize)
2320 return Builder.CreateBitCast(From, ToType);
2321 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2322 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2323 InsertPointTy SaveIP = Builder.saveIP();
2324 Builder.restoreIP(AllocaIP);
2325 Value *CastItem = Builder.CreateAlloca(ToType);
2326 Builder.restoreIP(SaveIP);
2327
2328 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2329 CastItem, Builder.getPtrTy(0));
2330 Builder.CreateStore(From, ValCastItem);
2331 return Builder.CreateLoad(ToType, CastItem);
2332}
2333
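// The calls built below use the device runtime's shuffle helpers, which have
// roughly these C signatures (a sketch; payloads wider than 4 bytes go
// through the int64 variant):
// \code{c}
// int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
// int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
// \endcode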
2334Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2335 Value *Element,
2336 Type *ElementType,
2337 Value *Offset) {
2338 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2339 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2340
2341 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2342 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2343 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2344 Value *WarpSize =
2345 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2346 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2347 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2348 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2349 Value *WarpSizeCast =
2350 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2351 Value *ShuffleCall =
2352 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2353 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2354}
2355
2356void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2357 Value *DstAddr, Type *ElemType,
2358 Value *Offset, Type *ReductionArrayTy) {
2359 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2360 // Create the loop over the big-sized data.
2361 // ptr = (void*)Elem;
2362 // ptrEnd = (void*) Elem + 1;
2363 // Step = 8;
2364 // while (ptr + Step < ptrEnd)
2365 // shuffle((int64_t)*ptr);
2366 // Step = 4;
2367 // while (ptr + Step < ptrEnd)
2368 // shuffle((int32_t)*ptr);
2369 // ...
2370 Type *IndexTy = Builder.getIndexTy(
2371 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2372 Value *ElemPtr = DstAddr;
2373 Value *Ptr = SrcAddr;
2374 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2375 if (Size < IntSize)
2376 continue;
2377 Type *IntType = Builder.getIntNTy(IntSize * 8);
2378 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2379 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2380 Value *SrcAddrGEP =
2381 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2382 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2383 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2384
2385 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2386 if ((Size / IntSize) > 1) {
2387 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2388 SrcAddrGEP, Builder.getPtrTy());
2389 BasicBlock *PreCondBB =
2390 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2391 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2392 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2393 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2394 emitBlock(PreCondBB, CurFunc);
2395 PHINode *PhiSrc =
2396 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2397 PhiSrc->addIncoming(Ptr, CurrentBB);
2398 PHINode *PhiDest =
2399 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2400 PhiDest->addIncoming(ElemPtr, CurrentBB);
2401 Ptr = PhiSrc;
2402 ElemPtr = PhiDest;
2403 Value *PtrDiff = Builder.CreatePtrDiff(
2404 Builder.getInt8Ty(), PtrEnd,
2405 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2406 Builder.CreateCondBr(
2407 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2408 ExitBB);
2409 emitBlock(ThenBB, CurFunc);
2410 Value *Res = createRuntimeShuffleFunction(
2411 AllocaIP,
2412 Builder.CreateAlignedLoad(
2413 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2414 IntType, Offset);
2415 Builder.CreateAlignedStore(Res, ElemPtr,
2416 M.getDataLayout().getPrefTypeAlign(ElemType));
2417 Value *LocalPtr =
2418 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2419 Value *LocalElemPtr =
2420 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2421 PhiSrc->addIncoming(LocalPtr, ThenBB);
2422 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2423 emitBranch(PreCondBB);
2424 emitBlock(ExitBB, CurFunc);
2425 } else {
2426 Value *Res = createRuntimeShuffleFunction(
2427 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2428 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2429 Res->getType()->getScalarSizeInBits())
2430 Res = Builder.CreateTrunc(Res, ElemType);
2431 Builder.CreateStore(Res, ElemPtr);
2432 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2433 ElemPtr =
2434 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2435 }
2436 Size = Size % IntSize;
2437 }
2438}
2439
2440void OpenMPIRBuilder::emitReductionListCopy(
2441 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2442 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2443 CopyOptionsTy CopyOptions) {
2444 Type *IndexTy = Builder.getIndexTy(
2445 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2446 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2447
2448 // Iterates, element-by-element, through the source Reduce list and
2449 // makes a copy.
2450 for (auto En : enumerate(ReductionInfos)) {
2451 const ReductionInfo &RI = En.value();
2452 Value *SrcElementAddr = nullptr;
2453 Value *DestElementAddr = nullptr;
2454 Value *DestElementPtrAddr = nullptr;
2455 // Should we shuffle in an element from a remote lane?
2456 bool ShuffleInElement = false;
2457 // Set to true to update the pointer in the dest Reduce list to a
2458 // newly created element.
2459 bool UpdateDestListPtr = false;
2460
2461 // Step 1.1: Get the address for the src element in the Reduce list.
2462 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2463 ReductionArrayTy, SrcBase,
2464 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2465 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2466
2467 // Step 1.2: Create a temporary to store the element in the destination
2468 // Reduce list.
2469 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2470 ReductionArrayTy, DestBase,
2471 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2472 switch (Action) {
2473 case CopyAction::RemoteLaneToThread: {
2474 InsertPointTy CurIP = Builder.saveIP();
2475 Builder.restoreIP(AllocaIP);
2476 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2477 ".omp.reduction.element");
2478 DestAlloca->setAlignment(
2479 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2480 DestElementAddr = DestAlloca;
2481 DestElementAddr =
2482 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2483 DestElementAddr->getName() + ".ascast");
2484 Builder.restoreIP(CurIP);
2485 ShuffleInElement = true;
2486 UpdateDestListPtr = true;
2487 break;
2488 }
2489 case CopyAction::ThreadCopy: {
2490 DestElementAddr =
2491 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2492 break;
2493 }
2494 }
2495
2496 // Now that all active lanes have read the element in the
2497 // Reduce list, shuffle over the value from the remote lane.
2498 if (ShuffleInElement) {
2499 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2500 RemoteLaneOffset, ReductionArrayTy);
2501 } else {
2502 switch (RI.EvaluationKind) {
2503 case EvalKind::Scalar: {
2504 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2505 // Store the source element value to the dest element address.
2506 Builder.CreateStore(Elem, DestElementAddr);
2507 break;
2508 }
2509 case EvalKind::Complex: {
2510 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2511 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2512 Value *SrcReal = Builder.CreateLoad(
2513 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2514 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2515 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2516 Value *SrcImg = Builder.CreateLoad(
2517 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2518
2519 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2520 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2521 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2522 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2523 Builder.CreateStore(SrcReal, DestRealPtr);
2524 Builder.CreateStore(SrcImg, DestImgPtr);
2525 break;
2526 }
2527 case EvalKind::Aggregate: {
2528 Value *SizeVal = Builder.getInt64(
2529 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2530 Builder.CreateMemCpy(
2531 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2532 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2533 SizeVal, false);
2534 break;
2535 }
2536 };
2537 }
2538
2539 // Step 3.1: Modify reference in dest Reduce list as needed.
2540 // Modifying the reference in Reduce list to point to the newly
2541 // created element. The element is live in the current function
2542 // scope and that of functions it invokes (i.e., reduce_function).
2543 // RemoteReduceData[i] = (void*)&RemoteElem
2544 if (UpdateDestListPtr) {
2545 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2546 DestElementAddr, Builder.getPtrTy(),
2547 DestElementAddr->getName() + ".ascast");
2548 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2549 }
2550 }
2551}
2552
2553Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2554 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2555 AttributeList FuncAttrs) {
2556 InsertPointTy SavedIP = Builder.saveIP();
2557 LLVMContext &Ctx = M.getContext();
2558 FunctionType *FuncTy = FunctionType::get(
2559 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2560 /* IsVarArg */ false);
2561 Function *WcFunc =
2562 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2563 "_omp_reduction_inter_warp_copy_func", &M);
2564 WcFunc->setAttributes(FuncAttrs);
2565 WcFunc->addParamAttr(0, Attribute::NoUndef);
2566 WcFunc->addParamAttr(1, Attribute::NoUndef);
2567 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2568 Builder.SetInsertPoint(EntryBB);
2569
2570 // ReduceList: thread local Reduce list.
2571 // At the stage of the computation when this function is called, partially
2572 // aggregated values reside in the first lane of every active warp.
2573 Argument *ReduceListArg = WcFunc->getArg(0);
2574 // NumWarps: number of warps active in the parallel region. This could
2575 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2576 Argument *NumWarpsArg = WcFunc->getArg(1);
2577
2578 // This array is used as a medium to transfer, one reduce element at a time,
2579 // the data from the first lane of every warp to lanes in the first warp
2580 // in order to perform the final step of a reduction in a parallel region
2581 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2582 // for reduced latency, as well as to have a distinct copy for concurrently
2583 // executing target regions. The array is declared with common linkage so
2584 // as to be shared across compilation units.
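 // Per reduce element, the copy below therefore runs in two barrier-separated
 // phases (a sketch; 'medium' stands for the shared array declared below):
 // \code{c}
 // barrier();
 // if (lane_id == 0)           /* each warp master publishes its value */
 //   medium[warp_id] = reduce_list[i];
 // barrier();
 // if (thread_id < num_warps)  /* the first warp gathers the values */
 //   reduce_list[i] = medium[thread_id];
 // \endcode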
2585 StringRef TransferMediumName =
2586 "__openmp_nvptx_data_transfer_temporary_storage";
2587 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2588 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2589 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2590 if (!TransferMedium) {
2591 TransferMedium = new GlobalVariable(
2592 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2593 UndefValue::get(ArrayTy), TransferMediumName,
2594 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2595 /*AddressSpace=*/3);
2596 }
2597
2598 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2599 Value *GPUThreadID = getGPUThreadID();
2600 // nvptx_lane_id = nvptx_id % warpsize
2601 Value *LaneID = getNVPTXLaneID();
2602 // nvptx_warp_id = nvptx_id / warpsize
2603 Value *WarpID = getNVPTXWarpID();
2604
2605 InsertPointTy AllocaIP =
2606 InsertPointTy(Builder.GetInsertBlock(),
2607 Builder.GetInsertBlock()->getFirstInsertionPt());
2608 Type *Arg0Type = ReduceListArg->getType();
2609 Type *Arg1Type = NumWarpsArg->getType();
2610 Builder.restoreIP(AllocaIP);
2611 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2612 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2613 AllocaInst *NumWarpsAlloca =
2614 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2615 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2616 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2617 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2618 NumWarpsAlloca, Builder.getPtrTy(0),
2619 NumWarpsAlloca->getName() + ".ascast");
2620 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2621 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2622 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2623 InsertPointTy CodeGenIP =
2624 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2625 Builder.restoreIP(CodeGenIP);
2626
2627 Value *ReduceList =
2628 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2629
2630 for (auto En : enumerate(ReductionInfos)) {
2631 //
2632 // Warp master copies reduce element to transfer medium in __shared__
2633 // memory.
2634 //
2635 const ReductionInfo &RI = En.value();
2636 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2637 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2638 Type *CType = Builder.getIntNTy(TySize * 8);
2639
2640 unsigned NumIters = RealTySize / TySize;
2641 if (NumIters == 0)
2642 continue;
2643 Value *Cnt = nullptr;
2644 Value *CntAddr = nullptr;
2645 BasicBlock *PrecondBB = nullptr;
2646 BasicBlock *ExitBB = nullptr;
2647 if (NumIters > 1) {
2648 CodeGenIP = Builder.saveIP();
2649 Builder.restoreIP(AllocaIP);
2650 CntAddr =
2651 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2652
2653 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2654 CntAddr->getName() + ".ascast");
2655 Builder.restoreIP(CodeGenIP);
2656 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2657 CntAddr,
2658 /*Volatile=*/false);
2659 PrecondBB = BasicBlock::Create(Ctx, "precond");
2660 ExitBB = BasicBlock::Create(Ctx, "exit");
2661 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2662 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2663 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2664 /*Volatile=*/false);
2665 Value *Cmp = Builder.CreateICmpULT(
2666 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2667 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2668 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2669 }
2670
2671 // kmpc_barrier.
2672 InsertPointOrErrorTy BarrierIP1 =
2673 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2674 omp::Directive::OMPD_unknown,
2675 /* ForceSimpleCall */ false,
2676 /* CheckCancelFlag */ true);
2677 if (!BarrierIP1)
2678 return BarrierIP1.takeError();
2679 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2680 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2681 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2682
2683 // if (lane_id == 0)
2684 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2685 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2686 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2687
2688 // Reduce element = LocalReduceList[i]
2689 auto *RedListArrayTy =
2690 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2691 Type *IndexTy = Builder.getIndexTy(
2692 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2693 Value *ElemPtrPtr =
2694 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2695 {ConstantInt::get(IndexTy, 0),
2696 ConstantInt::get(IndexTy, En.index())});
2697 // elemptr = ((CopyType*)(elemptrptr)) + I
2698 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2699 if (NumIters > 1)
2700 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2701
2702 // Get pointer to location in transfer medium.
2703 // MediumPtr = &medium[warp_id]
2704 Value *MediumPtr = Builder.CreateInBoundsGEP(
2705 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2706 // elem = *elemptr
2707 //*MediumPtr = elem
2708 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2709 // Store the source element value to the dest element address.
2710 Builder.CreateStore(Elem, MediumPtr,
2711 /*IsVolatile*/ true);
2712 Builder.CreateBr(MergeBB);
2713
2714 // else
2715 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2716 Builder.CreateBr(MergeBB);
2717
2718 // endif
2719 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2720 InsertPointOrErrorTy BarrierIP2 =
2721 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2722 omp::Directive::OMPD_unknown,
2723 /* ForceSimpleCall */ false,
2724 /* CheckCancelFlag */ true);
2725 if (!BarrierIP2)
2726 return BarrierIP2.takeError();
2727
2728 // Warp 0 copies reduce element from transfer medium
2729 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2730 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2731 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2732
2733 Value *NumWarpsVal =
2734 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2735 // Up to 32 threads in warp 0 are active.
2736 Value *IsActiveThread =
2737 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2738 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2739
2740 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2741
2742 // SrcMediumPtr = &medium[tid]
2743 // SrcMediumVal = *SrcMediumPtr
2744 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2745 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2746 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2747 Value *TargetElemPtrPtr =
2748 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2749 {ConstantInt::get(IndexTy, 0),
2750 ConstantInt::get(IndexTy, En.index())});
2751 Value *TargetElemPtrVal =
2752 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2753 Value *TargetElemPtr = TargetElemPtrVal;
2754 if (NumIters > 1)
2755 TargetElemPtr =
2756 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2757
2758 // *TargetElemPtr = SrcMediumVal;
2759 Value *SrcMediumValue =
2760 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2761 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2762 Builder.CreateBr(W0MergeBB);
2763
2764 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2765 Builder.CreateBr(W0MergeBB);
2766
2767 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2768
2769 if (NumIters > 1) {
2770 Cnt = Builder.CreateNSWAdd(
2771 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2772 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2773
2774 auto *CurFn = Builder.GetInsertBlock()->getParent();
2775 emitBranch(PrecondBB);
2776 emitBlock(ExitBB, CurFn);
2777 }
2778 RealTySize %= TySize;
2779 }
2780 }
2781
2782 Builder.CreateRetVoid();
2783 Builder.restoreIP(SavedIP);
2784
2785 return WcFunc;
2786}
2787
2788Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2789 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2790 AttributeList FuncAttrs) {
2791 LLVMContext &Ctx = M.getContext();
2792 FunctionType *FuncTy =
2793 FunctionType::get(Builder.getVoidTy(),
2794 {Builder.getPtrTy(), Builder.getInt16Ty(),
2795 Builder.getInt16Ty(), Builder.getInt16Ty()},
2796 /* IsVarArg */ false);
2797 Function *SarFunc =
2798 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2799 "_omp_reduction_shuffle_and_reduce_func", &M);
2800 SarFunc->setAttributes(FuncAttrs);
2801 SarFunc->addParamAttr(0, Attribute::NoUndef);
2802 SarFunc->addParamAttr(1, Attribute::NoUndef);
2803 SarFunc->addParamAttr(2, Attribute::NoUndef);
2804 SarFunc->addParamAttr(3, Attribute::NoUndef);
2805 SarFunc->addParamAttr(1, Attribute::SExt);
2806 SarFunc->addParamAttr(2, Attribute::SExt);
2807 SarFunc->addParamAttr(3, Attribute::SExt);
2808 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2809 Builder.SetInsertPoint(EntryBB);
2810
2811 // Thread local Reduce list used to host the values of data to be reduced.
2812 Argument *ReduceListArg = SarFunc->getArg(0);
2813 // Current lane id; could be logical.
2814 Argument *LaneIDArg = SarFunc->getArg(1);
2815 // Offset of the remote source lane relative to the current lane.
2816 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2817 // Algorithm version. This is expected to be known at compile time.
2818 Argument *AlgoVerArg = SarFunc->getArg(3);
2819
2820 Type *ReduceListArgType = ReduceListArg->getType();
2821 Type *LaneIDArgType = LaneIDArg->getType();
2822 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2823 Value *ReduceListAlloca = Builder.CreateAlloca(
2824 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2825 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2826 LaneIDArg->getName() + ".addr");
2827 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2828 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2829 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2830 AlgoVerArg->getName() + ".addr");
2831 ArrayType *RedListArrayTy =
2832 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2833
2834 // Create a local thread-private variable to host the Reduce list
2835 // from a remote lane.
2836 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2837 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2838
2839 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2840 ReduceListAlloca, ReduceListArgType,
2841 ReduceListAlloca->getName() + ".ascast");
2842 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2843 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2844 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2845 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2846 RemoteLaneOffsetAlloca->getName() + ".ascast");
2847 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2848 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2849 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2850 RemoteReductionListAlloca, Builder.getPtrTy(),
2851 RemoteReductionListAlloca->getName() + ".ascast");
2852
2853 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2854 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2855 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2856 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2857
2858 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2859 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2860 Value *RemoteLaneOffset =
2861 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2862 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2863
2864 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2865
2866 // This loop iterates through the list of reduce elements and copies,
2867 // element by element, from a remote lane in the warp to RemoteReduceList,
2868 // hosted on the thread's stack.
2869 emitReductionListCopy(
2870 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2871 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2872
2873 // The actions to be performed on the Remote Reduce list are dependent
2874 // on the algorithm version.
2875 //
2876 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2877 // LaneId % 2 == 0 && Offset > 0):
2878 // do the reduction value aggregation
2879 //
2880 // The thread local variable Reduce list is mutated in place to host the
2881 // reduced data, which is the aggregated value produced from local and
2882 // remote lanes.
2883 //
2884 // Note that AlgoVer is expected to be a constant integer known at compile
2885 // time.
2886 // When AlgoVer==0, the first conjunction evaluates to true, making
2887 // the entire predicate true during compile time.
2888 // When AlgoVer==1, the second conjunction has only the second part to be
2889 // evaluated at runtime; the other conjunctions evaluate to false
2890 // at compile time.
2891 // When AlgoVer==2, the third conjunction has only the second part to be
2892 // evaluated at runtime; the other conjunctions evaluate to false
2893 // at compile time.
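 // Equivalently, the IR built below computes this single C expression
 // (a sketch):
 // \code{c}
 // bool reduce = AlgoVer == 0 ||
 //               (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
 //               (AlgoVer == 2 && (LaneId & 1) == 0 && RemoteLaneOffset > 0);
 // \endcode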
2894 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2895 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2896 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2897 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2898 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2899 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2900 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2901 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2902 Value *RemoteOffsetComp =
2903 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2904 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2905 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2906 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2907
2908 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2909 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2910 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2911
2912 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2913 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2914 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2915 ReduceList, Builder.getPtrTy());
2916 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 RemoteListAddrCast, Builder.getPtrTy());
2918 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2919 ->addFnAttr(Attribute::NoUnwind);
2920 Builder.CreateBr(MergeBB);
2921
2922 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2923 Builder.CreateBr(MergeBB);
2924
2925 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2926
2927 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2928 // Reduce list.
2929 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2930 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2931 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2932
2933 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2934 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2935 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2936 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2937
2938 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2939 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2940 ReductionInfos, RemoteListAddrCast, ReduceList);
2941 Builder.CreateBr(CpyMergeBB);
2942
2943 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2944 Builder.CreateBr(CpyMergeBB);
2945
2946 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2947
2948 Builder.CreateRetVoid();
2949
2950 return SarFunc;
2951}
2952
2953Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2954 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2955 AttributeList FuncAttrs) {
2956 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2957 LLVMContext &Ctx = M.getContext();
2958 FunctionType *FuncTy = FunctionType::get(
2959 Builder.getVoidTy(),
2960 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2961 /* IsVarArg */ false);
2962 Function *LtGCFunc =
2963 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2964 "_omp_reduction_list_to_global_copy_func", &M);
2965 LtGCFunc->setAttributes(FuncAttrs);
2966 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2967 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2968 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2969
2970 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2971 Builder.SetInsertPoint(EntryBlock);
2972
2973 // Buffer: global reduction buffer.
2974 Argument *BufferArg = LtGCFunc->getArg(0);
2975 // Idx: index of the buffer.
2976 Argument *IdxArg = LtGCFunc->getArg(1);
2977 // ReduceList: thread local Reduce list.
2978 Argument *ReduceListArg = LtGCFunc->getArg(2);
2979
2980 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2981 BufferArg->getName() + ".addr");
2982 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2983 IdxArg->getName() + ".addr");
2984 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2985 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2986 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2987 BufferArgAlloca, Builder.getPtrTy(),
2988 BufferArgAlloca->getName() + ".ascast");
2989 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2990 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2991 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2992 ReduceListArgAlloca, Builder.getPtrTy(),
2993 ReduceListArgAlloca->getName() + ".ascast");
2994
2995 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2996 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2997 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2998
2999 Value *LocalReduceList =
3000 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3001 Value *BufferArgVal =
3002 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3003 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3004 Type *IndexTy = Builder.getIndexTy(
3005 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3006 for (auto En : enumerate(ReductionInfos)) {
3007 const ReductionInfo &RI = En.value();
3008 auto *RedListArrayTy =
3009 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3010 // Reduce element = LocalReduceList[i]
3011 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3012 RedListArrayTy, LocalReduceList,
3013 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3014 // elemptr = ((CopyType*)(elemptrptr)) + I
3015 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3016
3017 // Global = Buffer.VD[Idx];
3018 Value *BufferVD =
3019 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3020 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3021 ReductionsBufferTy, BufferVD, 0, En.index());
3022
3023 switch (RI.EvaluationKind) {
3024 case EvalKind::Scalar: {
3025 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3026 Builder.CreateStore(TargetElement, GlobVal);
3027 break;
3028 }
3029 case EvalKind::Complex: {
3030 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3031 RI.ElementType, ElemPtr, 0, 0, ".realp");
3032 Value *SrcReal = Builder.CreateLoad(
3033 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3034 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3035 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3036 Value *SrcImg = Builder.CreateLoad(
3037 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3038
3039 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3040 RI.ElementType, GlobVal, 0, 0, ".realp");
3041 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3042 RI.ElementType, GlobVal, 0, 1, ".imagp");
3043 Builder.CreateStore(SrcReal, DestRealPtr);
3044 Builder.CreateStore(SrcImg, DestImgPtr);
3045 break;
3046 }
3047 case EvalKind::Aggregate: {
3048 Value *SizeVal =
3049 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3050 Builder.CreateMemCpy(
3051 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3052 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3053 break;
3054 }
3055 }
3056 }
3057
3058 Builder.CreateRetVoid();
3059 Builder.restoreIP(OldIP);
3060 return LtGCFunc;
3061}
3062
3063Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3064 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3065 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3066 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3067 LLVMContext &Ctx = M.getContext();
3068 FunctionType *FuncTy = FunctionType::get(
3069 Builder.getVoidTy(),
3070 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3071 /* IsVarArg */ false);
3072 Function *LtGRFunc =
3073 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3074 "_omp_reduction_list_to_global_reduce_func", &M);
3075 LtGRFunc->setAttributes(FuncAttrs);
3076 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3077 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3078 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3079
3080 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3081 Builder.SetInsertPoint(EntryBlock);
3082
3083 // Buffer: global reduction buffer.
3084 Argument *BufferArg = LtGRFunc->getArg(0);
3085 // Idx: index of the buffer.
3086 Argument *IdxArg = LtGRFunc->getArg(1);
3087 // ReduceList: thread local Reduce list.
3088 Argument *ReduceListArg = LtGRFunc->getArg(2);
3089
3090 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3091 BufferArg->getName() + ".addr");
3092 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3093 IdxArg->getName() + ".addr");
3094 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3095 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3096 auto *RedListArrayTy =
3097 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3098
3099 // 1. Build a list of reduction variables.
3100 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3101 Value *LocalReduceList =
3102 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3103
3104 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3105 BufferArgAlloca, Builder.getPtrTy(),
3106 BufferArgAlloca->getName() + ".ascast");
3107 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3108 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3109 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 ReduceListArgAlloca, Builder.getPtrTy(),
3111 ReduceListArgAlloca->getName() + ".ascast");
3112 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3113 LocalReduceList, Builder.getPtrTy(),
3114 LocalReduceList->getName() + ".ascast");
3115
3116 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3117 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3118 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3119
3120 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3121 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3122 Type *IndexTy = Builder.getIndexTy(
3123 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3124 for (auto En : enumerate(ReductionInfos)) {
3125 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3126 RedListArrayTy, LocalReduceListAddrCast,
3127 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3128 Value *BufferVD =
3129 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3130 // Global = Buffer.VD[Idx];
3131 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3132 ReductionsBufferTy, BufferVD, 0, En.index());
3133 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3134 }
3135
3136 // Call reduce_function(GlobalReduceList, ReduceList)
3137 Value *ReduceList =
3138 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3139 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3140 ->addFnAttr(Attribute::NoUnwind);
3141 Builder.CreateRetVoid();
3142 Builder.restoreIP(OldIP);
3143 return LtGRFunc;
3144}
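// Rough pseudo-C equivalent of the helper above (illustrative only):
//
//   void _omp_reduction_list_to_global_reduce_func(void *buffer, int Idx,
//                                                  void *reduce_data) {
//     void *GlobPtrs[] = {&buffer.D0[Idx], ..., &buffer.Dn[Idx]};
//     reduce_function(GlobPtrs, reduce_data);
//   }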
3145
3146Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3147 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3148 AttributeList FuncAttrs) {
3149 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3150 LLVMContext &Ctx = M.getContext();
3151 auto *FuncTy = FunctionType::get(
3152 Builder.getVoidTy(),
3153 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3154 /* IsVarArg */ false);
3155 Function *LtGCFunc =
3156 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3157 "_omp_reduction_global_to_list_copy_func", &M);
3158 LtGCFunc->setAttributes(FuncAttrs);
3159 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3160 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3161 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3162
3163 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3164 Builder.SetInsertPoint(EntryBlock);
3165
3166 // Buffer: global reduction buffer.
3167 Argument *BufferArg = LtGCFunc->getArg(0);
3168 // Idx: index of the buffer.
3169 Argument *IdxArg = LtGCFunc->getArg(1);
3170 // ReduceList: thread local Reduce list.
3171 Argument *ReduceListArg = LtGCFunc->getArg(2);
3172
3173 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3174 BufferArg->getName() + ".addr");
3175 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3176 IdxArg->getName() + ".addr");
3177 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3178 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3179 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3180 BufferArgAlloca, Builder.getPtrTy(),
3181 BufferArgAlloca->getName() + ".ascast");
3182 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3183 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3184 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3185 ReduceListArgAlloca, Builder.getPtrTy(),
3186 ReduceListArgAlloca->getName() + ".ascast");
3187 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3188 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3189 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3190
3191 Value *LocalReduceList =
3192 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3193 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3194 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3195 Type *IndexTy = Builder.getIndexTy(
3196 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3197 for (auto En : enumerate(ReductionInfos)) {
3198 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3199 auto *RedListArrayTy =
3200 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3201 // Reduce element = LocalReduceList[i]
3202 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3203 RedListArrayTy, LocalReduceList,
3204 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3205 // elemptr = ((CopyType*)(elemptrptr)) + I
3206 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3207 // Global = Buffer.VD[Idx];
3208 Value *BufferVD =
3209 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3210 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3211 ReductionsBufferTy, BufferVD, 0, En.index());
3212
3213 switch (RI.EvaluationKind) {
3214 case EvalKind::Scalar: {
3215 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3216 Builder.CreateStore(TargetElement, ElemPtr);
3217 break;
3218 }
3219 case EvalKind::Complex: {
3220 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3221 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3222 Value *SrcReal = Builder.CreateLoad(
3223 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3224 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3225 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3226 Value *SrcImg = Builder.CreateLoad(
3227 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3228
3229 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3230 RI.ElementType, ElemPtr, 0, 0, ".realp");
3231 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3232 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3233 Builder.CreateStore(SrcReal, DestRealPtr);
3234 Builder.CreateStore(SrcImg, DestImgPtr);
3235 break;
3236 }
3237 case EvalKind::Aggregate: {
3238 Value *SizeVal =
3239 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3240 Builder.CreateMemCpy(
3241 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3242 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3243 SizeVal, false);
3244 break;
3245 }
3246 }
3247 }
3248
3249 Builder.CreateRetVoid();
3250 Builder.restoreIP(OldIP);
3251 return LtGCFunc;
3252}
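// Rough pseudo-C equivalent of the helper above (illustrative): the mirror
// image of the list-to-global copy, reading back from the team buffer:
//
//   void _omp_reduction_global_to_list_copy_func(void *buffer, int Idx,
//                                                void *reduce_data) {
//     for (each reduction variable D in reduce_data)
//       D = buffer.D[Idx];
//   }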
3253
3254Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3255 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3256 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3257 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3258 LLVMContext &Ctx = M.getContext();
3259 auto *FuncTy = FunctionType::get(
3260 Builder.getVoidTy(),
3261 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3262 /* IsVarArg */ false);
3263 Function *LtGRFunc =
3264 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3265 "_omp_reduction_global_to_list_reduce_func", &M);
3266 LtGRFunc->setAttributes(FuncAttrs);
3267 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3268 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3269 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3270
3271 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3272 Builder.SetInsertPoint(EntryBlock);
3273
3274 // Buffer: global reduction buffer.
3275 Argument *BufferArg = LtGRFunc->getArg(0);
3276 // Idx: index of the buffer.
3277 Argument *IdxArg = LtGRFunc->getArg(1);
3278 // ReduceList: thread local Reduce list.
3279 Argument *ReduceListArg = LtGRFunc->getArg(2);
3280
3281 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3282 BufferArg->getName() + ".addr");
3283 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3284 IdxArg->getName() + ".addr");
3285 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3286 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3287 ArrayType *RedListArrayTy =
3288 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3289
3290 // 1. Build a list of reduction variables.
3291 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3292 Value *LocalReduceList =
3293 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3294
3295 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3296 BufferArgAlloca, Builder.getPtrTy(),
3297 BufferArgAlloca->getName() + ".ascast");
3298 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3299 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3300 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3301 ReduceListArgAlloca, Builder.getPtrTy(),
3302 ReduceListArgAlloca->getName() + ".ascast");
3303 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3304 LocalReduceList, Builder.getPtrTy(),
3305 LocalReduceList->getName() + ".ascast");
3306
3307 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3308 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3309 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3310
3311 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3312 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3313 Type *IndexTy = Builder.getIndexTy(
3314 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3315 for (auto En : enumerate(ReductionInfos)) {
3316 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3317 RedListArrayTy, ReductionList,
3318 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3319 // Global = Buffer.VD[Idx];
3320 Value *BufferVD =
3321 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3322 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3323 ReductionsBufferTy, BufferVD, 0, En.index());
3324 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3325 }
3326
3327 // Call reduce_function(ReduceList, GlobalReduceList)
3328 Value *ReduceList =
3329 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3330 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3331 ->addFnAttr(Attribute::NoUnwind);
3332 Builder.CreateRetVoid();
3333 Builder.restoreIP(OldIP);
3334 return LtGRFunc;
3335}
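// Rough pseudo-C equivalent of the helper above (illustrative; note the
// argument order is swapped relative to list_to_global_reduce, so the
// thread-local list is reduced into by the buffered values):
//
//   void _omp_reduction_global_to_list_reduce_func(void *buffer, int Idx,
//                                                  void *reduce_data) {
//     void *GlobPtrs[] = {&buffer.D0[Idx], ..., &buffer.Dn[Idx]};
//     reduce_function(reduce_data, GlobPtrs);
//   }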
3336
3337std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3338 std::string Suffix =
3339 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3340 return (Name + Suffix).str();
3341}
3342
3343Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3344 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3345 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3346 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3347 {Builder.getPtrTy(), Builder.getPtrTy()},
3348 /* IsVarArg */ false);
3349 std::string Name = getReductionFuncName(ReducerName);
3350 Function *ReductionFunc =
3351 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3352 ReductionFunc->setAttributes(FuncAttrs);
3353 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3354 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3355 BasicBlock *EntryBB =
3356 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3357 Builder.SetInsertPoint(EntryBB);
3358
3359 // Need to alloca memory here and deal with the pointers before getting
3360 // LHS/RHS pointers out
3361 Value *LHSArrayPtr = nullptr;
3362 Value *RHSArrayPtr = nullptr;
3363 Argument *Arg0 = ReductionFunc->getArg(0);
3364 Argument *Arg1 = ReductionFunc->getArg(1);
3365 Type *Arg0Type = Arg0->getType();
3366 Type *Arg1Type = Arg1->getType();
3367
3368 Value *LHSAlloca =
3369 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3370 Value *RHSAlloca =
3371 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3372 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3373 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3374 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3375 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3376 Builder.CreateStore(Arg0, LHSAddrCast);
3377 Builder.CreateStore(Arg1, RHSAddrCast);
3378 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3379 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3380
3381 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3382 Type *IndexTy = Builder.getIndexTy(
3383 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3384 SmallVector<Value *> LHSPtrs, RHSPtrs;
3385 for (auto En : enumerate(ReductionInfos)) {
3386 const ReductionInfo &RI = En.value();
3387 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3388 RedArrayTy, RHSArrayPtr,
3389 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3390 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3391 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3392 RHSI8Ptr, RI.PrivateVariable->getType(),
3393 RHSI8Ptr->getName() + ".ascast");
3394
3395 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3396 RedArrayTy, LHSArrayPtr,
3397 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3398 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3399 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3400 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3401
3402 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3403 LHSPtrs.emplace_back(LHSPtr);
3404 RHSPtrs.emplace_back(RHSPtr);
3405 } else {
3406 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3407 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3408 Value *Reduced;
3409 InsertPointOrErrorTy AfterIP =
3410 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3411 if (!AfterIP)
3412 return AfterIP.takeError();
3413 if (!Builder.GetInsertBlock())
3414 return ReductionFunc;
3415 Builder.CreateStore(Reduced, LHSPtr);
3416 }
3417 }
3418
3419 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3420 for (auto En : enumerate(ReductionInfos)) {
3421 unsigned Index = En.index();
3422 const ReductionInfo &RI = En.value();
3423 Value *LHSFixupPtr, *RHSFixupPtr;
3424 Builder.restoreIP(RI.ReductionGenClang(
3425 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3426
3427 // Fix the callback code generated to use the correct Values for the LHS
3428 // and RHS.
3429 LHSFixupPtr->replaceUsesWithIf(
3430 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3431 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3432 ReductionFunc;
3433 });
3434 RHSFixupPtr->replaceUsesWithIf(
3435 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3436 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3437 ReductionFunc;
3438 });
3439 }
3440
3441 Builder.CreateRetVoid();
3442 return ReductionFunc;
3443}
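// In pseudo-C, the function built above has the shape (illustrative; op_i
// stands for whatever the i-th reduction generator emits):
//
//   void <prefix>.omp.reduction.reduction_func(void **lhs, void **rhs) {
//     for (each reduction i)
//       *lhs[i] = op_i(*lhs[i], *rhs[i]);
//   }
//
// For ReductionGenCBKind::Clang the body comes from ReductionGenClang, and the
// placeholder LHS/RHS pointers are patched afterwards via replaceUsesWithIf.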
3444
3445static void
3446checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3447 bool IsGPU) {
3448 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3449 (void)RI;
3450 assert(RI.Variable && "expected non-null variable");
3451 assert(RI.PrivateVariable && "expected non-null private variable");
3452 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3453 "expected non-null reduction generator callback");
3454 if (!IsGPU) {
3455 assert(
3456 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3457 "expected variables and their private equivalents to have the same "
3458 "type");
3459 }
3460 assert(RI.Variable->getType()->isPointerTy() &&
3461 "expected variables to be pointers");
3462 }
3463}
3464
3466 const LocationDescription &Loc, InsertPointTy AllocaIP,
3467 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3468 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3469 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3470 unsigned ReductionBufNum, Value *SrcLocInfo) {
3471 if (!updateToLocation(Loc))
3472 return InsertPointTy();
3473 Builder.restoreIP(CodeGenIP);
3474 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3475 LLVMContext &Ctx = M.getContext();
3476
3477 // Source location for the ident struct
3478 if (!SrcLocInfo) {
3479 uint32_t SrcLocStrSize;
3480 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3481 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3482 }
3483
3484 if (ReductionInfos.size() == 0)
3485 return Builder.saveIP();
3486
3487 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3488 AttributeList FuncAttrs;
3489 AttrBuilder AttrBldr(Ctx);
3490 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3491 AttrBldr.addAttribute(Attr);
3492 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3493 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3494
3495 CodeGenIP = Builder.saveIP();
3496 Expected<Function *> ReductionResult =
3497 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3498 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3499 if (!ReductionResult)
3500 return ReductionResult.takeError();
3501 Function *ReductionFunc = *ReductionResult;
3502 Builder.restoreIP(CodeGenIP);
3503
3504 // Set the grid value in the config needed for lowering later on
3505 if (GridValue.has_value())
3506 Config.setGridValue(GridValue.value());
3507 else
3508 Config.setGridValue(getGridValue(T, ReductionFunc));
3509
3510 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3511 // RedList, shuffle_reduce_func, interwarp_copy_func);
3512 // or
3513 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3514 Value *Res;
3515
3516 // 1. Build a list of reduction variables.
3517 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3518 auto Size = ReductionInfos.size();
3519 Type *PtrTy = PointerType::getUnqual(Ctx);
3520 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3521 CodeGenIP = Builder.saveIP();
3522 Builder.restoreIP(AllocaIP);
3523 Value *ReductionListAlloca =
3524 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3525 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3526 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3527 Builder.restoreIP(CodeGenIP);
3528 Type *IndexTy = Builder.getIndexTy(
3529 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3530 for (auto En : enumerate(ReductionInfos)) {
3531 const ReductionInfo &RI = En.value();
3532 Value *ElemPtr = Builder.CreateInBoundsGEP(
3533 RedArrayTy, ReductionList,
3534 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3535 Value *CastElem =
3536 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3537 Builder.CreateStore(CastElem, ElemPtr);
3538 }
3539 CodeGenIP = Builder.saveIP();
3540 Function *SarFunc =
3541 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3542 Expected<Function *> CopyResult =
3543 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3544 if (!CopyResult)
3545 return CopyResult.takeError();
3546 Function *WcFunc = *CopyResult;
3547 Builder.restoreIP(CodeGenIP);
3548
3549 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3550
3551 unsigned MaxDataSize = 0;
3552 SmallVector<Type *> ReductionTypeArgs;
3553 for (auto En : enumerate(ReductionInfos)) {
3554 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3555 if (Size > MaxDataSize)
3556 MaxDataSize = Size;
3557 ReductionTypeArgs.emplace_back(En.value().ElementType);
3558 }
3559 Value *ReductionDataSize =
3560 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3561 if (!IsTeamsReduction) {
3562 Value *SarFuncCast =
3563 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3564 Value *WcFuncCast =
3565 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3566 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3567 WcFuncCast};
3568 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3569 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3570 Res = Builder.CreateCall(Pv2Ptr, Args);
3571 } else {
3572 CodeGenIP = Builder.saveIP();
3573 StructType *ReductionsBufferTy = StructType::create(
3574 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3575 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3576 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3577 Function *LtGCFunc = emitListToGlobalCopyFunction(
3578 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3579 Function *LtGRFunc = emitListToGlobalReduceFunction(
3580 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3581 Function *GtLCFunc = emitGlobalToListCopyFunction(
3582 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3583 Function *GtLRFunc = emitGlobalToListReduceFunction(
3584 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3585 Builder.restoreIP(CodeGenIP);
3586
3587 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3588 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3589
3590 Value *Args3[] = {SrcLocInfo,
3591 KernelTeamsReductionPtr,
3592 Builder.getInt32(ReductionBufNum),
3593 ReductionDataSize,
3594 RL,
3595 SarFunc,
3596 WcFunc,
3597 LtGCFunc,
3598 LtGRFunc,
3599 GtLCFunc,
3600 GtLRFunc};
3601
3602 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3603 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3604 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3605 }
3606
3607 // 5. Build if (res == 1)
3608 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3609 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3610 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3611 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3612
3613 // 6. Build then branch: where we have reduced values in the master
3614 // thread in each team.
3615 // __kmpc_end_reduce{_nowait}(<gtid>);
3616 // break;
3617 emitBlock(ThenBB, CurFunc);
3618
3619 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3620 for (auto En : enumerate(ReductionInfos)) {
3621 const ReductionInfo &RI = En.value();
3622 Value *LHS = RI.Variable;
3623 Value *RHS =
3624 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3625
3626 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3627 Value *LHSPtr, *RHSPtr;
3628 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3629 &LHSPtr, &RHSPtr, CurFunc));
3630
3631 // Fix the callback code generated to use the correct Values for the LHS
3632 // and RHS.
3633 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3634 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3635 ReductionFunc;
3636 });
3637 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3638 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3639 ReductionFunc;
3640 });
3641 } else {
3642 assert(false && "Unhandled ReductionGenCBKind");
3643 }
3644 }
3645 emitBlock(ExitBB, CurFunc);
3646
3647 Config.setEmitLLVMUsed();
3648
3649 return Builder.saveIP();
3650}
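// Illustrative shape of the IR emitted above for the non-teams case (value
// names are examples only):
//   %res = call i32 @__kmpc_nvptx_parallel_reduce_nowait_v2(
//            ptr %ident, i64 %reduction_data_size, ptr %rl,
//            ptr @shuffle_and_reduce_func, ptr @inter_warp_copy_func)
//   %cond = icmp eq i32 %res, 1
//   br i1 %cond, label %.omp.reduction.then, label %.omp.reduction.done
// The teams case instead calls __kmpc_nvptx_teams_reduce_nowait_v2, passing
// the four list<->global helpers and the fixed team-reduction buffer.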
3651
3652static Function *getFreshReductionFunc(Module &M) {
3653 Type *VoidTy = Type::getVoidTy(M.getContext());
3654 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3655 auto *FuncTy =
3656 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3657 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3658 ".omp.reduction.func", &M);
3659}
3660
3661OpenMPIRBuilder::InsertPointOrErrorTy
3662OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3663 InsertPointTy AllocaIP,
3664 ArrayRef<ReductionInfo> ReductionInfos,
3665 ArrayRef<bool> IsByRef, bool IsNoWait) {
3666 assert(ReductionInfos.size() == IsByRef.size());
3667 for (const ReductionInfo &RI : ReductionInfos) {
3668 (void)RI;
3669 assert(RI.Variable && "expected non-null variable");
3670 assert(RI.PrivateVariable && "expected non-null private variable");
3671 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3672 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3673 "expected variables and their private equivalents to have the same "
3674 "type");
3675 assert(RI.Variable->getType()->isPointerTy() &&
3676 "expected variables to be pointers");
3677 }
3678
3679 if (!updateToLocation(Loc))
3680 return InsertPointTy();
3681
3682 BasicBlock *InsertBlock = Loc.IP.getBlock();
3683 BasicBlock *ContinuationBlock =
3684 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3685 InsertBlock->getTerminator()->eraseFromParent();
3686
3687 // Create and populate array of type-erased pointers to private reduction
3688 // values.
3689 unsigned NumReductions = ReductionInfos.size();
3690 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3691 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3692 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3693
3694 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3695
3696 for (auto En : enumerate(ReductionInfos)) {
3697 unsigned Index = En.index();
3698 const ReductionInfo &RI = En.value();
3699 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3700 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3701 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3702 }
3703
3704 // Emit a call to the runtime function that orchestrates the reduction.
3705 // Declare the reduction function in the process.
3706 Function *Func = Builder.GetInsertBlock()->getParent();
3707 Module *Module = Func->getParent();
3708 uint32_t SrcLocStrSize;
3709 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3710 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3711 return RI.AtomicReductionGen;
3712 });
3713 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3714 CanGenerateAtomic
3715 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3716 : IdentFlag(0));
3717 Value *ThreadId = getOrCreateThreadID(Ident);
3718 Constant *NumVariables = Builder.getInt32(NumReductions);
3719 const DataLayout &DL = Module->getDataLayout();
3720 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3721 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3722 Function *ReductionFunc = getFreshReductionFunc(*Module);
3723 Value *Lock = getOMPCriticalRegionLock(".reduction");
3724 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3725 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3726 : RuntimeFunction::OMPRTL___kmpc_reduce);
3727 CallInst *ReduceCall =
3728 Builder.CreateCall(ReduceFunc,
3729 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3730 ReductionFunc, Lock},
3731 "reduce");
3732
3733 // Create final reduction entry blocks for the atomic and non-atomic case.
3734 // Emit IR that dispatches control flow to one of the blocks based on the
3735 // reduction supporting the atomic mode.
3736 BasicBlock *NonAtomicRedBlock =
3737 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3738 BasicBlock *AtomicRedBlock =
3739 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3740 SwitchInst *Switch =
3741 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3742 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3743 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3744
3745 // Populate the non-atomic reduction using the elementwise reduction function.
3746 // This loads the elements from the global and private variables and reduces
3747 // them before storing back the result to the global variable.
3748 Builder.SetInsertPoint(NonAtomicRedBlock);
3749 for (auto En : enumerate(ReductionInfos)) {
3750 const ReductionInfo &RI = En.value();
3751 Type *ValueType = RI.ElementType;
3752 // We have one less load for the by-ref case because that load is now
3753 // inside the reduction region.
3754 Value *RedValue = RI.Variable;
3755 if (!IsByRef[En.index()]) {
3756 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3757 "red.value." + Twine(En.index()));
3758 }
3759 Value *PrivateRedValue =
3760 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3761 "red.private.value." + Twine(En.index()));
3762 Value *Reduced;
3763 InsertPointOrErrorTy AfterIP =
3764 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3765 if (!AfterIP)
3766 return AfterIP.takeError();
3767 Builder.restoreIP(*AfterIP);
3768
3769 if (!Builder.GetInsertBlock())
3770 return InsertPointTy();
3771 // For the by-ref case, the load is inside the reduction region.
3772 if (!IsByRef[En.index()])
3773 Builder.CreateStore(Reduced, RI.Variable);
3774 }
3775 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3776 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3777 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3778 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3779 Builder.CreateBr(ContinuationBlock);
3780
3781 // Populate the atomic reduction using the atomic elementwise reduction
3782 // function. There are no loads/stores here because they will be happening
3783 // inside the atomic elementwise reduction.
3784 Builder.SetInsertPoint(AtomicRedBlock);
3785 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3786 for (const ReductionInfo &RI : ReductionInfos) {
3787 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3788 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3789 if (!AfterIP)
3790 return AfterIP.takeError();
3791 Builder.restoreIP(*AfterIP);
3792 if (!Builder.GetInsertBlock())
3793 return InsertPointTy();
3794 }
3795 Builder.CreateBr(ContinuationBlock);
3796 } else {
3797 Builder.CreateUnreachable();
3798 }
3799
3800 // Populate the outlined reduction function using the elementwise reduction
3801 // function. Partial values are extracted from the type-erased array of
3802 // pointers to private variables.
3803 BasicBlock *ReductionFuncBlock =
3804 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3805 Builder.SetInsertPoint(ReductionFuncBlock);
3806 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3807 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3808
3809 for (auto En : enumerate(ReductionInfos)) {
3810 const ReductionInfo &RI = En.value();
3811 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3812 RedArrayTy, LHSArrayPtr, 0, En.index());
3813 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3814 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3815 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3816 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3817 RedArrayTy, RHSArrayPtr, 0, En.index());
3818 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3819 Value *RHSPtr =
3820 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3821 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3822 Value *Reduced;
3823 InsertPointOrErrorTy AfterIP =
3824 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3825 if (!AfterIP)
3826 return AfterIP.takeError();
3827 Builder.restoreIP(*AfterIP);
3828 if (!Builder.GetInsertBlock())
3829 return InsertPointTy();
3830 // The store is inside the reduction region when using by-ref.
3831 if (!IsByRef[En.index()])
3832 Builder.CreateStore(Reduced, LHSPtr);
3833 }
3834 Builder.CreateRetVoid();
3835
3836 Builder.SetInsertPoint(ContinuationBlock);
3837 return Builder.saveIP();
3838}
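// The host-side sequence emitted above follows the canonical libomp protocol.
// As a pseudo-C sketch (the switch on the __kmpc_reduce result corresponds to
// the two case blocks created above):
//
//   switch (__kmpc_reduce(&loc, tid, n, sizeof(RedList), RedList,
//                         reduction_func, &lock)) {
//   case 1: // non-atomic: reduce into the globals, then release
//     ... ; __kmpc_end_reduce(&loc, tid, &lock); break;
//   case 2: // atomic: each AtomicReductionGen updates its global atomically
//     ... ; break;
//   default: break; // nothing to do, fall through to reduce.finalize
//   }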
3839
3840OpenMPIRBuilder::InsertPointOrErrorTy
3841OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3842 BodyGenCallbackTy BodyGenCB,
3843 FinalizeCallbackTy FiniCB) {
3844 if (!updateToLocation(Loc))
3845 return Loc.IP;
3846
3847 Directive OMPD = Directive::OMPD_master;
3848 uint32_t SrcLocStrSize;
3849 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3850 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3851 Value *ThreadId = getOrCreateThreadID(Ident);
3852 Value *Args[] = {Ident, ThreadId};
3853
3854 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3855 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3856
3857 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3858 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3859
3860 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3861 /*Conditional*/ true, /*hasFinalize*/ true);
3862}
3863
3864OpenMPIRBuilder::InsertPointOrErrorTy
3865OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3866 BodyGenCallbackTy BodyGenCB,
3867 FinalizeCallbackTy FiniCB, Value *Filter) {
3868 if (!updateToLocation(Loc))
3869 return Loc.IP;
3870
3871 Directive OMPD = Directive::OMPD_masked;
3872 uint32_t SrcLocStrSize;
3873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3874 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3875 Value *ThreadId = getOrCreateThreadID(Ident);
3876 Value *Args[] = {Ident, ThreadId, Filter};
3877 Value *ArgsEnd[] = {Ident, ThreadId};
3878
3879 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3880 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3881
3882 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3883 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3884
3885 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3886 /*Conditional*/ true, /*hasFinalize*/ true);
3887}
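// Both createMaster and createMasked build a conditional inlined region, so
// the emitted code is roughly (illustrative sketch, assuming the usual libomp
// convention that the entry call returns nonzero for the selected thread):
//
//   if (__kmpc_master(&loc, tid)) {   // or __kmpc_masked(&loc, tid, filter)
//     <body>
//     __kmpc_end_master(&loc, tid);   // or __kmpc_end_masked(&loc, tid)
//   }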
3888
3889CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3890 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3891 BasicBlock *PostInsertBefore, const Twine &Name) {
3892 Module *M = F->getParent();
3893 LLVMContext &Ctx = M->getContext();
3894 Type *IndVarTy = TripCount->getType();
3895
3896 // Create the basic block structure.
3897 BasicBlock *Preheader =
3898 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3899 BasicBlock *Header =
3900 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3901 BasicBlock *Cond =
3902 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3903 BasicBlock *Body =
3904 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3905 BasicBlock *Latch =
3906 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3907 BasicBlock *Exit =
3908 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3909 BasicBlock *After =
3910 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3911
3912 // Use specified DebugLoc for new instructions.
3913 Builder.SetCurrentDebugLocation(DL);
3914
3915 Builder.SetInsertPoint(Preheader);
3916 Builder.CreateBr(Header);
3917
3918 Builder.SetInsertPoint(Header);
3919 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3920 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3921 Builder.CreateBr(Cond);
3922
3923 Builder.SetInsertPoint(Cond);
3924 Value *Cmp =
3925 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3926 Builder.CreateCondBr(Cmp, Body, Exit);
3927
3928 Builder.SetInsertPoint(Body);
3929 Builder.CreateBr(Latch);
3930
3931 Builder.SetInsertPoint(Latch);
3932 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3933 "omp_" + Name + ".next", /*HasNUW=*/true);
3934 Builder.CreateBr(Header);
3935 IndVarPHI->addIncoming(Next, Latch);
3936
3937 Builder.SetInsertPoint(Exit);
3938 Builder.CreateBr(After);
3939
3940 // Remember and return the canonical control flow.
3941 LoopInfos.emplace_front();
3942 CanonicalLoopInfo *CL = &LoopInfos.front();
3943
3944 CL->Header = Header;
3945 CL->Cond = Cond;
3946 CL->Latch = Latch;
3947 CL->Exit = Exit;
3948
3949#ifndef NDEBUG
3950 CL->assertOK();
3951#endif
3952 return CL;
3953}
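// The skeleton built above produces this CFG (illustrative; the body is empty
// until the caller's BodyGenCB fills it in):
//
//   preheader -> header -> cond --true--> body -> latch -> header (backedge)
//                          cond --false-> exit -> after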
3954
3955Expected<CanonicalLoopInfo *>
3956OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3957 LoopBodyGenCallbackTy BodyGenCB,
3958 Value *TripCount, const Twine &Name) {
3959 BasicBlock *BB = Loc.IP.getBlock();
3960 BasicBlock *NextBB = BB->getNextNode();
3961
3962 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3963 NextBB, NextBB, Name);
3964 BasicBlock *After = CL->getAfter();
3965
3966 // If location is not set, don't connect the loop.
3967 if (updateToLocation(Loc)) {
3968 // Split the loop at the insertion point: Branch to the preheader and move
3969 // every following instruction to after the loop (the After BB). Also, the
3970 // new successor is the loop's after block.
3971 spliceBB(Builder, After, /*CreateBranch=*/false);
3972 Builder.CreateBr(CL->getPreheader());
3973 }
3974
3975 // Emit the body content. We do it after connecting the loop to the CFG so
3976 // that the callback does not encounter degenerate BBs.
3977 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
3978 return Err;
3979
3980#ifndef NDEBUG
3981 CL->assertOK();
3982#endif
3983 return CL;
3984}
3985
3986Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
3987 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3988 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3989 InsertPointTy ComputeIP, const Twine &Name) {
3990
3991 // Consider the following difficulties (assuming 8-bit signed integers):
3992 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3993 // DO I = 1, 100, 50
3994 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3995 // DO I = 100, 0, -128
3996
3997 // Start, Stop and Step must be of the same integer type.
3998 auto *IndVarTy = cast<IntegerType>(Start->getType());
3999 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4000 assert(IndVarTy == Step->getType() && "Step type mismatch");
4001
4002 LocationDescription ComputeLoc =
4003 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4004 updateToLocation(ComputeLoc);
4005
4006 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4007 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4008
4009 // Like Step, but always positive.
4010 Value *Incr = Step;
4011
4012 // Distance between Start and Stop; always positive.
4013 Value *Span;
4014
4015 // Condition that holds when no iterations are executed at all, e.g.
4016 // because UB < LB.
4017 Value *ZeroCmp;
4018
4019 if (IsSigned) {
4020 // Ensure that increment is positive. If not, negate and invert LB and UB.
4021 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4022 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4023 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4024 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4025 Span = Builder.CreateSub(UB, LB, "", false, true);
4026 ZeroCmp = Builder.CreateICmp(
4027 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4028 } else {
4029 Span = Builder.CreateSub(Stop, Start, "", true);
4030 ZeroCmp = Builder.CreateICmp(
4031 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4032 }
4033
4034 Value *CountIfLooping;
4035 if (InclusiveStop) {
4036 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4037 } else {
4038 // Avoid incrementing past stop since it could overflow.
4039 Value *CountIfTwo = Builder.CreateAdd(
4040 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4041 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4042 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4043 }
4044 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4045 "omp_" + Name + ".tripcount");
4046
4047 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4048 Builder.restoreIP(CodeGenIP);
4049 Value *Span = Builder.CreateMul(IV, Step);
4050 Value *IndVar = Builder.CreateAdd(Span, Start);
4051 return BodyGenCB(Builder.saveIP(), IndVar);
4052 };
4053 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
4054 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4055}
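// Worked example of the trip count logic above (illustrative): for the
// inclusive loop DO I = 1, 100, 50 we get Incr=50, LB=1, UB=100, Span=99, and
// TripCount = 99 udiv 50 + 1 = 2, matching the iterations I=1 and I=51. For
// DO I = 100, 0, -1, IsNeg holds, so the bounds flip to LB=0, UB=100 with
// Incr=1, Span=100, and TripCount = 100 udiv 1 + 1 = 101.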
4056
4057// Returns an LLVM function to call for initializing loop bounds using OpenMP
4058// static scheduling depending on `Ty`. Only i32 and i64 are supported by the
4059// runtime. Always interpret integers as unsigned similarly to
4060// CanonicalLoopInfo.
4061static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4062 OpenMPIRBuilder &OMPBuilder) {
4063 unsigned Bitwidth = Ty->getIntegerBitWidth();
4064 if (Bitwidth == 32)
4065 return OMPBuilder.getOrCreateRuntimeFunction(
4066 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4067 if (Bitwidth == 64)
4068 return OMPBuilder.getOrCreateRuntimeFunction(
4069 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4070 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4071}
4072
4073OpenMPIRBuilder::InsertPointOrErrorTy
4074OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
4075 InsertPointTy AllocaIP,
4076 bool NeedsBarrier) {
4077 assert(CLI->isValid() && "Requires a valid canonical loop");
4078 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4079 "Require dedicated allocate IP");
4080
4081 // Set up the source location value for OpenMP runtime.
4082 Builder.restoreIP(CLI->getPreheaderIP());
4083 Builder.SetCurrentDebugLocation(DL);
4084
4085 uint32_t SrcLocStrSize;
4086 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4087 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4088
4089 // Declare useful OpenMP runtime functions.
4090 Value *IV = CLI->getIndVar();
4091 Type *IVTy = IV->getType();
4092 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4093 FunctionCallee StaticFini =
4094 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4095
4096 // Allocate space for computed loop bounds as expected by the "init" function.
4097 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4098
4099 Type *I32Type = Type::getInt32Ty(M.getContext());
4100 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4101 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4102 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4103 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4104
4105 // At the end of the preheader, prepare for calling the "init" function by
4106 // storing the current loop bounds into the allocated space. A canonical loop
4107 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4108 // and produces an inclusive upper bound.
4109 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4110 Constant *Zero = ConstantInt::get(IVTy, 0);
4111 Constant *One = ConstantInt::get(IVTy, 1);
4112 Builder.CreateStore(Zero, PLowerBound);
4113 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4114 Builder.CreateStore(UpperBound, PUpperBound);
4115 Builder.CreateStore(One, PStride);
4116
4117 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4118
4119 Constant *SchedulingType = ConstantInt::get(
4120 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4121
4122 // Call the "init" function and update the trip count of the loop with the
4123 // value it produced.
4124 Builder.CreateCall(StaticInit,
4125 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4126 PUpperBound, PStride, One, Zero});
4127 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4128 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4129 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4130 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4131 CLI->setTripCount(TripCount);
4132
4133 // Update all uses of the induction variable except the one in the condition
4134 // block that compares it with the actual upper bound, and the increment in
4135 // the latch block.
4136
4137 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4138 Builder.SetInsertPoint(CLI->getBody(),
4139 CLI->getBody()->getFirstInsertionPt());
4140 Builder.SetCurrentDebugLocation(DL);
4141 return Builder.CreateAdd(OldIV, LowerBound);
4142 });
4143
4144 // In the "exit" block, call the "fini" function.
4145 Builder.SetInsertPoint(CLI->getExit(),
4146 CLI->getExit()->getTerminator()->getIterator());
4147 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4148
4149 // Add the barrier if requested.
4150 if (NeedsBarrier) {
4151 InsertPointOrErrorTy BarrierIP =
4152 createBarrier(LocationDescription(Builder.saveIP(), DL),
4153 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4154 /* CheckCancelFlag */ false);
4155 if (!BarrierIP)
4156 return BarrierIP.takeError();
4157 }
4158
4159 InsertPointTy AfterIP = CLI->getAfterIP();
4160 CLI->invalidate();
4161
4162 return AfterIP;
4163}
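// Worked example of the static lowering above (illustrative): for an original
// trip count of 100 and 4 threads, __kmpc_for_static_init_4u may give thread 1
// the inclusive bounds [25, 49]. The rewritten loop then runs with trip count
// 49 - 25 + 1 = 25, and mapIndVar shifts every use of the induction variable
// by the thread's lower bound, i.e. iv' = iv + 25.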
4164
4165OpenMPIRBuilder::InsertPointOrErrorTy
4166OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
4167 CanonicalLoopInfo *CLI,
4168 InsertPointTy AllocaIP,
4169 bool NeedsBarrier,
4170 Value *ChunkSize) {
4171 assert(CLI->isValid() && "Requires a valid canonical loop");
4172 assert(ChunkSize && "Chunk size is required");
4173
4174 LLVMContext &Ctx = CLI->getFunction()->getContext();
4175 Value *IV = CLI->getIndVar();
4176 Value *OrigTripCount = CLI->getTripCount();
4177 Type *IVTy = IV->getType();
4178 assert(IVTy->getIntegerBitWidth() <= 64 &&
4179 "Max supported tripcount bitwidth is 64 bits");
4180 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4181 : Type::getInt64Ty(Ctx);
4182 Type *I32Type = Type::getInt32Ty(M.getContext());
4183 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4184 Constant *One = ConstantInt::get(InternalIVTy, 1);
4185
4186 // Declare useful OpenMP runtime functions.
4187 FunctionCallee StaticInit =
4188 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4189 FunctionCallee StaticFini =
4190 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4191
4192 // Allocate space for computed loop bounds as expected by the "init" function.
4193 Builder.restoreIP(AllocaIP);
4194 Builder.SetCurrentDebugLocation(DL);
4195 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4196 Value *PLowerBound =
4197 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4198 Value *PUpperBound =
4199 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4200 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4201
4202 // Set up the source location value for the OpenMP runtime.
4203 Builder.restoreIP(CLI->getPreheaderIP());
4204 Builder.SetCurrentDebugLocation(DL);
4205
4206 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4207 Value *CastedChunkSize =
4208 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4209 Value *CastedTripCount =
4210 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4211
4212 Constant *SchedulingType = ConstantInt::get(
4213 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4214 Builder.CreateStore(Zero, PLowerBound);
4215 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4216 Builder.CreateStore(OrigUpperBound, PUpperBound);
4217 Builder.CreateStore(One, PStride);
4218
4219 // Call the "init" function and update the trip count of the loop with the
4220 // value it produced.
4221 uint32_t SrcLocStrSize;
4222 Constant *SrcLocStr =