LLVM 19.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/SmallSet.h"
18#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/Attributes.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/CFG.h"
31#include "llvm/IR/CallingConv.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
36#include "llvm/IR/Function.h"
38#include "llvm/IR/IRBuilder.h"
39#include "llvm/IR/LLVMContext.h"
40#include "llvm/IR/MDBuilder.h"
41#include "llvm/IR/Metadata.h"
42#include "llvm/IR/PassManager.h"
45#include "llvm/IR/Value.h"
57
58#include <cstdint>
59#include <optional>
60#include <stack>
61
62#define DEBUG_TYPE "openmp-ir-builder"
63
64using namespace llvm;
65using namespace omp;
66
// Command-line toggle: when set, runtime-call declarations receive optimistic
// "as-if" attributes (see cl::desc). Hidden from -help; defaults to off.
67static cl::opt<bool>
68 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
69 cl::desc("Use optimistic attributes describing "
70 "'as-if' properties of runtime calls."),
71 cl::init(false));
72
// NOTE(review): the declaration line of this option (presumably
// `static cl::opt<double> UnrollThresholdFactor(` at doxygen line 73) is not
// visible in this view -- confirm against the upstream file. Defaults to 1.5.
74 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
75 cl::desc("Factor for the unroll threshold to account for code "
76 "simplifications still taking place"),
77 cl::init(1.5));
79#ifndef NDEBUG
80/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
81/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
82/// an InsertPoint stores the instruction before something is inserted. For
83/// instance, if both point to the same instruction, two IRBuilders alternately
84/// creating instructions will cause the instructions to be interleaved.
87 if (!IP1.isSet() || !IP2.isSet())
88 return false;
89 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
90}
91
93 // Valid ordered/unordered and base algorithm combinations.
94 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
95 case OMPScheduleType::UnorderedStaticChunked:
96 case OMPScheduleType::UnorderedStatic:
97 case OMPScheduleType::UnorderedDynamicChunked:
98 case OMPScheduleType::UnorderedGuidedChunked:
99 case OMPScheduleType::UnorderedRuntime:
100 case OMPScheduleType::UnorderedAuto:
101 case OMPScheduleType::UnorderedTrapezoidal:
102 case OMPScheduleType::UnorderedGreedy:
103 case OMPScheduleType::UnorderedBalanced:
104 case OMPScheduleType::UnorderedGuidedIterativeChunked:
105 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
106 case OMPScheduleType::UnorderedSteal:
107 case OMPScheduleType::UnorderedStaticBalancedChunked:
108 case OMPScheduleType::UnorderedGuidedSimd:
109 case OMPScheduleType::UnorderedRuntimeSimd:
110 case OMPScheduleType::OrderedStaticChunked:
111 case OMPScheduleType::OrderedStatic:
112 case OMPScheduleType::OrderedDynamicChunked:
113 case OMPScheduleType::OrderedGuidedChunked:
114 case OMPScheduleType::OrderedRuntime:
115 case OMPScheduleType::OrderedAuto:
116 case OMPScheduleType::OrderdTrapezoidal:
117 case OMPScheduleType::NomergeUnorderedStaticChunked:
118 case OMPScheduleType::NomergeUnorderedStatic:
119 case OMPScheduleType::NomergeUnorderedDynamicChunked:
120 case OMPScheduleType::NomergeUnorderedGuidedChunked:
121 case OMPScheduleType::NomergeUnorderedRuntime:
122 case OMPScheduleType::NomergeUnorderedAuto:
123 case OMPScheduleType::NomergeUnorderedTrapezoidal:
124 case OMPScheduleType::NomergeUnorderedGreedy:
125 case OMPScheduleType::NomergeUnorderedBalanced:
126 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
127 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
128 case OMPScheduleType::NomergeUnorderedSteal:
129 case OMPScheduleType::NomergeOrderedStaticChunked:
130 case OMPScheduleType::NomergeOrderedStatic:
131 case OMPScheduleType::NomergeOrderedDynamicChunked:
132 case OMPScheduleType::NomergeOrderedGuidedChunked:
133 case OMPScheduleType::NomergeOrderedRuntime:
134 case OMPScheduleType::NomergeOrderedAuto:
135 case OMPScheduleType::NomergeOrderedTrapezoidal:
136 break;
137 default:
138 return false;
139 }
140
141 // Must not set both monotonicity modifiers at the same time.
142 OMPScheduleType MonotonicityFlags =
143 SchedType & OMPScheduleType::MonotonicityMask;
144 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
145 return false;
146
147 return true;
148}
149#endif
150
/// Return the target-specific grid-size constants for \p Kernel based on the
/// target triple \p T. For AMDGPU the wavefront size is read from the
/// kernel's "target-features" string attribute.
151static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
152 if (T.isAMDGPU()) {
153 StringRef Features =
154 Kernel->getFnAttribute("target-features").getValueAsString();
// Wave64 vs. wave32 selects between two sets of grid values.
155 if (Features.count("+wavefrontsize64"))
156 return omp::getAMDGPUGridValues<64>();
157 return omp::getAMDGPUGridValues<32>();
158 }
159 if (T.isNVPTX())
// NOTE(review): the body of the NVPTX branch (doxygen line 160, presumably
// `return omp::getNVPTXGridValues();`) is elided in this view -- as shown
// here the branch would fall into the unreachable; verify against upstream.
161 llvm_unreachable("No grid value available for this architecture!");
162}
163
164/// Determine which scheduling algorithm to use, determined from schedule clause
165/// arguments.
166static OMPScheduleType
167getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
168 bool HasSimdModifier) {
169 // Currently, the default schedule is static.
170 switch (ClauseKind) {
171 case OMP_SCHEDULE_Default:
172 case OMP_SCHEDULE_Static:
// A chunk clause selects the chunked variant of the static schedule.
173 return HasChunks ? OMPScheduleType::BaseStaticChunked
174 : OMPScheduleType::BaseStatic;
175 case OMP_SCHEDULE_Dynamic:
176 return OMPScheduleType::BaseDynamicChunked;
177 case OMP_SCHEDULE_Guided:
178 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
179 : OMPScheduleType::BaseGuidedChunked;
180 case OMP_SCHEDULE_Auto:
// NOTE(review): the return for OMP_SCHEDULE_Auto (doxygen line 181,
// presumably `return OMPScheduleType::BaseAuto;`) is elided in this view;
// as shown, Auto would fall through to the Runtime case. Verify upstream.
182 case OMP_SCHEDULE_Runtime:
183 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
184 : OMPScheduleType::BaseRuntime;
185 }
186 llvm_unreachable("unhandled schedule clause argument");
187}
188
189/// Adds ordering modifier flags to schedule type.
190static OMPScheduleType
192 bool HasOrderedClause) {
193 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
194 OMPScheduleType::None &&
195 "Must not have ordering nor monotonicity flags already set");
196
197 OMPScheduleType OrderingModifier = HasOrderedClause
198 ? OMPScheduleType::ModifierOrdered
199 : OMPScheduleType::ModifierUnordered;
200 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
201
202 // Unsupported combinations
203 if (OrderingScheduleType ==
204 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
205 return OMPScheduleType::OrderedGuidedChunked;
206 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
207 OMPScheduleType::ModifierOrdered))
208 return OMPScheduleType::OrderedRuntime;
209
210 return OrderingScheduleType;
211}
212
213/// Adds monotonicity modifier flags to schedule type.
214static OMPScheduleType
216 bool HasSimdModifier, bool HasMonotonic,
217 bool HasNonmonotonic, bool HasOrderedClause) {
218 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
219 OMPScheduleType::None &&
220 "Must not have monotonicity flags already set");
221 assert((!HasMonotonic || !HasNonmonotonic) &&
222 "Monotonic and Nonmonotonic are contradicting each other");
223
224 if (HasMonotonic) {
225 return ScheduleType | OMPScheduleType::ModifierMonotonic;
226 } else if (HasNonmonotonic) {
227 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
228 } else {
229 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
230 // If the static schedule kind is specified or if the ordered clause is
231 // specified, and if the nonmonotonic modifier is not specified, the
232 // effect is as if the monotonic modifier is specified. Otherwise, unless
233 // the monotonic modifier is specified, the effect is as if the
234 // nonmonotonic modifier is specified.
235 OMPScheduleType BaseScheduleType =
236 ScheduleType & ~OMPScheduleType::ModifierMask;
237 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
238 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
239 HasOrderedClause) {
240 // The monotonic is used by default in openmp runtime library, so no need
241 // to set it.
242 return ScheduleType;
243 } else {
244 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
245 }
246 }
247}
248
249/// Determine the schedule type using schedule and ordering clause arguments.
250static OMPScheduleType
251computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
252 bool HasSimdModifier, bool HasMonotonicModifier,
253 bool HasNonmonotonicModifier, bool HasOrderedClause) {
254 OMPScheduleType BaseSchedule =
255 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
256 OMPScheduleType OrderedSchedule =
257 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
259 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
260 HasNonmonotonicModifier, HasOrderedClause);
261
263 return Result;
264}
265
266/// Make \p Source branch to \p Target.
267///
268/// Handles two situations:
269/// * \p Source already has an unconditional branch.
270/// * \p Source is a degenerate block (no terminator because the BB is
271/// the current head of the IR construction).
273 if (Instruction *Term = Source->getTerminator()) {
274 auto *Br = cast<BranchInst>(Term);
275 assert(!Br->isConditional() &&
276 "BB's terminator must be an unconditional branch (or degenerate)");
277 BasicBlock *Succ = Br->getSuccessor(0);
278 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
279 Br->setSuccessor(0, Target);
280 return;
281 }
282
283 auto *NewBr = BranchInst::Create(Target, Source);
284 NewBr->setDebugLoc(DL);
285}
286
288 bool CreateBranch) {
289 assert(New->getFirstInsertionPt() == New->begin() &&
290 "Target BB must not have PHI nodes");
291
292 // Move instructions to new block.
293 BasicBlock *Old = IP.getBlock();
294 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
295
296 if (CreateBranch)
297 BranchInst::Create(New, Old);
298}
299
300void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
302 BasicBlock *Old = Builder.GetInsertBlock();
303
304 spliceBB(Builder.saveIP(), New, CreateBranch);
305 if (CreateBranch)
306 Builder.SetInsertPoint(Old->getTerminator());
307 else
308 Builder.SetInsertPoint(Old);
309
310 // SetInsertPoint also updates the Builder's debug location, but we want to
311 // keep the one the Builder was configured to use.
313}
314
317 BasicBlock *Old = IP.getBlock();
319 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
320 Old->getParent(), Old->getNextNode());
321 spliceBB(IP, New, CreateBranch);
322 New->replaceSuccessorsPhiUsesWith(Old, New);
323 return New;
324}
325
326BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
329 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
330 if (CreateBranch)
331 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
332 else
333 Builder.SetInsertPoint(Builder.GetInsertBlock());
334 // SetInsertPoint also updates the Builder's debug location, but we want to
335 // keep the one the Builder was configured to use.
337 return New;
338}
339
340BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
343 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
344 if (CreateBranch)
345 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
346 else
347 Builder.SetInsertPoint(Builder.GetInsertBlock());
348 // SetInsertPoint also updates the Builder's debug location, but we want to
349 // keep the one the Builder was configured to use.
351 return New;
352}
353
355 llvm::Twine Suffix) {
356 BasicBlock *Old = Builder.GetInsertBlock();
357 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
358}
359
360// This function creates a fake integer value and a fake use for the integer
361// value. It returns the fake value created. This is useful in modeling the
362// extra arguments to the outlined functions.
364 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
365 std::stack<Instruction *> &ToBeDeleted,
366 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
367 const Twine &Name = "", bool AsPtr = true) {
368 Builder.restoreIP(OuterAllocaIP);
369 Instruction *FakeVal;
370 AllocaInst *FakeValAddr =
371 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
372 ToBeDeleted.push(FakeValAddr);
373
374 if (AsPtr) {
375 FakeVal = FakeValAddr;
376 } else {
377 FakeVal =
378 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
379 ToBeDeleted.push(FakeVal);
380 }
381
382 // Generate a fake use of this value
383 Builder.restoreIP(InnerAllocaIP);
384 Instruction *UseFakeVal;
385 if (AsPtr) {
386 UseFakeVal =
387 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
388 } else {
389 UseFakeVal =
390 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
391 }
392 ToBeDeleted.push(UseFakeVal);
393 return FakeVal;
394}
395
396//===----------------------------------------------------------------------===//
397// OpenMPIRBuilderConfig
398//===----------------------------------------------------------------------===//
399
400namespace {
402/// Values for bit flags for marking which requires clauses have been used.
/// Each flag is a distinct power of two; they are combined with bitwise OR
/// into OpenMPIRBuilderConfig::RequiresFlags (see the setters below), as
/// enabled by LLVM_MARK_AS_BITMASK_ENUM.
403enum OpenMPOffloadingRequiresDirFlags {
404 /// flag undefined.
405 OMP_REQ_UNDEFINED = 0x000,
406 /// no requires directive present.
407 OMP_REQ_NONE = 0x001,
408 /// reverse_offload clause.
409 OMP_REQ_REVERSE_OFFLOAD = 0x002,
410 /// unified_address clause.
411 OMP_REQ_UNIFIED_ADDRESS = 0x004,
412 /// unified_shared_memory clause.
413 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
414 /// dynamic_allocators clause.
415 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
416 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
417};
418
419} // anonymous namespace
420
422 : RequiresFlags(OMP_REQ_UNDEFINED) {}
423
425 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
426 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
427 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
428 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
429 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
430 RequiresFlags(OMP_REQ_UNDEFINED) {
431 if (HasRequiresReverseOffload)
432 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
433 if (HasRequiresUnifiedAddress)
434 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
435 if (HasRequiresUnifiedSharedMemory)
436 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
437 if (HasRequiresDynamicAllocators)
438 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
439}
440
442 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
443}
444
446 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
447}
448
450 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
451}
452
454 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
455}
456
458 return hasRequiresFlags() ? RequiresFlags
459 : static_cast<int64_t>(OMP_REQ_NONE);
460}
461
463 if (Value)
464 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
465 else
466 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
467}
468
470 if (Value)
471 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
472 else
473 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
474}
475
477 if (Value)
478 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
479 else
480 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
481}
482
484 if (Value)
485 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
486 else
487 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
488}
489
490//===----------------------------------------------------------------------===//
491// OpenMPIRBuilder
492//===----------------------------------------------------------------------===//
493
495 IRBuilderBase &Builder,
496 SmallVector<Value *> &ArgsVector) {
498 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
499 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
500 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, 3));
501 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
502
503 Value *NumTeams3D =
504 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams, {0});
505 Value *NumThreads3D =
506 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});
507
508 ArgsVector = {Version,
509 PointerNum,
510 KernelArgs.RTArgs.BasePointersArray,
511 KernelArgs.RTArgs.PointersArray,
512 KernelArgs.RTArgs.SizesArray,
513 KernelArgs.RTArgs.MapTypesArray,
514 KernelArgs.RTArgs.MapNamesArray,
515 KernelArgs.RTArgs.MappersArray,
516 KernelArgs.NumIterations,
517 Flags,
518 NumTeams3D,
519 NumThreads3D,
520 KernelArgs.DynCGGroupMem};
521}
522
524 LLVMContext &Ctx = Fn.getContext();
525
526 // Get the function's current attributes.
527 auto Attrs = Fn.getAttributes();
528 auto FnAttrs = Attrs.getFnAttrs();
529 auto RetAttrs = Attrs.getRetAttrs();
531 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
532 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
533
534 // Add AS to FnAS while taking special care with integer extensions.
535 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
536 bool Param = true) -> void {
537 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
538 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
539 if (HasSignExt || HasZeroExt) {
540 assert(AS.getNumAttributes() == 1 &&
541 "Currently not handling extension attr combined with others.");
542 if (Param) {
543 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
544 FnAS = FnAS.addAttribute(Ctx, AK);
545 } else if (auto AK =
546 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
547 FnAS = FnAS.addAttribute(Ctx, AK);
548 } else {
549 FnAS = FnAS.addAttributes(Ctx, AS);
550 }
551 };
552
553#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
554#include "llvm/Frontend/OpenMP/OMPKinds.def"
555
556 // Add attributes to the function declaration.
557 switch (FnID) {
558#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
559 case Enum: \
560 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
561 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
562 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
563 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
564 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
565 break;
566#include "llvm/Frontend/OpenMP/OMPKinds.def"
567 default:
568 // Attributes are optional.
569 break;
570 }
571}
572
575 FunctionType *FnTy = nullptr;
576 Function *Fn = nullptr;
577
578 // Try to find the declaration in the module first.
579 switch (FnID) {
580#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
581 case Enum: \
582 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
583 IsVarArg); \
584 Fn = M.getFunction(Str); \
585 break;
586#include "llvm/Frontend/OpenMP/OMPKinds.def"
587 }
588
589 if (!Fn) {
590 // Create a new declaration if we need one.
591 switch (FnID) {
592#define OMP_RTL(Enum, Str, ...) \
593 case Enum: \
594 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
595 break;
596#include "llvm/Frontend/OpenMP/OMPKinds.def"
597 }
598
599 // Add information if the runtime function takes a callback function
600 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
601 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
602 LLVMContext &Ctx = Fn->getContext();
603 MDBuilder MDB(Ctx);
604 // Annotate the callback behavior of the runtime function:
605 // - The callback callee is argument number 2 (microtask).
606 // - The first two arguments of the callback callee are unknown (-1).
607 // - All variadic arguments to the runtime function are passed to the
608 // callback callee.
609 Fn->addMetadata(
610 LLVMContext::MD_callback,
612 2, {-1, -1}, /* VarArgsArePassed */ true)}));
613 }
614 }
615
616 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
617 << " with type " << *Fn->getFunctionType() << "\n");
618 addAttributes(FnID, *Fn);
619
620 } else {
621 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
622 << " with type " << *Fn->getFunctionType() << "\n");
623 }
624
625 assert(Fn && "Failed to create OpenMP runtime function");
626
627 return {FnTy, Fn};
628}
629
632 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
633 assert(Fn && "Failed to create OpenMP runtime function pointer");
634 return Fn;
635}
636
637void OpenMPIRBuilder::initialize() { initializeTypes(M); }
638
641 BasicBlock &EntryBlock = Function->getEntryBlock();
642 Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();
643
644 // Loop over blocks looking for constant allocas, skipping the entry block
645 // as any allocas there are already in the desired location.
646 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
647 Block++) {
648 for (auto Inst = Block->getReverseIterator()->begin();
649 Inst != Block->getReverseIterator()->end();) {
650 if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
651 Inst++;
652 if (!isa<ConstantData>(AllocaInst->getArraySize()))
653 continue;
654 AllocaInst->moveBeforePreserving(MoveLocInst);
655 } else {
656 Inst++;
657 }
658 }
659 }
660}
661
663 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
665 SmallVector<OutlineInfo, 16> DeferredOutlines;
666 for (OutlineInfo &OI : OutlineInfos) {
667 // Skip functions that have not finalized yet; may happen with nested
668 // function generation.
669 if (Fn && OI.getFunction() != Fn) {
670 DeferredOutlines.push_back(OI);
671 continue;
672 }
673
674 ParallelRegionBlockSet.clear();
675 Blocks.clear();
676 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
677
678 Function *OuterFn = OI.getFunction();
679 CodeExtractorAnalysisCache CEAC(*OuterFn);
680 // If we generate code for the target device, we need to allocate
681 // struct for aggregate params in the device default alloca address space.
682 // OpenMP runtime requires that the params of the extracted functions are
683 // passed as zero address space pointers. This flag ensures that
684 // CodeExtractor generates correct code for extracted functions
685 // which are used by OpenMP runtime.
686 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
687 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
688 /* AggregateArgs */ true,
689 /* BlockFrequencyInfo */ nullptr,
690 /* BranchProbabilityInfo */ nullptr,
691 /* AssumptionCache */ nullptr,
692 /* AllowVarArgs */ true,
693 /* AllowAlloca */ true,
694 /* AllocaBlock*/ OI.OuterAllocaBB,
695 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
696
697 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
698 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
699 << " Exit: " << OI.ExitBB->getName() << "\n");
700 assert(Extractor.isEligible() &&
701 "Expected OpenMP outlining to be possible!");
702
703 for (auto *V : OI.ExcludeArgsFromAggregate)
704 Extractor.excludeArgFromAggregate(V);
705
706 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
707
708 // Forward target-cpu, target-features attributes to the outlined function.
709 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
710 if (TargetCpuAttr.isStringAttribute())
711 OutlinedFn->addFnAttr(TargetCpuAttr);
712
713 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
714 if (TargetFeaturesAttr.isStringAttribute())
715 OutlinedFn->addFnAttr(TargetFeaturesAttr);
716
717 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
718 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
719 assert(OutlinedFn->getReturnType()->isVoidTy() &&
720 "OpenMP outlined functions should not return a value!");
721
722 // For compatibility with the clang CG we move the outlined function after the
723 // one with the parallel region.
724 OutlinedFn->removeFromParent();
725 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
726
727 // Remove the artificial entry introduced by the extractor right away, we
728 // made our own entry block after all.
729 {
730 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
731 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
732 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
733 // Move instructions from the to-be-deleted ArtificialEntry to the entry
734 // basic block of the parallel region. CodeExtractor generates
735 // instructions to unwrap the aggregate argument and may sink
736 // allocas/bitcasts for values that are solely used in the outlined region
737 // and do not escape.
738 assert(!ArtificialEntry.empty() &&
739 "Expected instructions to add in the outlined region entry");
740 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
741 End = ArtificialEntry.rend();
742 It != End;) {
743 Instruction &I = *It;
744 It++;
745
746 if (I.isTerminator())
747 continue;
748
749 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
750 }
751
752 OI.EntryBB->moveBefore(&ArtificialEntry);
753 ArtificialEntry.eraseFromParent();
754 }
755 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
756 assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
757
758 // Run a user callback, e.g. to add attributes.
759 if (OI.PostOutlineCB)
760 OI.PostOutlineCB(*OutlinedFn);
761 }
762
763 // Remove work items that have been completed.
764 OutlineInfos = std::move(DeferredOutlines);
765
766 // The createTarget functions embeds user written code into
767 // the target region which may inject allocas which need to
768 // be moved to the entry block of our target or risk malformed
769 // optimisations by later passes, this is only relevant for
770 // the device pass which appears to be a little more delicate
771 // when it comes to optimisations (however, we do not block on
772 // that here, it's up to the inserter to the list to do so).
773 // This notably has to occur after the OutlinedInfo candidates
774 // have been extracted so we have an end product that will not
775 // be implicitly adversely affected by any raises unless
776 // intentionally appended to the list.
777 // NOTE: This only does so for ConstantData, it could be extended
778 // to ConstantExpr's with further effort, however, they should
779 // largely be folded when they get here. Extending it to runtime
780 // defined/read+writeable allocation sizes would be non-trivial
781 // (need to factor in movement of any stores to variables the
782 // allocation size depends on, as well as the usual loads,
783 // otherwise it'll yield the wrong result after movement) and
784 // likely be more suitable as an LLVM optimisation pass.
787
788 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
789 [](EmitMetadataErrorKind Kind,
790 const TargetRegionEntryInfo &EntryInfo) -> void {
791 errs() << "Error of kind: " << Kind
792 << " when emitting offload entries and metadata during "
793 "OMPIRBuilder finalization \n";
794 };
795
798
799 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
800 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
801 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
802 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
803 }
804}
805
807 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
808}
809
812 auto *GV =
813 new GlobalVariable(M, I32Ty,
814 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
815 ConstantInt::get(I32Ty, Value), Name);
816 GV->setVisibility(GlobalValue::HiddenVisibility);
817
818 return GV;
819}
820
822 uint32_t SrcLocStrSize,
823 IdentFlag LocFlags,
824 unsigned Reserve2Flags) {
825 // Enable "C-mode".
826 LocFlags |= OMP_IDENT_FLAG_KMPC;
827
828 Constant *&Ident =
829 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
830 if (!Ident) {
832 Constant *IdentData[] = {I32Null,
833 ConstantInt::get(Int32, uint32_t(LocFlags)),
834 ConstantInt::get(Int32, Reserve2Flags),
835 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
836 Constant *Initializer =
837 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
838
839 // Look for existing encoding of the location + flags, not needed but
840 // minimizes the difference to the existing solution while we transition.
841 for (GlobalVariable &GV : M.globals())
842 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
843 if (GV.getInitializer() == Initializer)
844 Ident = &GV;
845
846 if (!Ident) {
847 auto *GV = new GlobalVariable(
848 M, OpenMPIRBuilder::Ident,
849 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
852 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
853 GV->setAlignment(Align(8));
854 Ident = GV;
855 }
856 }
857
859}
860
862 uint32_t &SrcLocStrSize) {
863 SrcLocStrSize = LocStr.size();
864 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
865 if (!SrcLocStr) {
866 Constant *Initializer =
868
869 // Look for existing encoding of the location, not needed but minimizes the
870 // difference to the existing solution while we transition.
871 for (GlobalVariable &GV : M.globals())
872 if (GV.isConstant() && GV.hasInitializer() &&
873 GV.getInitializer() == Initializer)
874 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
875
876 SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
877 /* AddressSpace */ 0, &M);
878 }
879 return SrcLocStr;
880}
881
883 StringRef FileName,
884 unsigned Line, unsigned Column,
885 uint32_t &SrcLocStrSize) {
886 SmallString<128> Buffer;
887 Buffer.push_back(';');
888 Buffer.append(FileName);
889 Buffer.push_back(';');
890 Buffer.append(FunctionName);
891 Buffer.push_back(';');
892 Buffer.append(std::to_string(Line));
893 Buffer.push_back(';');
894 Buffer.append(std::to_string(Column));
895 Buffer.push_back(';');
896 Buffer.push_back(';');
897 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
898}
899
900Constant *
902 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
903 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
904}
905
907 uint32_t &SrcLocStrSize,
908 Function *F) {
909 DILocation *DIL = DL.get();
910 if (!DIL)
911 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
912 StringRef FileName = M.getName();
913 if (DIFile *DIF = DIL->getFile())
914 if (std::optional<StringRef> Source = DIF->getSource())
915 FileName = *Source;
916 StringRef Function = DIL->getScope()->getSubprogram()->getName();
917 if (Function.empty() && F)
918 Function = F->getName();
919 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
920 DIL->getColumn(), SrcLocStrSize);
921}
922
924 uint32_t &SrcLocStrSize) {
925 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
926 Loc.IP.getBlock()->getParent());
927}
928
// NOTE(review): the signature line of this function (doxygen line 929,
// presumably `Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident)`)
// is not visible in this view -- confirm against the upstream file.
// Emits a call to __kmpc_global_thread_num with the given source-location
// ident and returns the resulting value, named "omp_global_thread_num".
930 return Builder.CreateCall(
931 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
932 "omp_global_thread_num");
933}
934
// Emits an OpenMP barrier: either __kmpc_barrier or, inside a cancellable
// parallel region, __kmpc_cancel_barrier whose result may be checked for a
// pending cancellation.
// NOTE(review): declaration lines (doxygen 935-936) are missing from this
// excerpt; the visible parameters are Loc, Kind, ForceSimpleCall and
// CheckCancelFlag.
 937 bool ForceSimpleCall, bool CheckCancelFlag) {
 938 if (!updateToLocation(Loc))
 939 return Loc.IP;
 940
 941 // Build call __kmpc_cancel_barrier(loc, thread_id) or
 942 // __kmpc_barrier(loc, thread_id);
 943
// Encode which construct this barrier is implicit to (for/sections/single),
// or mark it explicit; anything else is a generic implicit barrier.
 944 IdentFlag BarrierLocFlags;
 945 switch (Kind) {
 946 case OMPD_for:
 947 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
 948 break;
 949 case OMPD_sections:
 950 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
 951 break;
 952 case OMPD_single:
 953 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
 954 break;
 955 case OMPD_barrier:
 956 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
 957 break;
 958 default:
 959 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
 960 break;
 961 }
 962
// Arguments: ident with the barrier flags, and the thread id obtained via a
// second (flag-less) ident. NOTE(review): two getOrCreateIdent calls are made
// deliberately here - the thread-id lookup uses the unflagged ident.
 963 uint32_t SrcLocStrSize;
 964 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 965 Value *Args[] = {
 966 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
 967 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
 968
 969 // If we are in a cancellable parallel region, barriers are cancellation
 970 // points.
 971 // TODO: Check why we would force simple calls or to ignore the cancel flag.
 972 bool UseCancelBarrier =
 973 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
 974
// NOTE(review): doxygen line 976 (the Builder.CreateCall of the selected
// runtime function) is missing from this excerpt.
 975 Value *Result =
 977 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
 978 : OMPRTL___kmpc_barrier),
 979 Args);
 980
// A cancel barrier returns a flag; branch to finalization if cancellation
// was observed (shared logic in emitCancelationCheckImpl).
 981 if (UseCancelBarrier && CheckCancelFlag)
 982 emitCancelationCheckImpl(Result, OMPD_parallel);
 983
 984 return Builder.saveIP();
 985}
986
// Emits an OpenMP `cancel` construct: an optionally-guarded call to
// __kmpc_cancel followed by the shared cancellation-check control flow.
// NOTE(review): declaration lines (doxygen 987-988) are missing from this
// excerpt; visible parameters are Loc, IfCondition and CanceledDirective.
 989 Value *IfCondition,
 990 omp::Directive CanceledDirective) {
 991 if (!updateToLocation(Loc))
 992 return Loc.IP;
 993
 994 // LLVM utilities like blocks with terminators.
// Temporary terminator so the block can be split; removed at the end.
 995 auto *UI = Builder.CreateUnreachable();
 996
// With an `if` clause, only the then-branch performs the cancellation.
 997 Instruction *ThenTI = UI, *ElseTI = nullptr;
 998 if (IfCondition)
 999 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
 1000 Builder.SetInsertPoint(ThenTI);
 1001
// Map the canceled directive to the runtime's integer cancel kind via the
// table in OMPKinds.def.
 1002 Value *CancelKind = nullptr;
 1003 switch (CanceledDirective) {
 1004#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
 1005 case DirectiveEnum: \
 1006 CancelKind = Builder.getInt32(Value); \
 1007 break;
 1008#include "llvm/Frontend/OpenMP/OMPKinds.def"
 1009 default:
 1010 llvm_unreachable("Unknown cancel kind!");
 1011 }
 1012
 1013 uint32_t SrcLocStrSize;
 1014 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1015 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 1016 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
 1017 Value *Result = Builder.CreateCall(
 1018 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
// On cancellation of a parallel region, threads must still rendezvous:
// emit a simple (non-cancellable, unchecked) barrier from the exit callback.
// NOTE(review): doxygen lines 1021/1023 (callback body head and the
// createBarrier call line) are missing from this excerpt.
 1019 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
 1020 if (CanceledDirective == OMPD_parallel) {
 1022 Builder.restoreIP(IP);
 1024 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
 1025 /* CheckCancelFlag */ false);
 1026 }
 1027 };
 1028
 1029 // The actual cancel logic is shared with others, e.g., cancel_barriers.
 1030 emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);
 1031
 1032 // Update the insertion point and remove the terminator we introduced.
 1033 Builder.SetInsertPoint(UI->getParent());
 1034 UI->eraseFromParent();
 1035
 1036 return Builder.saveIP();
 1037}
1038
// Materializes the kernel-argument struct on the stack and emits the
// __tgt_target_kernel runtime call; the call's result is returned via the
// `Return` out-parameter.
// NOTE(review): declaration line (doxygen 1039) missing from this excerpt.
 1040 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
 1041 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
 1042 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
 1043 if (!updateToLocation(Loc))
 1044 return Loc.IP;
 1045
// The argument struct is allocated at the function-level alloca point, but
// filled at the call site.
 1046 Builder.restoreIP(AllocaIP);
 1047 auto *KernelArgsPtr =
 1048 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
 1049 Builder.restoreIP(Loc.IP);
 1050
// Store each provided kernel argument into its struct field.
// NOTE(review): doxygen line 1054 (the store emission preceding these two
// argument lines) is missing from this excerpt.
 1051 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
 1052 llvm::Value *Arg =
 1053 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
 1055 KernelArgs[I], Arg,
 1056 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
 1057 }
 1058
 1059 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
 1060 NumThreads, HostPtr, KernelArgsPtr};
 1061
 1062 Return = Builder.CreateCall(
 1063 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
 1064 OffloadingArgs);
 1065
 1066 return Builder.saveIP();
 1067}
1068
// Emits the full target-kernel launch sequence: pack kernel args, call the
// offloading runtime, and on failure branch to a host fallback produced by
// `emitTargetCallFallbackCB`.
// NOTE(review): declaration line (doxygen 1069) missing from this excerpt.
 1070 const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
 1071 EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
 1072 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
 1073
 1074 if (!updateToLocation(Loc))
 1075 return Loc.IP;
 1076
 1077 Builder.restoreIP(Loc.IP);
 1078 // On top of the arrays that were filled up, the target offloading call
 1079 // takes as arguments the device id as well as the host pointer. The host
 1080 // pointer is used by the runtime library to identify the current target
 1081 // region, so it only has to be unique and not necessarily point to
 1082 // anything. It could be the pointer to the outlined function that
 1083 // implements the target region, but we aren't using that so that the
 1084 // compiler doesn't need to keep that, and could therefore inline the host
 1085 // function if proven worthwhile during optimization.
 1086
 1087 // From this point on, we need to have an ID of the target region defined.
 1088 assert(OutlinedFnID && "Invalid outlined function ID!");
 1089 (void)OutlinedFnID;
 1090
 1091 // Return value of the runtime offloading call.
 1092 Value *Return = nullptr;
 1093
 1094 // Arguments for the target kernel.
 1095 SmallVector<Value *> ArgsVector;
 1096 getKernelArgsVector(Args, Builder, ArgsVector);
 1097
 1098 // The target region is an outlined function launched by the runtime
 1099 // via calls to __tgt_target_kernel().
 1100 //
 1101 // Note that on the host and CPU targets, the runtime implementation of
 1102 // these calls simply call the outlined function without forking threads.
 1103 // The outlined functions themselves have runtime calls to
 1104 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
 1105 // the compiler in emitTeamsCall() and emitParallelCall().
 1106 //
 1107 // In contrast, on the NVPTX target, the implementation of
 1108 // __tgt_target_teams() launches a GPU kernel with the requested number
 1109 // of teams and threads so no additional calls to the runtime are required.
 1110 // Check the error code and execute the host version if required.
 1111 Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
 1112 Args.NumTeams, Args.NumThreads,
 1113 OutlinedFnID, ArgsVector));
 1114
// Branch on the runtime's result: nonzero means offload failed and the host
// fallback must run. NOTE(review): doxygen line 1119, which computes the
// `Failed` predicate from `Return`, is missing from this excerpt.
 1115 BasicBlock *OffloadFailedBlock =
 1116 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
 1117 BasicBlock *OffloadContBlock =
 1118 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
 1120 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
 1121
// Failed path: emit the host fallback, then rejoin the continuation block.
 1122 auto CurFn = Builder.GetInsertBlock()->getParent();
 1123 emitBlock(OffloadFailedBlock, CurFn);
 1124 Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
 1125 emitBranch(OffloadContBlock);
 1126 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
 1127 return Builder.saveIP();
 1128}
1129
// Shared cancellation-check lowering: splits control flow on `CancelFlag`,
// runs the optional ExitCB plus the current finalization callback on the
// cancellation path, and continues codegen on the non-cancel path.
// NOTE(review): declaration line (doxygen 1130) missing from this excerpt.
 1131 omp::Directive CanceledDirective,
 1132 FinalizeCallbackTy ExitCB) {
 1133 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
 1134 "Unexpected cancellation!");
 1135
 1136 // For a cancel barrier we create two new blocks.
// NOTE(review): doxygen line 1137 (which binds `BB`, presumably the current
// insert block) is missing from this excerpt - confirm against upstream.
 1138 BasicBlock *NonCancellationBlock;
 1139 if (Builder.GetInsertPoint() == BB->end()) {
 1140 // TODO: This branch will not be needed once we moved to the
 1141 // OpenMPIRBuilder codegen completely.
 1142 NonCancellationBlock = BasicBlock::Create(
 1143 BB->getContext(), BB->getName() + ".cont", BB->getParent());
 1144 } else {
// Mid-block insertion point: split so the remainder becomes the
// continuation. NOTE(review): doxygen lines 1146-1147 missing here.
 1145 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
 1148 }
 1149 BasicBlock *CancellationBlock = BasicBlock::Create(
 1150 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
 1151
 1152 // Jump to them based on the return value.
// CancelFlag == 0 means "not cancelled" -> continue normally.
 1153 Value *Cmp = Builder.CreateIsNull(CancelFlag);
 1154 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
 1155 /* TODO weight */ nullptr, nullptr);
 1156
 1157 // From the cancellation block we finalize all variables and go to the
 1158 // post finalization block that is known to the FiniCB callback.
 1159 Builder.SetInsertPoint(CancellationBlock);
 1160 if (ExitCB)
 1161 ExitCB(Builder.saveIP());
 1162 auto &FI = FinalizationStack.back();
 1163 FI.FiniCB(Builder.saveIP());
 1164
 1165 // The continuation block is where code generation continues.
 1166 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
 1167}
1168
 1169// Callback used to create OpenMP runtime calls to support
 1170// omp parallel clause for the device.
 1171// We need to use this callback to replace call to the OutlinedFn in OuterFn
 1172// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
// NOTE(review): the `static void targetParallelCallback(` declaration line
// (doxygen 1173) is missing from this excerpt.
 1174 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
 1175 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
 1176 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
 1177 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
 1178 // Add some known attributes.
// Params 0/1 are the tid / bound-tid pointers; mark them noalias+noundef.
 1179 IRBuilder<> &Builder = OMPIRBuilder->Builder;
 1180 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
 1181 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
 1182 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
 1183 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
 1184 OutlinedFn.addFnAttr(Attribute::NoUnwind);
 1185
 1186 assert(OutlinedFn.arg_size() >= 2 &&
 1187 "Expected at least tid and bounded tid as arguments");
 1188 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
 1189
// The outliner left exactly one call to OutlinedFn; it is replaced below.
 1190 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
 1191 assert(CI && "Expected call instruction to outlined function");
 1192 CI->getParent()->setName("omp_parallel");
 1193
 1194 Builder.SetInsertPoint(CI);
 1195 Type *PtrTy = OMPIRBuilder->VoidPtr;
 1196 Value *NullPtrValue = Constant::getNullValue(PtrTy);
 1197
 1198 // Add alloca for kernel args
 1199 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
 1200 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
 1201 AllocaInst *ArgsAlloca =
 1202 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
 1203 Value *Args = ArgsAlloca;
 1204 // Add address space cast if array for storing arguments is not allocated
 1205 // in address space 0
 1206 if (ArgsAlloca->getAddressSpace())
 1207 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
 1208 Builder.restoreIP(CurrentIP);
 1209
 1210 // Store captured vars which are used by kmpc_parallel_51
// Captured values start after the two tid arguments of the stale call.
 1211 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
 1212 Value *V = *(CI->arg_begin() + 2 + Idx);
 1213 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
 1214 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
 1215 Builder.CreateStore(V, StoreAddress);
 1216 }
 1217
// `if` clause lowered to an i32 condition; absent clause means "always".
 1218 Value *Cond =
 1219 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
 1220 : Builder.getInt32(1);
 1221
 1222 // Build kmpc_parallel_51 call
// -1 encodes "unspecified" for num_threads and proc_bind.
 1223 Value *Parallel51CallArgs[] = {
 1224 /* identifier*/ Ident,
 1225 /* global thread num*/ ThreadID,
 1226 /* if expression */ Cond,
 1227 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
 1228 /* Proc bind */ Builder.getInt32(-1),
 1229 /* outlined function */
 1230 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
 1231 /* wrapper function */ NullPtrValue,
 1232 /* arguments of the outlined funciton*/ Args,
 1233 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
 1234
 1235 FunctionCallee RTLFn =
 1236 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
 1237
 1238 Builder.CreateCall(RTLFn, Parallel51CallArgs);
 1239
 1240 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
 1241 << *Builder.GetInsertBlock()->getParent() << "\n");
 1242
 1243 // Initialize the local TID stack location with the argument value.
 1244 Builder.SetInsertPoint(PrivTID);
 1245 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
 1246 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
 1247 PrivTIDAddr);
 1248
 1249 // Remove redundant call to the outlined function.
 1250 CI->eraseFromParent();
 1251
// Drop the modeling-only instructions recorded by the caller.
 1252 for (Instruction *I : ToBeDeleted) {
 1253 I->eraseFromParent();
 1254 }
 1255}
1256
 1257// Callback used to create OpenMP runtime calls to support
 1258// omp parallel clause for the host.
 1259// We need to use this callback to replace call to the OutlinedFn in OuterFn
 1260// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
 1261static void
// NOTE(review): the function-name/first-parameter line (doxygen 1262) is
// missing from this excerpt.
 1263 Function *OuterFn, Value *Ident, Value *IfCondition,
 1264 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
 1265 const SmallVector<Instruction *, 4> &ToBeDeleted) {
 1266 IRBuilder<> &Builder = OMPIRBuilder->Builder;
// Pick the conditional variant when an `if` clause is present.
 1267 FunctionCallee RTLFn;
 1268 if (IfCondition) {
 1269 RTLFn =
 1270 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
 1271 } else {
 1272 RTLFn =
 1273 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
 1274 }
// Attach !callback metadata (once) so interprocedural analyses understand
// that the runtime invokes the microtask argument.
 1275 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
 1276 if (!F->hasMetadata(LLVMContext::MD_callback)) {
 1277 LLVMContext &Ctx = F->getContext();
 1278 MDBuilder MDB(Ctx);
 1279 // Annotate the callback behavior of the __kmpc_fork_call:
 1280 // - The callback callee is argument number 2 (microtask).
 1281 // - The first two arguments of the callback callee are unknown (-1).
 1282 // - All variadic arguments to the __kmpc_fork_call are passed to the
 1283 // callback callee.
// NOTE(review): doxygen line 1285 (MDNode/createCallbackEncoding call) is
// missing from this excerpt.
 1284 F->addMetadata(LLVMContext::MD_callback,
 1286 2, {-1, -1},
 1287 /* VarArgsArePassed */ true)}));
 1288 }
 1289 }
 1290 // Add some known attributes.
 1291 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
 1292 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
 1293 OutlinedFn.addFnAttr(Attribute::NoUnwind);
 1294
 1295 assert(OutlinedFn.arg_size() >= 2 &&
 1296 "Expected at least tid and bounded tid as arguments");
 1297 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
 1298
// The outliner's single stale call site is replaced by the fork call.
 1299 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
 1300 CI->getParent()->setName("omp_parallel");
 1301 Builder.SetInsertPoint(CI);
 1302
 1303 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
 1304 Value *ForkCallArgs[] = {
 1305 Ident, Builder.getInt32(NumCapturedVars),
 1306 Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};
 1307
 1308 SmallVector<Value *, 16> RealArgs;
 1309 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
 1310 if (IfCondition) {
 1311 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
 1312 RealArgs.push_back(Cond);
 1313 }
 1314 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
 1315
 1316 // __kmpc_fork_call_if always expects a void ptr as the last argument
 1317 // If there are no arguments, pass a null pointer.
 1318 auto PtrTy = OMPIRBuilder->VoidPtr;
 1319 if (IfCondition && NumCapturedVars == 0) {
 1320 Value *NullPtrValue = Constant::getNullValue(PtrTy);
 1321 RealArgs.push_back(NullPtrValue);
 1322 }
// Normalize the trailing capture argument to void* for the _if variant.
 1323 if (IfCondition && RealArgs.back()->getType() != PtrTy)
 1324 RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);
 1325
 1326 Builder.CreateCall(RTLFn, RealArgs);
 1327
 1328 LLVM_DEBUG(dbgs() << "With fork_call placed: "
 1329 << *Builder.GetInsertBlock()->getParent() << "\n");
 1330
 1331 // Initialize the local TID stack location with the argument value.
 1332 Builder.SetInsertPoint(PrivTID);
 1333 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
 1334 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
 1335 PrivTIDAddr);
 1336
 1337 // Remove redundant call to the outlined function.
 1338 CI->eraseFromParent();
 1339
// Drop the modeling-only instructions recorded by the caller.
 1340 for (Instruction *I : ToBeDeleted) {
 1341 I->eraseFromParent();
 1342 }
 1343}
1344
// Lowers an OpenMP `parallel` region: emits the num_threads/proc_bind
// runtime pushes, models the region with split blocks, privatizes captured
// inputs, and registers an outline-info whose post-outline callback swaps
// the stale outlined call for a fork/parallel_51 runtime call.
// NOTE(review): the declaration line (doxygen 1345) and several interior
// hyperlinked lines are missing from this excerpt; notes mark each gap.
 1346 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
 1347 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
 1348 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
 1349 omp::ProcBindKind ProcBind, bool IsCancellable) {
 1350 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
 1351
 1352 if (!updateToLocation(Loc))
 1353 return Loc.IP;
 1354
 1355 uint32_t SrcLocStrSize;
 1356 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1357 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 1358 Value *ThreadID = getOrCreateThreadID(Ident);
 1359 // If we generate code for the target device, we need to allocate
 1360 // struct for aggregate params in the device default alloca address space.
 1361 // OpenMP runtime requires that the params of the extracted functions are
 1362 // passed as zero address space pointers. This flag ensures that extracted
 1363 // function arguments are declared in zero address space
 1364 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
 1365
 1366 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
 1367 // only if we compile for host side.
// NOTE(review): doxygen line 1372 (the Builder.CreateCall line) is missing
// from this excerpt.
 1368 if (NumThreads && !Config.isTargetDevice()) {
 1369 Value *Args[] = {
 1370 Ident, ThreadID,
 1371 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
 1373 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
 1374 }
 1375
// NOTE(review): doxygen line 1381 (the Builder.CreateCall line) is missing
// from this excerpt.
 1376 if (ProcBind != OMP_PROC_BIND_default) {
 1377 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
 1378 Value *Args[] = {
 1379 Ident, ThreadID,
 1380 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
 1382 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
 1383 }
 1384
 1385 BasicBlock *InsertBB = Builder.GetInsertBlock();
 1386 Function *OuterFn = InsertBB->getParent();
 1387
 1388 // Save the outer alloca block because the insertion iterator may get
 1389 // invalidated and we still need this later.
 1390 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
 1391
 1392 // Vector to remember instructions we used only during the modeling but which
 1393 // we want to delete at the end.
// NOTE(review): doxygen line 1394 (the ToBeDeleted container declaration) is
// missing from this excerpt.
 1395
 1396 // Change the location to the outer alloca insertion point to create and
 1397 // initialize the allocas we pass into the parallel region.
 1398 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
 1399 Builder.restoreIP(NewOuter);
 1400 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
 1401 AllocaInst *ZeroAddrAlloca =
 1402 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
 1403 Instruction *TIDAddr = TIDAddrAlloca;
 1404 Instruction *ZeroAddr = ZeroAddrAlloca;
 1405 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
 1406 // Add additional casts to enforce pointers in zero address space
 1407 TIDAddr = new AddrSpaceCastInst(
 1408 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
 1409 TIDAddr->insertAfter(TIDAddrAlloca);
 1410 ToBeDeleted.push_back(TIDAddr);
 1411 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
 1412 PointerType ::get(M.getContext(), 0),
 1413 "zero.addr.ascast");
 1414 ZeroAddr->insertAfter(ZeroAddrAlloca);
 1415 ToBeDeleted.push_back(ZeroAddr);
 1416 }
 1417
 1418 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
 1419 // associated arguments in the outlined function, so we delete them later.
 1420 ToBeDeleted.push_back(TIDAddrAlloca);
 1421 ToBeDeleted.push_back(ZeroAddrAlloca);
 1422
 1423 // Create an artificial insertion point that will also ensure the blocks we
 1424 // are about to split are not degenerated.
 1425 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
 1426
// Split the region into entry / body / pre-finalize / exit blocks.
 1427 BasicBlock *EntryBB = UI->getParent();
 1428 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
 1429 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
 1430 BasicBlock *PRegPreFiniBB =
 1431 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
 1432 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
 1433
// NOTE(review): doxygen line 1438 (inside the open-ended-block branch) is
// missing from this excerpt.
 1434 auto FiniCBWrapper = [&](InsertPointTy IP) {
 1435 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
 1436 // target to the region exit block.
 1437 if (IP.getBlock()->end() == IP.getPoint()) {
 1439 Builder.restoreIP(IP);
 1440 Instruction *I = Builder.CreateBr(PRegExitBB);
 1441 IP = InsertPointTy(I->getParent(), I->getIterator());
 1442 }
 1443 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
 1444 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
 1445 "Unexpected insertion point for finalization call!");
 1446 return FiniCB(IP);
 1447 };
 1448
 1449 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
 1450
 1451 // Generate the privatization allocas in the block that will become the entry
 1452 // of the outlined function.
 1453 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
 1454 InsertPointTy InnerAllocaIP = Builder.saveIP();
 1455
 1456 AllocaInst *PrivTIDAddr =
 1457 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
 1458 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
 1459
 1460 // Add some fake uses for OpenMP provided arguments.
 1461 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
 1462 Instruction *ZeroAddrUse =
 1463 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
 1464 ToBeDeleted.push_back(ZeroAddrUse);
 1465
 1466 // EntryBB
 1467 // |
 1468 // V
 1469 // PRegionEntryBB <- Privatization allocas are placed here.
 1470 // |
 1471 // V
 1472 // PRegionBodyBB <- BodeGen is invoked here.
 1473 // |
 1474 // V
 1475 // PRegPreFiniBB <- The block we will start finalization from.
 1476 // |
 1477 // V
 1478 // PRegionExitBB <- A common exit to simplify block collection.
 1479 //
 1480
 1481 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
 1482
 1483 // Let the caller create the body.
 1484 assert(BodyGenCB && "Expected body generation callback!");
 1485 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
 1486 BodyGenCB(InnerAllocaIP, CodeGenIP);
 1487
 1488 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
 1489
// Defer the runtime-call rewrite to after outlining; device and host use
// different callbacks (kmpc_parallel_51 vs __kmpc_fork_call[_if]).
 1490 OutlineInfo OI;
 1491 if (Config.isTargetDevice()) {
 1492 // Generate OpenMP target specific runtime call
 1493 OI.PostOutlineCB = [=, ToBeDeletedVec =
 1494 std::move(ToBeDeleted)](Function &OutlinedFn) {
 1495 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
 1496 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
 1497 ThreadID, ToBeDeletedVec);
 1498 };
 1499 } else {
 1500 // Generate OpenMP host runtime call
 1501 OI.PostOutlineCB = [=, ToBeDeletedVec =
 1502 std::move(ToBeDeleted)](Function &OutlinedFn) {
 1503 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
 1504 PrivTID, PrivTIDAddr, ToBeDeletedVec);
 1505 };
 1506 }
 1507
 1508 OI.OuterAllocaBB = OuterAllocaBlock;
 1509 OI.EntryBB = PRegEntryBB;
 1510 OI.ExitBB = PRegExitBB;
 1511
// NOTE(review): doxygen line 1513 (the Blocks vector declaration) is missing
// from this excerpt.
 1512 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
 1514 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
 1515
 1516 // Ensure a single exit node for the outlined region by creating one.
 1517 // We might have multiple incoming edges to the exit now due to finalizations,
 1518 // e.g., cancel calls that cause the control flow to leave the region.
 1519 BasicBlock *PRegOutlinedExitBB = PRegExitBB;
 1520 PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
 1521 PRegOutlinedExitBB->setName("omp.par.outlined.exit");
 1522 Blocks.push_back(PRegOutlinedExitBB);
 1523
 1524 CodeExtractorAnalysisCache CEAC(*OuterFn);
 1525 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
 1526 /* AggregateArgs */ false,
 1527 /* BlockFrequencyInfo */ nullptr,
 1528 /* BranchProbabilityInfo */ nullptr,
 1529 /* AssumptionCache */ nullptr,
 1530 /* AllowVarArgs */ true,
 1531 /* AllowAlloca */ true,
 1532 /* AllocationBlock */ OuterAllocaBlock,
 1533 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
 1534
 1535 // Find inputs to, outputs from the code region.
 1536 BasicBlock *CommonExit = nullptr;
 1537 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
 1538 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
 1539 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
 1540
 1541 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
 1542
 1543 FunctionCallee TIDRTLFn =
 1544 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
 1545
// Privatization helper, run for every captured input below.
// NOTE(review): doxygen lines 1552 (Uses set declaration), 1566 and 1589/1590
// context are missing from this excerpt.
 1546 auto PrivHelper = [&](Value &V) {
// The modeling-only tid/zero pointers stay out of any argument aggregate.
 1547 if (&V == TIDAddr || &V == ZeroAddr) {
 1548 OI.ExcludeArgsFromAggregate.push_back(&V);
 1549 return;
 1550 }
 1551
// Collect only the uses inside the parallel region; they get retargeted to
// the replacement value at the end.
 1553 for (Use &U : V.uses())
 1554 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
 1555 if (ParallelRegionBlockSet.count(UserI->getParent()))
 1556 Uses.insert(&U);
 1557
 1558 // __kmpc_fork_call expects extra arguments as pointers. If the input
 1559 // already has a pointer type, everything is fine. Otherwise, store the
 1560 // value onto stack and load it back inside the to-be-outlined region. This
 1561 // will ensure only the pointer will be passed to the function.
 1562 // FIXME: if there are more than 15 trailing arguments, they must be
 1563 // additionally packed in a struct.
 1564 Value *Inner = &V;
 1565 if (!V.getType()->isPointerTy()) {
 1567 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
 1568
 1569 Builder.restoreIP(OuterAllocaIP);
 1570 Value *Ptr =
 1571 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
 1572
 1573 // Store to stack at end of the block that currently branches to the entry
 1574 // block of the to-be-outlined region.
 1575 Builder.SetInsertPoint(InsertBB,
 1576 InsertBB->getTerminator()->getIterator());
 1577 Builder.CreateStore(&V, Ptr);
 1578
 1579 // Load back next to allocations in the to-be-outlined region.
 1580 Builder.restoreIP(InnerAllocaIP);
 1581 Inner = Builder.CreateLoad(V.getType(), Ptr);
 1582 }
 1583
// The thread-id runtime call is replaced by the privatized tid load;
// everything else goes through the caller-supplied PrivCB.
 1584 Value *ReplacementValue = nullptr;
 1585 CallInst *CI = dyn_cast<CallInst>(&V);
 1586 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
 1587 ReplacementValue = PrivTID;
 1588 } else {
 1590 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
 1591 InnerAllocaIP = {
 1592 InnerAllocaIP.getBlock(),
 1593 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
 1594
 1595 assert(ReplacementValue &&
 1596 "Expected copy/create callback to set replacement value!");
 1597 if (ReplacementValue == &V)
 1598 return;
 1599 }
 1600
 1601 for (Use *UPtr : Uses)
 1602 UPtr->set(ReplacementValue);
 1603 };
 1604
 1605 // Reset the inner alloca insertion as it will be used for loading the values
 1606 // wrapped into pointers before passing them into the to-be-outlined region.
 1607 // Configure it to insert immediately after the fake use of zero address so
 1608 // that they are available in the generated body and so that the
 1609 // OpenMP-related values (thread ID and zero address pointers) remain leading
 1610 // in the argument list.
 1611 InnerAllocaIP = IRBuilder<>::InsertPoint(
 1612 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
 1613
 1614 // Reset the outer alloca insertion point to the entry of the relevant block
 1615 // in case it was invalidated.
 1616 OuterAllocaIP = IRBuilder<>::InsertPoint(
 1617 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
 1618
 1619 for (Value *Input : Inputs) {
 1620 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
 1621 PrivHelper(*Input);
 1622 }
 1623 LLVM_DEBUG({
 1624 for (Value *Output : Outputs)
 1625 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
 1626 });
 1627 assert(Outputs.empty() &&
 1628 "OpenMP outlining should not produce live-out values!");
 1629
 1630 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
 1631 LLVM_DEBUG({
 1632 for (auto *BB : Blocks)
 1633 dbgs() << " PBR: " << BB->getName() << "\n";
 1634 });
 1635
 1636 // Adjust the finalization stack, verify the adjustment, and call the
 1637 // finalize function a last time to finalize values between the pre-fini
 1638 // block and the exit block if we left the parallel "the normal way".
 1639 auto FiniInfo = FinalizationStack.pop_back_val();
 1640 (void)FiniInfo;
 1641 assert(FiniInfo.DK == OMPD_parallel &&
 1642 "Unexpected finalization stack state!");
 1643
 1644 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
 1645
 1646 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
 1647 FiniCB(PreFiniIP);
 1648
 1649 // Register the outlined info.
 1650 addOutlineInfo(std::move(OI));
 1651
// Continue code generation after the (now removed) artificial terminator.
 1652 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
 1653 UI->eraseFromParent();
 1654
 1655 return AfterIP;
 1656}
1657
// Emits the __kmpc_flush(ident) runtime call at the current insertion point.
// NOTE(review): declaration line (doxygen 1658) missing from this excerpt.
 1659 // Build call void __kmpc_flush(ident_t *loc)
 1660 uint32_t SrcLocStrSize;
 1661 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1662 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
 1663
 1664 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
 1665}
1666
// Public entry for `omp flush`: positions the builder at Loc, then emits the
// flush (no-op if the location is invalid).
// NOTE(review): declaration line (doxygen 1667) missing from this excerpt.
 1668 if (!updateToLocation(Loc))
 1669 return;
 1670 emitFlush(Loc);
 1671}
1672
// Emits the __kmpc_omp_taskwait(ident, gtid) runtime call.
// NOTE(review): declaration line (doxygen 1673) missing from this excerpt.
 1674 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
 1675 // global_tid);
 1676 uint32_t SrcLocStrSize;
 1677 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1678 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 1679 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
 1680
 1681 // Ignore return result until untied tasks are supported.
 1682 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
 1683 Args);
 1684}
1685
// Public entry for `omp taskwait`: positions the builder at Loc, then emits
// the taskwait call (no-op if the location is invalid).
// NOTE(review): declaration line (doxygen 1686) missing from this excerpt.
 1687 if (!updateToLocation(Loc))
 1688 return;
 1689 emitTaskwaitImpl(Loc);
 1690}
1691
// Emits __kmpc_omp_taskyield(ident, gtid, 0); the trailing 0 is the
// `end_part` argument per the comment below.
// NOTE(review): declaration line (doxygen 1692) and line 1697 (which defines
// the I32Null constant used below) are missing from this excerpt.
 1693 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
 1694 uint32_t SrcLocStrSize;
 1695 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1696 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 1698 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
 1699
 1700 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
 1701 Args);
 1702}
1703
// Public entry for `omp taskyield`: positions the builder at Loc, then emits
// the taskyield call (no-op if the location is invalid).
// NOTE(review): declaration line (doxygen 1704) missing from this excerpt.
 1705 if (!updateToLocation(Loc))
 1706 return;
 1707 emitTaskyieldImpl(Loc);
 1708}
1709
1712 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1713 bool Tied, Value *Final, Value *IfCondition,
1714 SmallVector<DependData> Dependencies) {
1715
1716 if (!updateToLocation(Loc))
1717 return InsertPointTy();
1718
1719 uint32_t SrcLocStrSize;
1720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1722 // The current basic block is split into four basic blocks. After outlining,
1723 // they will be mapped as follows:
1724 // ```
1725 // def current_fn() {
1726 // current_basic_block:
1727 // br label %task.exit
1728 // task.exit:
1729 // ; instructions after task
1730 // }
1731 // def outlined_fn() {
1732 // task.alloca:
1733 // br label %task.body
1734 // task.body:
1735 // ret void
1736 // }
1737 // ```
1738 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1739 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1740 BasicBlock *TaskAllocaBB =
1741 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1742
1743 InsertPointTy TaskAllocaIP =
1744 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1745 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1746 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1747
1748 OutlineInfo OI;
1749 OI.EntryBB = TaskAllocaBB;
1750 OI.OuterAllocaBB = AllocaIP.getBlock();
1751 OI.ExitBB = TaskExitBB;
1752
1753 // Add the thread ID argument.
1754 std::stack<Instruction *> ToBeDeleted;
1756 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1757
1758 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1759 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1760 // Replace the Stale CI by appropriate RTL function call.
1761 assert(OutlinedFn.getNumUses() == 1 &&
1762 "there must be a single user for the outlined function");
1763 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1764
1765 // HasShareds is true if any variables are captured in the outlined region,
1766 // false otherwise.
1767 bool HasShareds = StaleCI->arg_size() > 1;
1768 Builder.SetInsertPoint(StaleCI);
1769
1770 // Gather the arguments for emitting the runtime call for
1771 // @__kmpc_omp_task_alloc
1772 Function *TaskAllocFn =
1773 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1774
1775 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
1776 // call.
1777 Value *ThreadID = getOrCreateThreadID(Ident);
1778
1779 // Argument - `flags`
1780 // Task is tied iff (Flags & 1) == 1.
1781 // Task is untied iff (Flags & 1) == 0.
1782 // Task is final iff (Flags & 2) == 2.
1783 // Task is not final iff (Flags & 2) == 0.
1784 // TODO: Handle the other flags.
1785 Value *Flags = Builder.getInt32(Tied);
1786 if (Final) {
1787 Value *FinalFlag =
1789 Flags = Builder.CreateOr(FinalFlag, Flags);
1790 }
1791
1792 // Argument - `sizeof_kmp_task_t` (TaskSize)
1793 // Tasksize refers to the size in bytes of kmp_task_t data structure
1794 // including private vars accessed in task.
1795 // TODO: add kmp_task_t_with_privates (privates)
1796 Value *TaskSize = Builder.getInt64(
1798
1799 // Argument - `sizeof_shareds` (SharedsSize)
1800 // SharedsSize refers to the shareds array size in the kmp_task_t data
1801 // structure.
1802 Value *SharedsSize = Builder.getInt64(0);
1803 if (HasShareds) {
1804 AllocaInst *ArgStructAlloca =
1805 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1806 assert(ArgStructAlloca &&
1807 "Unable to find the alloca instruction corresponding to arguments "
1808 "for extracted function");
1809 StructType *ArgStructType =
1810 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1811 assert(ArgStructType && "Unable to find struct type corresponding to "
1812 "arguments for extracted function");
1813 SharedsSize =
1815 }
1816 // Emit the @__kmpc_omp_task_alloc runtime call
1817 // The runtime call returns a pointer to an area where the task captured
1818 // variables must be copied before the task is run (TaskData)
1819 CallInst *TaskData = Builder.CreateCall(
1820 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1821 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1822 /*task_func=*/&OutlinedFn});
1823
1824 // Copy the arguments for outlined function
1825 if (HasShareds) {
1826 Value *Shareds = StaleCI->getArgOperand(1);
1827 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1828 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1829 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1830 SharedsSize);
1831 }
1832
1833 Value *DepArray = nullptr;
1834 if (Dependencies.size()) {
1835 InsertPointTy OldIP = Builder.saveIP();
1837 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1838
1839 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1840 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1841
1842 unsigned P = 0;
1843 for (const DependData &Dep : Dependencies) {
1844 Value *Base =
1845 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1846 // Store the pointer to the variable
1848 DependInfo, Base,
1849 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1850 Value *DepValPtr =
1852 Builder.CreateStore(DepValPtr, Addr);
1853 // Store the size of the variable
1855 DependInfo, Base,
1856 static_cast<unsigned int>(RTLDependInfoFields::Len));
1858 Dep.DepValueType)),
1859 Size);
1860 // Store the dependency kind
1862 DependInfo, Base,
1863 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1865 ConstantInt::get(Builder.getInt8Ty(),
1866 static_cast<unsigned int>(Dep.DepKind)),
1867 Flags);
1868 ++P;
1869 }
1870
1871 Builder.restoreIP(OldIP);
1872 }
1873
1874 // In the presence of the `if` clause, the following IR is generated:
1875 // ...
1876 // %data = call @__kmpc_omp_task_alloc(...)
1877 // br i1 %if_condition, label %then, label %else
1878 // then:
1879 // call @__kmpc_omp_task(...)
1880 // br label %exit
1881 // else:
1882 // ;; Wait for resolution of dependencies, if any, before
1883 // ;; beginning the task
1884 // call @__kmpc_omp_wait_deps(...)
1885 // call @__kmpc_omp_task_begin_if0(...)
1886 // call @outlined_fn(...)
1887 // call @__kmpc_omp_task_complete_if0(...)
1888 // br label %exit
1889 // exit:
1890 // ...
1891 if (IfCondition) {
1892 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1893 // terminator.
1894 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1895 Instruction *IfTerminator =
1896 Builder.GetInsertPoint()->getParent()->getTerminator();
1897 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1898 Builder.SetInsertPoint(IfTerminator);
1899 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1900 &ElseTI);
1901 Builder.SetInsertPoint(ElseTI);
1902
1903 if (Dependencies.size()) {
1904 Function *TaskWaitFn =
1905 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1907 TaskWaitFn,
1908 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1909 ConstantInt::get(Builder.getInt32Ty(), 0),
1911 }
1912 Function *TaskBeginFn =
1913 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1914 Function *TaskCompleteFn =
1915 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1916 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1917 CallInst *CI = nullptr;
1918 if (HasShareds)
1919 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1920 else
1921 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1922 CI->setDebugLoc(StaleCI->getDebugLoc());
1923 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1924 Builder.SetInsertPoint(ThenTI);
1925 }
1926
1927 if (Dependencies.size()) {
1928 Function *TaskFn =
1929 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
1931 TaskFn,
1932 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
1933 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
1935
1936 } else {
1937 // Emit the @__kmpc_omp_task runtime call to spawn the task
1938 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
1939 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
1940 }
1941
1942 StaleCI->eraseFromParent();
1943
1944 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
1945 if (HasShareds) {
1946 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
1947 OutlinedFn.getArg(1)->replaceUsesWithIf(
1948 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
1949 }
1950
1951 while (!ToBeDeleted.empty()) {
1952 ToBeDeleted.top()->eraseFromParent();
1953 ToBeDeleted.pop();
1954 }
1955 };
1956
1957 addOutlineInfo(std::move(OI));
1958 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
1959
1960 return Builder.saveIP();
1961}
1962
// Generate a 'taskgroup' region: emits the __kmpc_taskgroup runtime call
// before the user-supplied body and __kmpc_end_taskgroup after it, then
// leaves the insert point at the end of the taskgroup's exit block.
// NOTE(review): the declarator (return type and qualified function name) of
// this definition is elided in this excerpt; presumably it is
// OpenMPIRBuilder::createTaskgroup -- confirm against the full file.
1965 InsertPointTy AllocaIP,
1966 BodyGenCallbackTy BodyGenCB) {
// An unusable location means there is nothing to emit; return a
// default-constructed (invalid) insert point.
1967 if (!updateToLocation(Loc))
1968 return InsertPointTy();
1969
// Build the source-location ident and the current thread id; both are
// passed to each kmpc runtime entry point below.
1970 uint32_t SrcLocStrSize;
1971 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1972 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1973 Value *ThreadID = getOrCreateThreadID(Ident);
1974
1975 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
1976 Function *TaskgroupFn =
1977 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
1978 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
1979
// Split off the exit block first, then let the callback emit the taskgroup
// body at the current position (between the two runtime calls).
1980 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
1981 BodyGenCB(AllocaIP, Builder.saveIP());
1982
1983 Builder.SetInsertPoint(TaskgroupExitBB);
1984 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
1985 Function *EndTaskgroupFn =
1986 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
1987 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
1988
1989 return Builder.saveIP();
1990}
1991
// Lower an OpenMP 'sections' construct: each section callback becomes one
// case of a switch driven by a canonical loop over [0, NumSections), which
// is then lowered as a statically-scheduled workshare loop so the sections
// are distributed across the team.
// NOTE(review): the declarator of this definition and a few interior lines
// (original lines 1992, 1994, 2011, 2041, 2048, 2066) are elided in this
// excerpt; the code below is left untouched -- confirm against the full file.
1993 const LocationDescription &Loc, InsertPointTy AllocaIP,
1995 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
1996 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
1997
1998 if (!updateToLocation(Loc))
1999 return Loc.IP;
2000
// Wrapper around the user's finalization callback: when the insert point is
// at a block end (the cancellation path), first emit a branch to the exit
// block so the finalization block keeps a terminator.
2001 auto FiniCBWrapper = [&](InsertPointTy IP) {
2002 if (IP.getBlock()->end() != IP.getPoint())
2003 return FiniCB(IP);
2004 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2005 // will fail because that function requires the Finalization Basic Block to
2006 // have a terminator, which is already removed by EmitOMPRegionBody.
2007 // IP is currently at cancelation block.
2008 // We need to backtrack to the condition block to fetch
2009 // the exit block and create a branch from cancelation
2010 // to exit block.
2012 Builder.restoreIP(IP);
2013 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2014 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2015 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2016 Instruction *I = Builder.CreateBr(ExitBB);
2017 IP = InsertPointTy(I->getParent(), I->getIterator());
2018 return FiniCB(IP);
2019 };
2020
2021 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2022
2023 // Each section is emitted as a switch case
2024 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2025 // -> OMP.createSection() which generates the IR for each section
2026 // Iterate through all sections and emit a switch construct:
2027 // switch (IV) {
2028 // case 0:
2029 // <SectionStmt[0]>;
2030 // break;
2031 // ...
2032 // case <NumSection> - 1:
2033 // <SectionStmt[<NumSection> - 1]>;
2034 // break;
2035 // }
2036 // ...
2037 // section_loop.after:
2038 // <FiniCB>;
2039 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2040 Builder.restoreIP(CodeGenIP);
2042 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2043 Function *CurFn = Continue->getParent();
2044 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2045
// One switch case per registered section; each case body is produced by the
// corresponding section callback and ends with a branch to the continuation.
2046 unsigned CaseNumber = 0;
2047 for (auto SectionCB : SectionCBs) {
2049 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2050 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2051 Builder.SetInsertPoint(CaseBB);
2052 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2053 SectionCB(InsertPointTy(),
2054 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2055 CaseNumber++;
2056 }
2057 // remove the existing terminator from body BB since there can be no
2058 // terminators after switch/case
2059 };
2060 // Loop body ends here
2061 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2062 Type *I32Ty = Type::getInt32Ty(M.getContext());
2063 Value *LB = ConstantInt::get(I32Ty, 0);
2064 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2065 Value *ST = ConstantInt::get(I32Ty, 1);
2067 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
// A non-nowait 'sections' needs the implicit barrier that
// applyStaticWorkshareLoop emits when asked for one.
2068 InsertPointTy AfterIP =
2069 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2070
2071 // Apply the finalization callback in LoopAfterBB
2072 auto FiniInfo = FinalizationStack.pop_back_val();
2073 assert(FiniInfo.DK == OMPD_sections &&
2074 "Unexpected finalization stack state!");
2075 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2076 Builder.restoreIP(AfterIP);
2077 BasicBlock *FiniBB =
2078 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2079 CB(Builder.saveIP());
2080 AfterIP = {FiniBB, FiniBB->begin()};
2081 }
2082
2083 return AfterIP;
2084}
2085
// Lower a single 'section' inside a 'sections' construct as an inlined
// OpenMP region with a finalization callback.
// NOTE(review): the declarator of this definition and original line 2103 are
// elided in this excerpt; presumably this is OpenMPIRBuilder::createSection
// -- confirm against the full file.
2088 BodyGenCallbackTy BodyGenCB,
2089 FinalizeCallbackTy FiniCB) {
2090 if (!updateToLocation(Loc))
2091 return Loc.IP;
2092
// Same cancellation-path fixup as in the 'sections' lowering: if the insert
// point sits at a block end, branch to the exit block first so finalization
// runs in a block that has a terminator.
2093 auto FiniCBWrapper = [&](InsertPointTy IP) {
2094 if (IP.getBlock()->end() != IP.getPoint())
2095 return FiniCB(IP);
2096 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2097 // will fail because that function requires the Finalization Basic Block to
2098 // have a terminator, which is already removed by EmitOMPRegionBody.
2099 // IP is currently at cancelation block.
2100 // We need to backtrack to the condition block to fetch
2101 // the exit block and create a branch from cancelation
2102 // to exit block.
2104 Builder.restoreIP(IP);
2105 auto *CaseBB = Loc.IP.getBlock();
2106 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2107 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2108 Instruction *I = Builder.CreateBr(ExitBB);
2109 IP = InsertPointTy(I->getParent(), I->getIterator());
2110 return FiniCB(IP);
2111 };
2112
2113 Directive OMPD = Directive::OMPD_sections;
2114 // Since we are using Finalization Callback here, HasFinalize
2115 // and IsCancellable have to be true
2116 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2117 /*Conditional*/ false, /*hasFinalize*/ true,
2118 /*IsCancellable*/ true);
2119}
2120
// Return an insert point positioned immediately after instruction I.
// NOTE(review): the signature and the iterator-initialization line of this
// helper are elided in this excerpt; the visible tail increments an iterator
// (presumably constructed from I) and wraps it into an InsertPointTy --
// confirm against the full file.
2123 IT++;
2124 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2125}
2126
// Emit a "used"-style global (e.g. llvm.used / llvm.compiler.used, selected
// by Name) holding pointer casts of the given values, so they are not
// discarded by the optimizer/linker. The array uses appending linkage and
// lives in the "llvm.metadata" section.
// NOTE(review): original lines 2133 and 2136 (the declaration of UsedArray
// and the start of the cast expression) are elided in this excerpt.
2127void OpenMPIRBuilder::emitUsed(StringRef Name,
2128 std::vector<WeakTrackingVH> &List) {
// Nothing to record.
2129 if (List.empty())
2130 return;
2131
2132 // Convert List to what ConstantArray needs.
2134 UsedArray.resize(List.size());
2135 for (unsigned I = 0, E = List.size(); I != E; ++I)
2137 cast<Constant>(&*List[I]), Builder.getPtrTy());
2138
2139 if (UsedArray.empty())
2140 return;
2141 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2142
// Appending linkage lets multiple TUs contribute entries to the same array.
2143 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2144 ConstantArray::get(ATy, UsedArray), Name);
2145
2146 GV->setSection("llvm.metadata");
2147}
2148
// Emit a call to the device runtime returning the hardware thread id of the
// current thread within its block.
// NOTE(review): original line 2151 (the start of the
// getOrCreateRuntimeFunction call) is elided in this excerpt.
2149Value *OpenMPIRBuilder::getGPUThreadID() {
2150 return Builder.CreateCall(
2152 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2153 {});
2154}
2155
2156Value *OpenMPIRBuilder::getGPUWarpSize() {
2157 return Builder.CreateCall(
2158 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2159}
2160
2161Value *OpenMPIRBuilder::getNVPTXWarpID() {
2162 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2163 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2164}
2165
2166Value *OpenMPIRBuilder::getNVPTXLaneID() {
2167 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2168 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2169 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2170 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2171 "nvptx_lane_id");
2172}
2173
// Convert From to ToType: a no-op for identical types, a bitcast for
// same-sized types, an integer cast for integer pairs, and otherwise a
// store/load round trip through a temporary alloca (created at AllocaIP).
// NOTE(review): original line 2192 (the start of the ValCastItem pointer
// cast) is elided in this excerpt.
2174Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2175 Type *ToType) {
2176 Type *FromType = From->getType();
2177 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2178 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2179 assert(FromSize > 0 && "From size must be greater than zero");
2180 assert(ToSize > 0 && "To size must be greater than zero");
// Fast paths, cheapest first: identity, same-size bitcast, int-to-int cast.
2181 if (FromType == ToType)
2182 return From;
2183 if (FromSize == ToSize)
2184 return Builder.CreateBitCast(From, ToType);
2185 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2186 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
// General case: materialize a temporary of the destination type in the
// entry/alloca block, then store the source value and reload it as ToType.
2187 InsertPointTy SaveIP = Builder.saveIP();
2188 Builder.restoreIP(AllocaIP);
2189 Value *CastItem = Builder.CreateAlloca(ToType);
2190 Builder.restoreIP(SaveIP);
2191
2193 CastItem, FromType->getPointerTo());
2194 Builder.CreateStore(From, ValCastItem);
2195 return Builder.CreateLoad(ToType, CastItem);
2196}
2197
// Emit a call to the 32- or 64-bit kmpc shuffle routine (chosen by element
// size) to exchange Element with a lane at the given Offset; the value is
// first widened/cast to the matching integer type and the result cast back.
// NOTE(review): original line 2210 (the start of the ShuffleFunc lookup) is
// elided in this excerpt.
2198Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2199 Value *Element,
2200 Type *ElementType,
2201 Value *Offset) {
2202 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2203 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2204
2205 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2206 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2207 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
// The runtime expects the warp size as a signed 16-bit value.
2208 Value *WarpSize =
2209 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2211 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2212 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2213 Value *WarpSizeCast =
2214 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2215 Value *ShuffleCall =
2216 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2217 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2218}
2219
// Shuffle a (possibly large) element from SrcAddr to DstAddr across lanes:
// the element is processed in progressively smaller integer chunks
// (8, 4, 2, 1 bytes); multi-chunk remainders use an emitted loop over the
// chunked data, single chunks are shuffled and stored directly.
// NOTE(review): several original lines are elided in this excerpt (e.g.
// 2223 Size computation, 2235, 2242, 2246, 2251, 2269-2270); the code below
// is left untouched -- confirm against the full file.
2220void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2221 Value *DstAddr, Type *ElemType,
2222 Value *Offset, Type *ReductionArrayTy) {
2224 // Create the loop over the big sized data.
2225 // ptr = (void*)Elem;
2226 // ptrEnd = (void*) Elem + 1;
2227 // Step = 8;
2228 // while (ptr + Step < ptrEnd)
2229 // shuffle((int64_t)*ptr);
2230 // Step = 4;
2231 // while (ptr + Step < ptrEnd)
2232 // shuffle((int32_t)*ptr);
2233 // ...
2234 Type *IndexTy = Builder.getIndexTy(
2236 Value *ElemPtr = DstAddr;
2237 Value *Ptr = SrcAddr;
// Walk chunk widths 8 -> 4 -> 2 -> 1 bytes, consuming the remaining Size.
2238 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2239 if (Size < IntSize)
2240 continue;
2241 Type *IntType = Builder.getIntNTy(IntSize * 8);
2243 Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
2244 Value *SrcAddrGEP =
2245 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2247 ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
2248
2249 Function *CurFunc = Builder.GetInsertBlock()->getParent();
// More than one chunk of this width: emit a pre-cond/then/exit loop with
// PHIs advancing the source and destination pointers per iteration.
2250 if ((Size / IntSize) > 1) {
2252 SrcAddrGEP, Builder.getPtrTy());
2253 BasicBlock *PreCondBB =
2254 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2255 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2256 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2257 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2258 emitBlock(PreCondBB, CurFunc);
2259 PHINode *PhiSrc =
2260 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2261 PhiSrc->addIncoming(Ptr, CurrentBB);
2262 PHINode *PhiDest =
2263 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2264 PhiDest->addIncoming(ElemPtr, CurrentBB);
2265 Ptr = PhiSrc;
2266 ElemPtr = PhiDest;
// Loop while at least one full chunk of bytes remains before the end.
2267 Value *PtrDiff = Builder.CreatePtrDiff(
2268 Builder.getInt8Ty(), PtrEnd,
2271 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2272 ExitBB);
2273 emitBlock(ThenBB, CurFunc);
2274 Value *Res = createRuntimeShuffleFunction(
2275 AllocaIP,
2277 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2278 IntType, Offset);
2279 Builder.CreateAlignedStore(Res, ElemPtr,
2280 M.getDataLayout().getPrefTypeAlign(ElemType));
2281 Value *LocalPtr =
2282 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2283 Value *LocalElemPtr =
2284 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2285 PhiSrc->addIncoming(LocalPtr, ThenBB);
2286 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2287 emitBranch(PreCondBB);
2288 emitBlock(ExitBB, CurFunc);
2289 } else {
// Exactly one chunk of this width: shuffle, truncate if the element is a
// narrower integer, store, and advance both pointers.
2290 Value *Res = createRuntimeShuffleFunction(
2291 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2292 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2293 Res->getType()->getScalarSizeInBits())
2294 Res = Builder.CreateTrunc(Res, ElemType);
2295 Builder.CreateStore(Res, ElemPtr);
2296 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2297 ElemPtr =
2298 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2299 }
2300 Size = Size % IntSize;
2301 }
2302}
2303
// Copy a reduce list element-by-element from SrcBase to DestBase. Depending
// on Action, elements are either shuffled in from a remote lane into a fresh
// temporary (whose address then replaces the dest-list pointer) or copied
// directly into the existing destination storage, with per-kind handling for
// scalar, complex, and aggregate elements.
// NOTE(review): several original lines are elided in this excerpt,
// including the switch case labels for Action (lines 2337, 2353) and parts
// of GEP/memcpy expressions (2374, 2378, 2383, 2385, 2394, 2409); the code
// below is left untouched -- confirm against the full file.
2304void OpenMPIRBuilder::emitReductionListCopy(
2305 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2306 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2307 CopyOptionsTy CopyOptions) {
2308 Type *IndexTy = Builder.getIndexTy(
2310 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2311
2312 // Iterates, element-by-element, through the source Reduce list and
2313 // make a copy.
2314 for (auto En : enumerate(ReductionInfos)) {
2315 const ReductionInfo &RI = En.value();
2316 Value *SrcElementAddr = nullptr;
2317 Value *DestElementAddr = nullptr;
2318 Value *DestElementPtrAddr = nullptr;
2319 // Should we shuffle in an element from a remote lane?
2320 bool ShuffleInElement = false;
2321 // Set to true to update the pointer in the dest Reduce list to a
2322 // newly created element.
2323 bool UpdateDestListPtr = false;
2324
2325 // Step 1.1: Get the address for the src element in the Reduce list.
2326 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2327 ReductionArrayTy, SrcBase,
2328 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2329 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2330
2331 // Step 1.2: Create a temporary to store the element in the destination
2332 // Reduce list.
2333 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2334 ReductionArrayTy, DestBase,
2335 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2336 switch (Action) {
// (elided case label) -- this arm allocates a fresh, properly aligned
// temporary in the alloca block and arranges for shuffle + list update.
2338 InsertPointTy CurIP = Builder.saveIP();
2339 Builder.restoreIP(AllocaIP);
2340 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2341 ".omp.reduction.element");
2342 DestAlloca->setAlignment(
2343 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2344 DestElementAddr = DestAlloca;
2345 DestElementAddr =
2346 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2347 DestElementAddr->getName() + ".ascast");
2348 Builder.restoreIP(CurIP);
2349 ShuffleInElement = true;
2350 UpdateDestListPtr = true;
2351 break;
2352 }
// (elided case label) -- this arm reuses the existing destination slot.
2354 DestElementAddr =
2355 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2356 break;
2357 }
2358 }
2359
2360 // Now that all active lanes have read the element in the
2361 // Reduce list, shuffle over the value from the remote lane.
2362 if (ShuffleInElement) {
2363 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2364 RemoteLaneOffset, ReductionArrayTy);
2365 } else {
2366 switch (RI.EvaluationKind) {
2367 case EvalKind::Scalar: {
2368 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2369 // Store the source element value to the dest element address.
2370 Builder.CreateStore(Elem, DestElementAddr);
2371 break;
2372 }
// Complex values are copied as separate real/imag struct fields.
2373 case EvalKind::Complex: {
2375 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2376 Value *SrcReal = Builder.CreateLoad(
2377 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2379 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2380 Value *SrcImg = Builder.CreateLoad(
2381 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2382
2384 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2386 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2387 Builder.CreateStore(SrcReal, DestRealPtr);
2388 Builder.CreateStore(SrcImg, DestImgPtr);
2389 break;
2390 }
// Aggregates are copied bytewise with the element's store size.
2391 case EvalKind::Aggregate: {
2392 Value *SizeVal = Builder.getInt64(
2393 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2395 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2396 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2397 SizeVal, false);
2398 break;
2399 }
2400 };
2401 }
2402
2403 // Step 3.1: Modify reference in dest Reduce list as needed.
2404 // Modifying the reference in Reduce list to point to the newly
2405 // created element. The element is live in the current function
2406 // scope and that of functions it invokes (i.e., reduce_function).
2407 // RemoteReduceData[i] = (void*)&RemoteElem
2408 if (UpdateDestListPtr) {
2410 DestElementAddr, Builder.getPtrTy(),
2411 DestElementAddr->getName() + ".ascast");
2412 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2413 }
2414 }
2415}
2416
// Build the inter-warp copy helper used by GPU reductions: warp masters
// publish their partial results, one sub-word chunk at a time, through a
// shared-memory transfer array, and the first warp reads them back, with
// kmpc barriers separating the write and read phases.
// NOTE(review): several original lines are elided in this excerpt (e.g.
// 2422, 2426, 2470-2471, 2479, 2481, 2488, 2520, 2529, 2532, 2547, 2553,
// 2576, 2580, 2640); the code below is left untouched -- confirm against
// the full file.
2417Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
2418 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2419 AttributeList FuncAttrs) {
2420 InsertPointTy SavedIP = Builder.saveIP();
2421 LLVMContext &Ctx = M.getContext();
2423 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2424 /* IsVarArg */ false);
2425 Function *WcFunc =
2427 "_omp_reduction_inter_warp_copy_func", &M);
2428 WcFunc->setAttributes(FuncAttrs);
2429 WcFunc->addParamAttr(0, Attribute::NoUndef);
2430 WcFunc->addParamAttr(1, Attribute::NoUndef);
2431 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2432 Builder.SetInsertPoint(EntryBB);
2433
2434 // ReduceList: thread local Reduce list.
2435 // At the stage of the computation when this function is called, partially
2436 // aggregated values reside in the first lane of every active warp.
2437 Argument *ReduceListArg = WcFunc->getArg(0);
2438 // NumWarps: number of warps active in the parallel region. This could
2439 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2440 Argument *NumWarpsArg = WcFunc->getArg(1);
2441
2442 // This array is used as a medium to transfer, one reduce element at a time,
2443 // the data from the first lane of every warp to lanes in the first warp
2444 // in order to perform the final step of a reduction in a parallel region
2445 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2446 // for reduced latency, as well as to have a distinct copy for concurrently
2447 // executing target regions. The array is declared with common linkage so
2448 // as to be shared across compilation units.
2449 StringRef TransferMediumName =
2450 "__openmp_nvptx_data_transfer_temporary_storage";
2451 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2452 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2453 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
// Lazily create the shared transfer array (address space 3 = GPU shared
// memory) the first time any reduction in the module needs it.
2454 if (!TransferMedium) {
2455 TransferMedium = new GlobalVariable(
2456 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2457 UndefValue::get(ArrayTy), TransferMediumName,
2458 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2459 /*AddressSpace=*/3);
2460 }
2461
2462 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2463 Value *GPUThreadID = getGPUThreadID();
2464 // nvptx_lane_id = nvptx_id % warpsize
2465 Value *LaneID = getNVPTXLaneID();
2466 // nvptx_warp_id = nvptx_id / warpsize
2467 Value *WarpID = getNVPTXWarpID();
2468
// Spill both arguments to allocas (address-space cast for generic access)
// so the rest of the body can load them uniformly.
2469 InsertPointTy AllocaIP =
2472 Type *Arg0Type = ReduceListArg->getType();
2473 Type *Arg1Type = NumWarpsArg->getType();
2474 Builder.restoreIP(AllocaIP);
2475 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2476 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2477 AllocaInst *NumWarpsAlloca =
2478 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2480 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2482 NumWarpsAlloca, Arg1Type->getPointerTo(),
2483 NumWarpsAlloca->getName() + ".ascast");
2484 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2485 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2486 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2487 InsertPointTy CodeGenIP =
2489 Builder.restoreIP(CodeGenIP);
2490
2491 Value *ReduceList =
2492 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2493
2494 for (auto En : enumerate(ReductionInfos)) {
2495 //
2496 // Warp master copies reduce element to transfer medium in __shared__
2497 // memory.
2498 //
2499 const ReductionInfo &RI = En.value();
2500 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
// Transfer the element in chunks of 4, 2, then 1 bytes until fully copied.
2501 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2502 Type *CType = Builder.getIntNTy(TySize * 8);
2503
2504 unsigned NumIters = RealTySize / TySize;
2505 if (NumIters == 0)
2506 continue;
2507 Value *Cnt = nullptr;
2508 Value *CntAddr = nullptr;
2509 BasicBlock *PrecondBB = nullptr;
2510 BasicBlock *ExitBB = nullptr;
// Multiple chunks of this width: emit a counter-driven loop around the
// publish/consume sequence below.
2511 if (NumIters > 1) {
2512 CodeGenIP = Builder.saveIP();
2513 Builder.restoreIP(AllocaIP);
2514 CntAddr =
2515 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2516
2517 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2518 CntAddr->getName() + ".ascast");
2519 Builder.restoreIP(CodeGenIP);
2521 CntAddr,
2522 /*Volatile=*/false);
2523 PrecondBB = BasicBlock::Create(Ctx, "precond");
2524 ExitBB = BasicBlock::Create(Ctx, "exit");
2525 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2526 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2527 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2528 /*Volatile=*/false);
2530 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2531 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2533 }
2534
// Barrier before the write phase so every warp's partial value is ready.
2535 // kmpc_barrier.
2536 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2537 omp::Directive::OMPD_unknown,
2538 /* ForceSimpleCall */ false,
2539 /* CheckCancelFlag */ true);
2540 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2541 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2542 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2543
2544 // if (lane_id == 0)
2545 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2546 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2548
2549 // Reduce element = LocalReduceList[i]
2550 auto *RedListArrayTy =
2551 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2552 Type *IndexTy = Builder.getIndexTy(
2554 Value *ElemPtrPtr =
2555 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2556 {ConstantInt::get(IndexTy, 0),
2557 ConstantInt::get(IndexTy, En.index())});
2558 // elemptr = ((CopyType*)(elemptrptr)) + I
2559 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2560 if (NumIters > 1)
2561 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2562
2563 // Get pointer to location in transfer medium.
2564 // MediumPtr = &medium[warp_id]
2565 Value *MediumPtr = Builder.CreateInBoundsGEP(
2566 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2567 // elem = *elemptr
2568 //*MediumPtr = elem
2569 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2570 // Store the source element value to the dest element address.
// Volatile store: the transfer array is shared across warps and must not
// be optimized away or reordered.
2571 Builder.CreateStore(Elem, MediumPtr,
2572 /*IsVolatile*/ true);
2573 Builder.CreateBr(MergeBB);
2574
2575 // else
2577 Builder.CreateBr(MergeBB);
2578
2579 // endif
// Barrier between the write phase and the read phase below.
2581 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2582 omp::Directive::OMPD_unknown,
2583 /* ForceSimpleCall */ false,
2584 /* CheckCancelFlag */ true);
2585
2586 // Warp 0 copies reduce element from transfer medium
2587 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2588 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2589 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2590
2591 Value *NumWarpsVal =
2592 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2593 // Up to 32 threads in warp 0 are active.
2594 Value *IsActiveThread =
2595 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2596 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2597
2598 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2599
2600 // SecMediumPtr = &medium[tid]
2601 // SrcMediumVal = *SrcMediumPtr
2602 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2603 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2604 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2605 Value *TargetElemPtrPtr =
2606 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2607 {ConstantInt::get(IndexTy, 0),
2608 ConstantInt::get(IndexTy, En.index())});
2609 Value *TargetElemPtrVal =
2610 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2611 Value *TargetElemPtr = TargetElemPtrVal;
2612 if (NumIters > 1)
2613 TargetElemPtr =
2614 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2615
2616 // *TargetElemPtr = SrcMediumVal;
2617 Value *SrcMediumValue =
2618 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2619 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2620 Builder.CreateBr(W0MergeBB);
2621
2622 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2623 Builder.CreateBr(W0MergeBB);
2624
2625 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2626
// Close the chunk loop: bump the counter and branch back to the precond.
2627 if (NumIters > 1) {
2628 Cnt = Builder.CreateNSWAdd(
2629 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2630 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2631
2632 auto *CurFn = Builder.GetInsertBlock()->getParent();
2633 emitBranch(PrecondBB);
2634 emitBlock(ExitBB, CurFn);
2635 }
2636 RealTySize %= TySize;
2637 }
2638 }
2639
2641 Builder.restoreIP(SavedIP);
2642
2643 return WcFunc;
2644}
2645
2646Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2647 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2648 AttributeList FuncAttrs) {
2649 LLVMContext &Ctx = M.getContext();
2650 FunctionType *FuncTy =
2652 {Builder.getPtrTy(), Builder.getInt16Ty(),
2653 Builder.getInt16Ty(), Builder.getInt16Ty()},
2654 /* IsVarArg */ false);
2655 Function *SarFunc =
2657 "_omp_reduction_shuffle_and_reduce_func", &M);
2658 SarFunc->setAttributes(FuncAttrs);
2659 SarFunc->addParamAttr(0, Attribute::NoUndef);
2660 SarFunc->addParamAttr(1, Attribute::NoUndef);
2661 SarFunc->addParamAttr(2, Attribute::NoUndef);
2662 SarFunc->addParamAttr(3, Attribute::NoUndef);
2663 SarFunc->addParamAttr(1, Attribute::SExt);
2664 SarFunc->addParamAttr(2, Attribute::SExt);
2665 SarFunc->addParamAttr(3, Attribute::SExt);
2666 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2667 Builder.SetInsertPoint(EntryBB);
2668
2669 // Thread local Reduce list used to host the values of data to be reduced.
2670 Argument *ReduceListArg = SarFunc->getArg(0);
2671 // Current lane id; could be logical.
2672 Argument *LaneIDArg = SarFunc->getArg(1);
2673 // Offset of the remote source lane relative to the current lane.
2674 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2675 // Algorithm version. This is expected to be known at compile time.
2676 Argument *AlgoVerArg = SarFunc->getArg(3);
2677
2678 Type *ReduceListArgType = ReduceListArg->getType();
2679 Type *LaneIDArgType = LaneIDArg->getType();
2680 Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
2681 Value *ReduceListAlloca = Builder.CreateAlloca(
2682 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2683 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2684 LaneIDArg->getName() + ".addr");
2685 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2686 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2687 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2688 AlgoVerArg->getName() + ".addr");
2689 ArrayType *RedListArrayTy =
2690 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2691
2692 // Create a local thread-private variable to host the Reduce list
2693 // from a remote lane.
2694 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2695 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2696
2698 ReduceListAlloca, ReduceListArgType,
2699 ReduceListAlloca->getName() + ".ascast");
2701 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2702 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2703 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2704 RemoteLaneOffsetAlloca->getName() + ".ascast");
2706 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2708 RemoteReductionListAlloca, Builder.getPtrTy(),
2709 RemoteReductionListAlloca->getName() + ".ascast");
2710
2711 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2712 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2713 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2714 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2715
2716 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2717 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2718 Value *RemoteLaneOffset =
2719 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2720 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2721
2722 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2723
2724 // This loop iterates through the list of reduce elements and copies,
2725 // element by element, from a remote lane in the warp to RemoteReduceList,
2726 // hosted on the thread's stack.
2727 emitReductionListCopy(
2728 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2729 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2730
2731 // The actions to be performed on the Remote Reduce list is dependent
2732 // on the algorithm version.
2733 //
2734 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2735 // LaneId % 2 == 0 && Offset > 0):
2736 // do the reduction value aggregation
2737 //
2738 // The thread local variable Reduce list is mutated in place to host the
2739 // reduced data, which is the aggregated value produced from local and
2740 // remote lanes.
2741 //
2742 // Note that AlgoVer is expected to be a constant integer known at compile
2743 // time.
2744 // When AlgoVer==0, the first conjunction evaluates to true, making
2745 // the entire predicate true during compile time.
2746 // When AlgoVer==1, the second conjunction has only the second part to be
2747 // evaluated during runtime. Other conjunctions evaluates to false
2748 // during compile time.
2749 // When AlgoVer==2, the third conjunction has only the second part to be
2750 // evaluated during runtime. Other conjunctions evaluates to false
2751 // during compile time.
2752 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2753 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2754 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2755 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2756 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2757 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2758 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2759 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2760 Value *RemoteOffsetComp =
2761 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2762 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2763 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2764 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2765
2766 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2767 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2768 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2769
2770 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2773 ReduceList, Builder.getPtrTy());
2774 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2775 RemoteListAddrCast, Builder.getPtrTy());
2776 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2777 ->addFnAttr(Attribute::NoUnwind);
2778 Builder.CreateBr(MergeBB);
2779
2781 Builder.CreateBr(MergeBB);
2782
2784
2785 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2786 // Reduce list.
2787 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2788 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2789 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2790
2791 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2792 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2793 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2794 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2795
2796 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2797 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2798 ReductionInfos, RemoteListAddrCast, ReduceList);
2799 Builder.CreateBr(CpyMergeBB);
2800
2801 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2802 Builder.CreateBr(CpyMergeBB);
2803
2804 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2805
2807
2808 return SarFunc;
2809}
2810
2811Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2812 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2813 AttributeList FuncAttrs) {
2815 LLVMContext &Ctx = M.getContext();
2818 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2819 /* IsVarArg */ false);
2820 Function *LtGCFunc =
2822 "_omp_reduction_list_to_global_copy_func", &M);
2823 LtGCFunc->setAttributes(FuncAttrs);
2824 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2825 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2826 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2827
2828 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2829 Builder.SetInsertPoint(EntryBlock);
2830
2831 // Buffer: global reduction buffer.
2832 Argument *BufferArg = LtGCFunc->getArg(0);
2833 // Idx: index of the buffer.
2834 Argument *IdxArg = LtGCFunc->getArg(1);
2835 // ReduceList: thread local Reduce list.
2836 Argument *ReduceListArg = LtGCFunc->getArg(2);
2837
2838 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2839 BufferArg->getName() + ".addr");
2840 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2841 IdxArg->getName() + ".addr");
2842 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2843 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2845 BufferArgAlloca, Builder.getPtrTy(),
2846 BufferArgAlloca->getName() + ".ascast");
2848 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2849 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2850 ReduceListArgAlloca, Builder.getPtrTy(),
2851 ReduceListArgAlloca->getName() + ".ascast");
2852
2853 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2854 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2855 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2856
2857 Value *LocalReduceList =
2858 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2859 Value *BufferArgVal =
2860 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2861 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2862 Type *IndexTy = Builder.getIndexTy(
2864 for (auto En : enumerate(ReductionInfos)) {
2865 const ReductionInfo &RI = En.value();
2866 auto *RedListArrayTy =
2867 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2868 // Reduce element = LocalReduceList[i]
2869 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
2870 RedListArrayTy, LocalReduceList,
2871 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2872 // elemptr = ((CopyType*)(elemptrptr)) + I
2873 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2874
2875 // Global = Buffer.VD[Idx];
2876 Value *BufferVD =
2877 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
2879 ReductionsBufferTy, BufferVD, 0, En.index());
2880
2881 switch (RI.EvaluationKind) {
2882 case EvalKind::Scalar: {
2883 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
2884 Builder.CreateStore(TargetElement, GlobVal);
2885 break;
2886 }
2887 case EvalKind::Complex: {
2889 RI.ElementType, ElemPtr, 0, 0, ".realp");
2890 Value *SrcReal = Builder.CreateLoad(
2891 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2893 RI.ElementType, ElemPtr, 0, 1, ".imagp");
2894 Value *SrcImg = Builder.CreateLoad(
2895 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2896
2898 RI.ElementType, GlobVal, 0, 0, ".realp");
2900 RI.ElementType, GlobVal, 0, 1, ".imagp");
2901 Builder.CreateStore(SrcReal, DestRealPtr);
2902 Builder.CreateStore(SrcImg, DestImgPtr);
2903 break;
2904 }
2905 case EvalKind::Aggregate: {
2906 Value *SizeVal =
2907 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
2909 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
2910 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
2911 break;
2912 }
2913 }
2914 }
2915
2917 Builder.restoreIP(OldIP);
2918 return LtGCFunc;
2919}
2920
2921Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
2922 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2923 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
2925 LLVMContext &Ctx = M.getContext();
2928 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2929 /* IsVarArg */ false);
2930 Function *LtGRFunc =
2932 "_omp_reduction_list_to_global_reduce_func", &M);
2933 LtGRFunc->setAttributes(FuncAttrs);
2934 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
2935 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
2936 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
2937
2938 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
2939 Builder.SetInsertPoint(EntryBlock);
2940
2941 // Buffer: global reduction buffer.
2942 Argument *BufferArg = LtGRFunc->getArg(0);
2943 // Idx: index of the buffer.
2944 Argument *IdxArg = LtGRFunc->getArg(1);
2945 // ReduceList: thread local Reduce list.
2946 Argument *ReduceListArg = LtGRFunc->getArg(2);
2947
2948 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2949 BufferArg->getName() + ".addr");
2950 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2951 IdxArg->getName() + ".addr");
2952 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2953 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2954 auto *RedListArrayTy =
2955 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2956
2957 // 1. Build a list of reduction variables.
2958 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2959 Value *LocalReduceList =
2960 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
2961
2963 BufferArgAlloca, Builder.getPtrTy(),
2964 BufferArgAlloca->getName() + ".ascast");
2966 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2967 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2968 ReduceListArgAlloca, Builder.getPtrTy(),
2969 ReduceListArgAlloca->getName() + ".ascast");
2970 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2971 LocalReduceList, Builder.getPtrTy(),
2972 LocalReduceList->getName() + ".ascast");
2973
2974 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2975 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2976 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2977
2978 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2979 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2980 Type *IndexTy = Builder.getIndexTy(
2982 for (auto En : enumerate(ReductionInfos)) {
2983 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
2984 RedListArrayTy, LocalReduceListAddrCast,
2985 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2986 Value *BufferVD =
2987 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
2988 // Global = Buffer.VD[Idx];
2990 ReductionsBufferTy, BufferVD, 0, En.index());
2991 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
2992 }
2993
2994 // Call reduce_function(GlobalReduceList, ReduceList)
2995 Value *ReduceList =
2996 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2997 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
2998 ->addFnAttr(Attribute::NoUnwind);
3000 Builder.restoreIP(OldIP);
3001 return LtGRFunc;
3002}
3003
3004Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3005 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3006 AttributeList FuncAttrs) {
3008 LLVMContext &Ctx = M.getContext();
3011 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3012 /* IsVarArg */ false);
3013 Function *LtGCFunc =
3015 "_omp_reduction_global_to_list_copy_func", &M);
3016 LtGCFunc->setAttributes(FuncAttrs);
3017 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3018 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3019 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3020
3021 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3022 Builder.SetInsertPoint(EntryBlock);
3023
3024 // Buffer: global reduction buffer.
3025 Argument *BufferArg = LtGCFunc->getArg(0);
3026 // Idx: index of the buffer.
3027 Argument *IdxArg = LtGCFunc->getArg(1);
3028 // ReduceList: thread local Reduce list.
3029 Argument *ReduceListArg = LtGCFunc->getArg(2);
3030
3031 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3032 BufferArg->getName() + ".addr");
3033 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3034 IdxArg->getName() + ".addr");
3035 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3036 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3038 BufferArgAlloca, Builder.getPtrTy(),
3039 BufferArgAlloca->getName() + ".ascast");
3041 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3042 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3043 ReduceListArgAlloca, Builder.getPtrTy(),
3044 ReduceListArgAlloca->getName() + ".ascast");
3045 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3046 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3047 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3048
3049 Value *LocalReduceList =
3050 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3051 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3052 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3053 Type *IndexTy = Builder.getIndexTy(
3055 for (auto En : enumerate(ReductionInfos)) {
3056 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3057 auto *RedListArrayTy =
3058 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3059 // Reduce element = LocalReduceList[i]
3060 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3061 RedListArrayTy, LocalReduceList,
3062 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3063 // elemptr = ((CopyType*)(elemptrptr)) + I
3064 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3065 // Global = Buffer.VD[Idx];
3066 Value *BufferVD =
3067 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3069 ReductionsBufferTy, BufferVD, 0, En.index());
3070
3071 switch (RI.EvaluationKind) {
3072 case EvalKind::Scalar: {
3073 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3074 Builder.CreateStore(TargetElement, ElemPtr);
3075 break;
3076 }
3077 case EvalKind::Complex: {
3079 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3080 Value *SrcReal = Builder.CreateLoad(
3081 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3083 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3084 Value *SrcImg = Builder.CreateLoad(
3085 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3086
3088 RI.ElementType, ElemPtr, 0, 0, ".realp");
3090 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3091 Builder.CreateStore(SrcReal, DestRealPtr);
3092 Builder.CreateStore(SrcImg, DestImgPtr);
3093 break;
3094 }
3095 case EvalKind::Aggregate: {
3096 Value *SizeVal =
3100 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3101 SizeVal, false);
3102 break;
3103 }
3104 }
3105 }
3106
3108 Builder.restoreIP(OldIP);
3109 return LtGCFunc;
3110}
3111
3112Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3113 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3114 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3116 LLVMContext &Ctx = M.getContext();
3117 auto *FuncTy = FunctionType::get(
3119 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3120 /* IsVarArg */ false);
3121 Function *LtGRFunc =
3123 "_omp_reduction_global_to_list_reduce_func", &M);
3124 LtGRFunc->setAttributes(FuncAttrs);
3125 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3126 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3127 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3128
3129 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3130 Builder.SetInsertPoint(EntryBlock);
3131
3132 // Buffer: global reduction buffer.
3133 Argument *BufferArg = LtGRFunc->getArg(0);
3134 // Idx: index of the buffer.
3135 Argument *IdxArg = LtGRFunc->getArg(1);
3136 // ReduceList: thread local Reduce list.
3137 Argument *ReduceListArg = LtGRFunc->getArg(2);
3138
3139 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3140 BufferArg->getName() + ".addr");
3141 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3142 IdxArg->getName() + ".addr");
3143 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3144 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3145 ArrayType *RedListArrayTy =
3146 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3147
3148 // 1. Build a list of reduction variables.
3149 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3150 Value *LocalReduceList =
3151 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3152
3154 BufferArgAlloca, Builder.getPtrTy(),
3155 BufferArgAlloca->getName() + ".ascast");
3157 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3158 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3159 ReduceListArgAlloca, Builder.getPtrTy(),
3160 ReduceListArgAlloca->getName() + ".ascast");
3162 LocalReduceList, Builder.getPtrTy(),
3163 LocalReduceList->getName() + ".ascast");
3164
3165 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3166 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3167 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3168
3169 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3170 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3171 Type *IndexTy = Builder.getIndexTy(
3173 for (auto En : enumerate(ReductionInfos)) {
3174 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3175 RedListArrayTy, ReductionList,
3176 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3177 // Global = Buffer.VD[Idx];
3178 Value *BufferVD =
3179 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3181 ReductionsBufferTy, BufferVD, 0, En.index());
3182 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3183 }
3184
3185 // Call reduce_function(ReduceList, GlobalReduceList)
3186 Value *ReduceList =
3187 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3188 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3189 ->addFnAttr(Attribute::NoUnwind);
3191 Builder.restoreIP(OldIP);
3192 return LtGRFunc;
3193}
3194
3195std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3196 std::string Suffix =
3197 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3198 return (Name + Suffix).str();
3199}
3200
3201Function *OpenMPIRBuilder::createReductionFunction(
3202 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3203 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3204 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3205 {Builder.getPtrTy(), Builder.getPtrTy()},
3206 /* IsVarArg */ false);
3207 std::string Name = getReductionFuncName(ReducerName);
3208 Function *ReductionFunc =
3210 ReductionFunc->setAttributes(FuncAttrs);
3211 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3212 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3213 BasicBlock *EntryBB =
3214 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3215 Builder.SetInsertPoint(EntryBB);
3216
3217 // Need to alloca memory here and deal with the pointers before getting
3218 // LHS/RHS pointers out
3219 Value *LHSArrayPtr = nullptr;
3220 Value *RHSArrayPtr = nullptr;
3221 Argument *Arg0 = ReductionFunc->getArg(0);
3222 Argument *Arg1 = ReductionFunc->getArg(1);
3223 Type *Arg0Type = Arg0->getType();
3224 Type *Arg1Type = Arg1->getType();
3225
3226 Value *LHSAlloca =
3227 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3228 Value *RHSAlloca =
3229 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3231 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3233 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3234 Builder.CreateStore(Arg0, LHSAddrCast);
3235 Builder.CreateStore(Arg1, RHSAddrCast);
3236 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3237 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3238
3239 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3240 Type *IndexTy = Builder.getIndexTy(
3242 SmallVector<Value *> LHSPtrs, RHSPtrs;
3243 for (auto En : enumerate(ReductionInfos)) {
3244 const ReductionInfo &RI = En.value();
3245 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3246 RedArrayTy, RHSArrayPtr,
3247 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3248 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3250 RHSI8Ptr, RI.PrivateVariable->getType(),
3251 RHSI8Ptr->getName() + ".ascast");
3252
3253 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3254 RedArrayTy, LHSArrayPtr,
3255 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3256 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3258 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3259
3261 LHSPtrs.emplace_back(LHSPtr);
3262 RHSPtrs.emplace_back(RHSPtr);
3263 } else {
3264 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3265 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3266 Value *Reduced;
3267 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3268 if (!Builder.GetInsertBlock())
3269 return ReductionFunc;
3270 Builder.CreateStore(Reduced, LHSPtr);
3271 }
3272 }
3273
3275 for (auto En : enumerate(ReductionInfos)) {
3276 unsigned Index = En.index();
3277 const ReductionInfo &RI = En.value();
3278 Value *LHSFixupPtr, *RHSFixupPtr;
3279 Builder.restoreIP(RI.ReductionGenClang(
3280 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3281
3282 // Fix the CallBack code genereated to use the correct Values for the LHS
3283 // and RHS
3284 LHSFixupPtr->replaceUsesWithIf(
3285 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3286 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3287 ReductionFunc;
3288 });
3289 RHSFixupPtr->replaceUsesWithIf(
3290 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3291 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3292 ReductionFunc;
3293 });
3294 }
3295
3297 return ReductionFunc;
3298}
3299
3300static void
3302 bool IsGPU) {
3303 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3304 (void)RI;
3305 assert(RI.Variable && "expected non-null variable");
3306 assert(RI.PrivateVariable && "expected non-null private variable");
3307 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3308 "expected non-null reduction generator callback");
3309 if (!IsGPU) {
3310 assert(
3311 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3312 "expected variables and their private equivalents to have the same "
3313 "type");
3314 }
3315 assert(RI.Variable->getType()->isPointerTy() &&
3316 "expected variables to be pointers");
3317 }
3318}
3319
3321 const LocationDescription &Loc, InsertPointTy AllocaIP,
3322 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3323 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3324 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3325 unsigned ReductionBufNum, Value *SrcLocInfo) {
3326 if (!updateToLocation(Loc))
3327 return InsertPointTy();
3328 Builder.restoreIP(CodeGenIP);
3329 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3330 LLVMContext &Ctx = M.getContext();
3331
3332 // Source location for the ident struct
3333 if (!SrcLocInfo) {
3334 uint32_t SrcLocStrSize;
3335 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3336 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3337 }
3338
3339 if (ReductionInfos.size() == 0)
3340 return Builder.saveIP();
3341
3342 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3343 AttributeList FuncAttrs;
3344 AttrBuilder AttrBldr(Ctx);
3345 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3346 AttrBldr.addAttribute(Attr);
3347 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3348 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3349
3350 Function *ReductionFunc = nullptr;
3351 CodeGenIP = Builder.saveIP();
3352 ReductionFunc =
3353 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3354 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3355 Builder.restoreIP(CodeGenIP);
3356
3357 // Set the grid value in the config needed for lowering later on
3358 if (GridValue.has_value())
3359 Config.setGridValue(GridValue.value());
3360 else
3361 Config.setGridValue(getGridValue(T, ReductionFunc));
3362
3363 uint32_t SrcLocStrSize;
3364 Constant *SrcLocStr = getOrCreateDefaultSrcLocStr(SrcLocStrSize);
3365 Value *RTLoc =
3366 getOrCreateIdent(SrcLocStr, SrcLocStrSize, omp::IdentFlag(0), 0);
3367
3368 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3369 // RedList, shuffle_reduce_func, interwarp_copy_func);
3370 // or
3371 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3372 Value *Res;
3373
3374 // 1. Build a list of reduction variables.
3375 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3376 auto Size = ReductionInfos.size();
3377 Type *PtrTy = PointerType::getUnqual(Ctx);
3378 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3379 CodeGenIP = Builder.saveIP();
3380 Builder.restoreIP(AllocaIP);
3381 Value *ReductionListAlloca =
3382 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3384 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3385 Builder.restoreIP(CodeGenIP);
3386 Type *IndexTy = Builder.getIndexTy(
3388 for (auto En : enumerate(ReductionInfos)) {
3389 const ReductionInfo &RI = En.value();
3390 Value *ElemPtr = Builder.CreateInBoundsGEP(
3391 RedArrayTy, ReductionList,
3392 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3393 Value *CastElem =
3395 Builder.CreateStore(CastElem, ElemPtr);
3396 }
3397 CodeGenIP = Builder.saveIP();
3398 Function *SarFunc =
3399 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3400 Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3401 Builder.restoreIP(CodeGenIP);
3402
3403 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3404
3405 unsigned MaxDataSize = 0;
3406 SmallVector<Type *> ReductionTypeArgs;
3407 for (auto En : enumerate(ReductionInfos)) {
3408 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3409 if (Size > MaxDataSize)
3410 MaxDataSize = Size;
3411 ReductionTypeArgs.emplace_back(En.value().ElementType);
3412 }
3413 Value *ReductionDataSize =
3414 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3415 if (!IsTeamsReduction) {
3416 Value *SarFuncCast =
3418 Value *WcFuncCast =
3420 Value *Args[] = {RTLoc, ReductionDataSize, RL, SarFuncCast, WcFuncCast};
3422 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3423 Res = Builder.CreateCall(Pv2Ptr, Args);
3424 } else {
3425 CodeGenIP = Builder.saveIP();
3426 StructType *ReductionsBufferTy = StructType::create(
3427 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3428 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3429 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3430 Function *LtGCFunc = emitListToGlobalCopyFunction(
3431 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3432 Function *LtGRFunc = emitListToGlobalReduceFunction(
3433 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3434 Function *GtLCFunc = emitGlobalToListCopyFunction(
3435 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3436 Function *GtLRFunc = emitGlobalToListReduceFunction(
3437 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3438 Builder.restoreIP(CodeGenIP);
3439
3440 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3441 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3442
3443 Value *Args3[] = {RTLoc,
3444 KernelTeamsReductionPtr,
3445 Builder.getInt32(ReductionBufNum),
3446 ReductionDataSize,
3447 RL,
3448 SarFunc,
3449 WcFunc,
3450 LtGCFunc,
3451 LtGRFunc,
3452 GtLCFunc,
3453 GtLRFunc};
3454
3455 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3456 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3457 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3458 }
3459
3460 // 5. Build if (res == 1)
3461 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3462 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3464 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3465
3466 // 6. Build then branch: where we have reduced values in the master
3467 // thread in each team.
3468 // __kmpc_end_reduce{_nowait}(<gtid>);
3469 // break;
3470 emitBlock(ThenBB, CurFunc);
3471
3472 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3473 for (auto En : enumerate(ReductionInfos)) {
3474 const ReductionInfo &RI = En.value();
3475 Value *LHS = RI.Variable;
3476 Value *RHS =
3478
3480 Value *LHSPtr, *RHSPtr;
3482 &LHSPtr, &RHSPtr, CurFunc));
3483
3484 // Fix the CallBack code genereated to use the correct Values for the LHS
3485 // and RHS
3486 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3487 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3488 ReductionFunc;
3489 });
3490 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3491 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3492 ReductionFunc;
3493 });
3494 } else {
3495 assert(false && "Unhandled ReductionGenCBKind");
3496 }
3497 }
3498 emitBlock(ExitBB, CurFunc);
3499
3501
3502 return Builder.saveIP();
3503}
3504
3506 Type *VoidTy = Type::getVoidTy(M.getContext());
3507 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3508 auto *FuncTy =
3509 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3511 ".omp.reduction.func", &M);
3512}
3513
3516 InsertPointTy AllocaIP,
3517 ArrayRef<ReductionInfo> ReductionInfos,
3518 ArrayRef<bool> IsByRef, bool IsNoWait) {
3519 assert(ReductionInfos.size() == IsByRef.size());
3520 for (const ReductionInfo &RI : ReductionInfos) {
3521 (void)RI;
3522 assert(RI.Variable && "expected non-null variable");
3523 assert(RI.PrivateVariable && "expected non-null private variable");
3524 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3525 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3526 "expected variables and their private equivalents to have the same "
3527 "type");
3528 assert(RI.Variable->getType()->isPointerTy() &&
3529 "expected variables to be pointers");
3530 }
3531
3532 if (!updateToLocation(Loc))
3533 return InsertPointTy();
3534
3535 BasicBlock *InsertBlock = Loc.IP.getBlock();
3536 BasicBlock *ContinuationBlock =
3537 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3538 InsertBlock->getTerminator()->eraseFromParent();
3539
3540 // Create and populate array of type-erased pointers to private reduction
3541 // values.
3542 unsigned NumReductions = ReductionInfos.size();
3543 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3545 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3546
3547 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3548
3549 for (auto En : enumerate(ReductionInfos)) {
3550 unsigned Index = En.index();
3551 const ReductionInfo &RI = En.value();
3552 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3553 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3554 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3555 }
3556
3557 // Emit a call to the runtime function that orchestrates the reduction.
3558 // Declare the reduction function in the process.
3560 Module *Module = Func->getParent();
3561 uint32_t SrcLocStrSize;
3562 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3563 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3564 return RI.AtomicReductionGen;
3565 });
3566 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3567 CanGenerateAtomic
3568 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3569 : IdentFlag(0));
3570 Value *ThreadId = getOrCreateThreadID(Ident);
3571 Constant *NumVariables = Builder.getInt32(NumReductions);
3572 const DataLayout &DL = Module->getDataLayout();
3573 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3574 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3575 Function *ReductionFunc = getFreshReductionFunc(*Module);
3576 Value *Lock = getOMPCriticalRegionLock(".reduction");
3578 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3579 : RuntimeFunction::OMPRTL___kmpc_reduce);
3580 CallInst *ReduceCall =
3581 Builder.CreateCall(ReduceFunc,
3582 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3583 ReductionFunc, Lock},
3584 "reduce");
3585
3586 // Create final reduction entry blocks for the atomic and non-atomic case.
3587 // Emit IR that dispatches control flow to one of the blocks based on the
3588 // reduction supporting the atomic mode.
3589 BasicBlock *NonAtomicRedBlock =
3590 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3591 BasicBlock *AtomicRedBlock =
3592 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3593 SwitchInst *Switch =
3594 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3595 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3596 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3597
3598 // Populate the non-atomic reduction using the elementwise reduction function.
3599 // This loads the elements from the global and private variables and reduces
3600 // them before storing back the result to the global variable.
3601 Builder.SetInsertPoint(NonAtomicRedBlock);
3602 for (auto En : enumerate(ReductionInfos)) {
3603 const ReductionInfo &RI = En.value();
3605 // We have one less load for by-ref case because that load is now inside of
3606 // the reduction region
3607 Value *RedValue = nullptr;
3608 if (!IsByRef[En.index()]) {
3609 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3610 "red.value." + Twine(En.index()));
3611 }
3612 Value *PrivateRedValue =
3614 "red.private.value." + Twine(En.index()));
3615 Value *Reduced;
3616 if (IsByRef[En.index()]) {
3618 PrivateRedValue, Reduced));
3619 } else {
3621 PrivateRedValue, Reduced));
3622 }
3623 if (!Builder.GetInsertBlock())
3624 return InsertPointTy();
3625 // for by-ref case, the load is inside of the reduction region
3626 if (!IsByRef[En.index()])
3627 Builder.CreateStore(Reduced, RI.Variable);
3628 }
3629 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3630 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3631 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3632 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3633 Builder.CreateBr(ContinuationBlock);
3634
3635 // Populate the atomic reduction using the atomic elementwise reduction
3636 // function. There are no loads/stores here because they will be happening
3637 // inside the atomic elementwise reduction.
3638 Builder.SetInsertPoint(AtomicRedBlock);
3639 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3640 for (const ReductionInfo &RI : ReductionInfos) {
3642 RI.Variable, RI.PrivateVariable));
3643 if (!Builder.GetInsertBlock())
3644 return InsertPointTy();
3645 }
3646 Builder.CreateBr(ContinuationBlock);
3647 } else {
3649 }
3650
3651 // Populate the outlined reduction function using the elementwise reduction
3652 // function. Partial values are extracted from the type-erased array of
3653 // pointers to private variables.
3654 BasicBlock *ReductionFuncBlock =
3655 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3656 Builder.SetInsertPoint(ReductionFuncBlock);
3657 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3658 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3659
3660 for (auto En : enumerate(ReductionInfos)) {
3661 const ReductionInfo &RI = En.value();
3663 RedArrayTy, LHSArrayPtr, 0, En.index());
3664 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3665 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3666 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3668 RedArrayTy, RHSArrayPtr, 0, En.index());
3669 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3670 Value *RHSPtr =
3672 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3673 Value *Reduced;
3675 if (!Builder.GetInsertBlock())
3676 return InsertPointTy();
3677 // store is inside of the reduction region when using by-ref
3678 if (!IsByRef[En.index()])
3679 Builder.CreateStore(Reduced, LHSPtr);
3680 }
3682
3683 Builder.SetInsertPoint(ContinuationBlock);
3684 return Builder.saveIP();
3685}
3686
3689 BodyGenCallbackTy BodyGenCB,
3690 FinalizeCallbackTy FiniCB) {
3691
3692 if (!updateToLocation(Loc))
3693 return Loc.IP;
3694
3695 Directive OMPD = Directive::OMPD_master;
3696 uint32_t SrcLocStrSize;
3697 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3698 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3699 Value *ThreadId = getOrCreateThreadID(Ident);
3700 Value *Args[] = {Ident, ThreadId};
3701
3702 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3703 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3704
3705 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3706 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3707
3708 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3709 /*Conditional*/ true, /*hasFinalize*/ true);
3710}
3711
3714 BodyGenCallbackTy BodyGenCB,
3715 FinalizeCallbackTy FiniCB, Value *Filter) {
3716 if (!updateToLocation(Loc))
3717 return Loc.IP;
3718
3719 Directive OMPD = Directive::OMPD_masked;
3720 uint32_t SrcLocStrSize;
3721 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3722 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3723 Value *ThreadId = getOrCreateThreadID(Ident);
3724 Value *Args[] = {Ident, ThreadId, Filter};
3725 Value *ArgsEnd[] = {Ident, ThreadId};
3726
3727 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3728 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3729
3730 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3731 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3732
3733 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3734 /*Conditional*/ true, /*hasFinalize*/ true);
3735}
3736
3738 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3739 BasicBlock *PostInsertBefore, const Twine &Name) {
3740 Module *M = F->getParent();
3741 LLVMContext &Ctx = M->getContext();
3742 Type *IndVarTy = TripCount->getType();
3743
3744 // Create the basic block structure.
3745 BasicBlock *Preheader =
3746 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3747 BasicBlock *Header =
3748 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3749 BasicBlock *Cond =
3750 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3751 BasicBlock *Body =
3752 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3753 BasicBlock *Latch =
3754 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3755 BasicBlock *Exit =
3756 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3757 BasicBlock *After =
3758 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3759
3760 // Use specified DebugLoc for new instructions.
3762
3763 Builder.SetInsertPoint(Preheader);
3764 Builder.CreateBr(Header);
3765
3766 Builder.SetInsertPoint(Header);
3767 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3768 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3770
3772 Value *Cmp =
3773 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3774 Builder.CreateCondBr(Cmp, Body, Exit);
3775
3776 Builder.SetInsertPoint(Body);
3777 Builder.CreateBr(Latch);
3778
3779 Builder.SetInsertPoint(Latch);
3780 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3781 "omp_" + Name + ".next", /*HasNUW=*/true);
3782 Builder.CreateBr(Header);
3783 IndVarPHI->addIncoming(Next, Latch);
3784
3785 Builder.SetInsertPoint(Exit);
3787
3788 // Remember and return the canonical control flow.
3789 LoopInfos.emplace_front();
3790 CanonicalLoopInfo *CL = &LoopInfos.front();
3791
3792 CL->Header = Header;
3793 CL->Cond = Cond;
3794 CL->Latch = Latch;
3795 CL->Exit = Exit;
3796
3797#ifndef NDEBUG
3798 CL->assertOK();
3799#endif
3800 return CL;
3801}
3802
3805 LoopBodyGenCallbackTy BodyGenCB,
3806 Value *TripCount, const Twine &Name) {
3807 BasicBlock *BB = Loc.IP.getBlock();
3808 BasicBlock *NextBB = BB->getNextNode();
3809
3810 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3811 NextBB, NextBB, Name);
3812 BasicBlock *After = CL->getAfter();
3813
3814 // If location is not set, don't connect the loop.
3815 if (updateToLocation(Loc)) {
3816 // Split the loop at the insertion point: Branch to the preheader and move
3817 // every following instruction to after the loop (the After BB). Also, the
3818 // new successor is the loop's after block.
3819 spliceBB(Builder, After, /*CreateBranch=*/false);
3821 }
3822
3823 // Emit the body content. We do it after connecting the loop to the CFG to
3824 // avoid that the callback encounters degenerate BBs.
3825 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
3826
3827#ifndef NDEBUG
3828 CL->assertOK();
3829#endif
3830 return CL;
3831}
3832
3834 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3835 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3836 InsertPointTy ComputeIP, const Twine &Name) {
3837
3838 // Consider the following difficulties (assuming 8-bit signed integers):
3839 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3840 // DO I = 1, 100, 50
3841 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
3842 // DO I = 100, 0, -128
3843
3844 // Start, Stop and Step must be of the same integer type.
3845 auto *IndVarTy = cast<IntegerType>(Start->getType());
3846 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
3847 assert(IndVarTy == Step->getType() && "Step type mismatch");
3848
3849 LocationDescription ComputeLoc =
3850 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
3851 updateToLocation(ComputeLoc);
3852
3853 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
3854 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
3855
3856 // Like Step, but always positive.
3857 Value *Incr = Step;
3858
3859 // Distance between Start and Stop; always positive.
3860 Value *Span;
3861
3862 // Condition whether there are no iterations are executed at all, e.g. because
3863 // UB < LB.
3864 Value *ZeroCmp;
3865
3866 if (IsSigned) {
3867 // Ensure that increment is positive. If not, negate and invert LB and UB.
3868 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
3869 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
3870 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
3871 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
3872 Span = Builder.CreateSub(UB, LB, "", false, true);
3873 ZeroCmp = Builder.CreateICmp(
3874 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
3875 } else {
3876 Span = Builder.CreateSub(Stop, Start, "", true);
3877 ZeroCmp = Builder.CreateICmp(
3878 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
3879 }
3880
3881 Value *CountIfLooping;
3882 if (InclusiveStop) {
3883 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
3884 } else {
3885 // Avoid incrementing past stop since it could overflow.
3886 Value *CountIfTwo = Builder.CreateAdd(
3887 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
3888 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
3889 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
3890 }
3891 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
3892 "omp_" + Name + ".tripcount");
3893
3894 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
3895 Builder.restoreIP(CodeGenIP);
3896 Value *Span = Builder.CreateMul(IV, Step);
3897 Value *IndVar = Builder.CreateAdd(Span, Start);
3898 BodyGenCB(Builder.saveIP(), IndVar);
3899 };
3900 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
3901 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
3902}
3903
3904// Returns an LLVM function to call for initializing loop bounds using OpenMP
3905// static scheduling depending on `type`. Only i32 and i64 are supported by the
3906// runtime. Always interpret integers as unsigned similarly to
3907// CanonicalLoopInfo.
3909 OpenMPIRBuilder &OMPBuilder) {
3910 unsigned Bitwidth = Ty->getIntegerBitWidth();
3911 if (Bitwidth == 32)
3912 return OMPBuilder.getOrCreateRuntimeFunction(
3913 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
3914 if (Bitwidth == 64)
3915 return OMPBuilder.getOrCreateRuntimeFunction(
3916 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
3917 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3918}
3919
3921OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
3922 InsertPointTy AllocaIP,
3923 bool NeedsBarrier) {
3924 assert(CLI->isValid() && "Requires a valid canonical loop");
3925 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3926 "Require dedicated allocate IP");
3927
3928 // Set up the source location value for OpenMP runtime.
3931
3932 uint32_t SrcLocStrSize;
3933 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
3934 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3935
3936 // Declare useful OpenMP runtime functions.
3937 Value *IV = CLI->getIndVar();
3938 Type *IVTy = IV->getType();
3939 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
3940 FunctionCallee StaticFini =
3941 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
3942
3943 // Allocate space for computed loop bounds as expected by the "init" function.
3944 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
3945
3946 Type *I32Type = Type::getInt32Ty(M.getContext());
3947 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
3948 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
3949 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
3950 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
3951
3952 // At the end of the preheader, prepare for calling the "init" function by
3953 // storing the current loop bounds into the allocated space. A canonical loop
3954 // always iterates from 0 to trip-count with step 1. Note that "init" expects
3955 // and produces an inclusive upper bound.
3957 Constant *Zero = ConstantInt::get(IVTy, 0);
3958 Constant *One = ConstantInt::get(IVTy, 1);
3959 Builder.CreateStore(Zero, PLowerBound);
3960 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
3961 Builder.CreateStore(UpperBound, PUpperBound);
3962 Builder.CreateStore(One, PStride);
3963
3964 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
3965
3966 Constant *SchedulingType = ConstantInt::get(
3967 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
3968
3969 // Call the "init" function and update the trip count of the loop with the
3970 // value it produced.
3971 Builder.CreateCall(StaticInit,
3972 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
3973 PUpperBound, PStride, One, Zero});
3974 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
3975 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
3976 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
3977 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
3978 CLI->setTripCount(TripCount);
3979
3980 // Update all uses of the induction variable except the one in the condition
3981 // block that compares it with the actual upper bound, and the increment in
3982 // the latch block.
3983
3984 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
3986 CLI->getBody()->getFirstInsertionPt());
3988 return Builder.CreateAdd(OldIV, LowerBound);
3989 });
3990
3991 // In the "exit" block, call the "fini" function.
3993 CLI->getExit()->getTerminator()->getIterator());
3994 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
3995
3996 // Add the barrier if requested.
3997 if (NeedsBarrier)
3998 createBarrier(LocationDescription(Builder.saveIP(), DL),
3999 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4000 /* CheckCancelFlag */ false);
4001
4002 InsertPointTy AfterIP = CLI->getAfterIP();
4003 CLI->invalidate();
4004
4005 return AfterIP;
4006}
4007
4008OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4009 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4010 bool NeedsBarrier, Value *ChunkSize) {
4011 assert(CLI->isValid() && "Requires a valid canonical loop");
4012 assert(ChunkSize && "Chunk size is required");
4013
4014 LLVMContext &Ctx = CLI->getFunction()->getContext();
4015 Value *IV = CLI->getIndVar();
4016 Value *OrigTripCount = CLI->getTripCount();
4017 Type *IVTy = IV->getType();
4018 assert(IVTy->getIntegerBitWidth() <= 64 &&
4019 "Max supported tripcount bitwidth is 64 bits");
4020 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4021 : Type::getInt64Ty(Ctx);
4022 Type *I32Type = Type::getInt32Ty(M.getContext());
4023 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4024 Constant *One = ConstantInt::get(InternalIVTy, 1);
4025
4026 // Declare useful OpenMP runtime functions.
4027 FunctionCallee StaticInit =
4028 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4029 FunctionCallee StaticFini =
4030 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4031
4032 // Allocate space for computed loop bounds as expected by the "init" function.
4033 Builder.restoreIP(AllocaIP);
4035 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4036 Value *PLowerBound =
4037 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4038 Value *PUpperBound =
4039 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4040 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4041
4042 // Set up the source location value for the OpenMP runtime.
4045
4046 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4047 Value *CastedChunkSize =
4048 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4049 Value *CastedTripCount =
4050 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4051
4052 Constant *SchedulingType = ConstantInt::get(
4053 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4054 Builder.CreateStore(Zero, PLowerBound);
4055 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4056 Builder.CreateStore(OrigUpperBound, PUpperBound);
4057 Builder.CreateStore(One, PStride);
4058
4059 // Call the "init" function and update the trip count of the loop with the
4060 // value it produced.
4061 uint32_t SrcLocStrSize;
4062 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4063 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4064 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4065 Builder.CreateCall(StaticInit,
4066 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4067 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4068 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4069 /*pstride=*/PStride, /*incr=*/One,
4070 /*chunk=*/CastedChunkSize});
4071
4072 // Load values written by the "init" function.
4073 Value *FirstChunkStart =
4074 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4075 Value *FirstChunkStop =
4076 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4077 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4078 Value *ChunkRange =
4079 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4080 Value *NextChunkStride =
4081 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
4082
4083 // Create outer "dispatch" loop for enumerating the chunks.
4084 BasicBlock *DispatchEnter = splitBB(Builder, true);
4085 Value *DispatchCounter;
4087 {Builder.saveIP(), DL},
4088 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
4089 FirstChunkStart, CastedTripCount, NextChunkStride,
4090 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4091 "dispatch");
4092
4093 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4094 // not have to preserve the canonical invariant.
4095 BasicBlock *DispatchBody = DispatchCLI->getBody();
4096 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4097 BasicBlock *DispatchExit = DispatchCLI->getExit();
4098 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4099 DispatchCLI->invalidate();
4100
4101 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4102 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4103 redirectTo(CLI->getExit(), DispatchLatch, DL);
4104 redirectTo(DispatchBody, DispatchEnter, DL);
4105
4106 // Prepare the prolog of the chunk loop.
4109
4110 // Compute the number of iterations of the chunk loop.
4112 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4113 Value *IsLastChunk =
4114 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4115 Value *CountUntilOrigTripCount =
4116 Builder.CreateSub(CastedTripCount, DispatchCounter);
4117 Value *ChunkTripCount = Builder.CreateSelect(
4118 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4119 Value *BackcastedChunkTC =
4120 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4121 CLI->setTripCount(BackcastedChunkTC);
4122
4123 // Update all uses of the induction variable except the one in the condition
4124 // block that compares it with the actual upper bound, and the increment in
4125 // the latch block.
4126 Value *BackcastedDispatchCounter =
4127 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4128 CLI->mapIndVar([&](Instruction *) -> Value * {
4129 Builder.restoreIP(CLI->getBodyIP());
4130 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4131 });
4132
4133 // In the "exit" block, call the "fini" function.
4134 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4135 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4136
4137 // Add the barrier if requested.
4138 if (NeedsBarrier)
4139 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4140 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4141
4142#ifndef NDEBUG
4143 // Even though we currently do not support applying additional methods to it,
4144 // the chunk loop should remain a canonical loop.
4145 CLI->assertOK();
4146#endif
4147
4148 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
4149}
4150
4151// Returns an LLVM function to call for executing an OpenMP static worksharing
4152// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4153// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4154static FunctionCallee
4156 WorksharingLoopType LoopType) {
4157 unsigned Bitwidth = Ty->getIntegerBitWidth();
4158 Module &M = OMPBuilder->M;
4159 switch (LoopType) {
4160 case WorksharingLoopType::ForStaticLoop:
4161 if (Bitwidth == 32)
4162 return OMPBuilder->getOrCreateRuntimeFunction(
4163 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4164 if (Bitwidth == 64)
4165 return OMPBuilder->getOrCreateRuntimeFunction(
4166 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4167 break;
4168 case WorksharingLoopType::DistributeStaticLoop:
4169 if (Bitwidth == 32)
4170 return OMPBuilder->getOrCreateRuntimeFunction(
4171 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4172 if (Bitwidth == 64)
4173 return OMPBuilder->getOrCreateRuntimeFunction(
4174 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4175 break;
4176 case WorksharingLoopType::DistributeForStaticLoop:
4177 if (Bitwidth == 32)
4178 return OMPBuilder->getOrCreateRuntimeFunction(
4179 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4180 if (Bitwidth == 64)
4181 return OMPBuilder->getOrCreateRuntimeFunction(
4182 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4183 break;
4184 }
4185 if (Bitwidth != 32 && Bitwidth != 64) {
4186 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4187 }
4188 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4189}
4190
4191// Inserts a call to proper OpenMP Device RTL function which handles
4192// loop worksharing.
4194 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4195 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4196 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4197 Type *TripCountTy = TripCount->getType();
4198 Module &M = OMPBuilder->M;
4199 IRBuilder<> &Builder = OMPBuilder->Builder;
4200 FunctionCallee RTLFn =
4201 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4202 SmallVector<Value *, 8> RealArgs;
4203 RealArgs.push_back(Ident);
4204 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4205 RealArgs.push_back(LoopBodyArg);
4206 RealArgs.push_back(TripCount);
4207 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4208 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4209 Builder.CreateCall(RTLFn, RealArgs);
4210 return;
4211 }
4212 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4213 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4214 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});