//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"

#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>
#include <stack>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic modifier is the default in the OpenMP runtime library,
      // so there is no need to set it explicitly.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause) {
  OMPScheduleType BaseSchedule =
      getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result));
  return Result;
}
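
// Worked example (illustrative, not part of the upstream source): for
// 'schedule(nonmonotonic: dynamic, 4)' with no ordered clause, the helpers
// above compose as
//   getOpenMPBaseScheduleType          -> OMPScheduleType::BaseDynamicChunked
//   getOpenMPOrderingScheduleType      -> ... | ModifierUnordered
//   getOpenMPMonotonicityScheduleType  -> ... | ModifierNonmonotonic
// i.e. UnorderedDynamicChunked combined with the nonmonotonic modifier, which
// is the encoded schedule value later handed to the runtime when the
// worksharing loop is lowered.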

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch)
    BranchInst::Create(New, Old);
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
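
// Illustrative usage of the splitting helpers above (hypothetical snippet,
// not from the upstream source): with the builder positioned mid-block, this
// moves the trailing instructions into a fresh successor named
// "<old-name>.split", branches to it, and leaves the builder's configured
// debug location untouched:
//   BasicBlock *ContBB =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".split");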

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *
createFakeIntVal(IRBuilderBase &Builder,
                 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                 llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                 const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
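
// Illustrative construction (hypothetical values, not from the upstream
// source): a host-side configuration for a translation unit containing
// '#pragma omp requires unified_shared_memory' could be built as
//   OpenMPIRBuilderConfig Config(
//       /*IsTargetDevice=*/false, /*IsGPU=*/false,
//       /*OpenMPOffloadMandatory=*/false,
//       /*HasRequiresReverseOffload=*/false,
//       /*HasRequiresUnifiedAddress=*/false,
//       /*HasRequiresUnifiedSharedMemory=*/true,
//       /*HasRequiresDynamicAllocators=*/false);
// after which getRequiresFlags() reports OMP_REQ_UNIFIED_SHARED_MEMORY.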

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr const size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
  Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);

  assert(!KernelArgs.NumTeams.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  for (unsigned I = 1; I < std::min(KernelArgs.NumTeams.size(), MaxDim); ++I)
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads, {0});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }

    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
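
// Example (illustrative): requesting OMPRTL___kmpc_barrier on a fresh module
// creates a declaration of roughly the form
//   declare void @__kmpc_barrier(ptr, i32)
// with the attribute sets from OMPKinds.def attached; a later request for the
// same enum finds the existing declaration via M.getFunction and reuses it.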

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  Instruction *MoveLocInst = EntryBlock.getFirstNonPHI();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast_if_present<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator())
          continue;

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->getNumUses() == 1);

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that then need to be moved to the entry block of
  // our target, or we risk malformed optimisations by later passes. This is
  // only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here, it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so we have an end product that will not be implicitly adversely
  // affected by any raises, unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExpr's with further effort. However, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it will yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (auto *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }
}

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  Type *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalStringPtr(LocStr, /* Name */ "",
                                              /* AddressSpace */ 0, &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
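
// Example (illustrative): a directive in function "main" at line 12, column 4
// of "t.c" is encoded as ";t.c;main;12;4;;", the ';'-separated format the
// OpenMP runtime expects in the psource field of ident_t.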

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc,
                               omp::Directive Kind, bool ForceSimpleCall,
                               bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result =
      Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
                             UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
                                              : OMPRTL___kmpc_barrier),
                         Args);

  if (UseCancelBarrier && CheckCancelFlag)
    emitCancelationCheckImpl(Result, OMPD_parallel);

  return Builder.saveIP();
}
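
// For reference (illustrative IR, with hypothetical value and global names):
// an explicit, non-cancellable barrier produced by the code above looks like
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident.barrier, i32 %gtid)
// where the two ident globals differ only in their barrier-kind flags.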

OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = Builder.CreateCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                    omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
                    /* CheckCancelFlag */ false);
    }
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  emitCancelationCheckImpl(Result, CanceledDirective, ExitCB);

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  Builder.restoreIP(Loc.IP);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
    EmitFallbackCallbackTy emitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(Loc.IP);
  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(Builder, AllocaIP, Return, RTLoc, DeviceID,
                                     Args.NumTeams.front(), Args.NumThreads,
                                     OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  Builder.restoreIP(emitTargetCallFallbackCB(Builder.saveIP()));
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

void OpenMPIRBuilder::emitCancelationCheckImpl(Value *CancelFlag,
                                               omp::Directive CanceledDirective,
                                               FinalizeCallbackTy ExitCB) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  Builder.SetInsertPoint(CancellationBlock);
  if (ExitCB)
    ExitCB(Builder.saveIP());
  auto &FI = FinalizationStack.back();
  FI.FiniCB(Builder.saveIP());

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
}
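
// The control flow produced above is, schematically (illustrative names):
//   %cmp = icmp eq i32 %cancel_flag, 0
//   br i1 %cmp, label %cont, label %cncl
// where %cncl runs ExitCB plus the finalization callback and %cont continues
// normal code generation.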

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add address space cast if array for storing arguments is not allocated
  // in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_51.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build kmpc_parallel_51 call.
  Value *Parallel51CallArgs[] = {
      /* identifier*/ Ident,
      /* global thread num*/ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* Proc bind */ Builder.getInt32(-1),
      /* outlined function */
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr),
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);

  Builder.CreateCall(RTLFn, Parallel51CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace call to the OutlinedFn in OuterFn
// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {
      Ident, Builder.getInt32(NumCapturedVars),
      Builder.CreateBitCast(&OutlinedFn, OMPIRBuilder->ParallelTaskPtr)};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }
  if (IfCondition && RealArgs.back()->getType() != PtrTy)
    RealArgs.back() = Builder.CreateBitCast(RealArgs.back(), PtrTy);

  Builder.CreateCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
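
// For reference (illustrative IR with hypothetical names): for a parallel
// region capturing two pointers and no if-clause, the rewrite above replaces
// the call to the outlined function with
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @ident, i32 2, ptr @foo..omp_par, ptr %a.addr, ptr %b.addr)
// and the runtime then invokes @foo..omp_par once per thread in the team.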

IRBuilder<>::InsertPoint OpenMPIRBuilder::createParallel(
    const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
    BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
    FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
    omp::ProcBindKind ProcBind, bool IsCancellable) {
  assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");

  if (!updateToLocation(Loc))
    return Loc.IP;

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadID = getOrCreateThreadID(Ident);
  // If we generate code for the target device, we need to allocate the
  // struct for aggregate params in the device default alloca address space.
  // The OpenMP runtime requires that the params of the extracted functions
  // are passed as zero address space pointers. This flag ensures that
  // extracted function arguments are declared in zero address space.
  bool ArgsInZeroAddressSpace = Config.isTargetDevice();

  // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
  // only if we compile for the host side.
  if (NumThreads && !Config.isTargetDevice()) {
    Value *Args[] = {
        Ident, ThreadID,
        Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
  }

  if (ProcBind != OMP_PROC_BIND_default) {
    // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
    Value *Args[] = {
        Ident, ThreadID,
        ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
    Builder.CreateCall(
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
  }

  BasicBlock *InsertBB = Builder.GetInsertBlock();
  Function *OuterFn = InsertBB->getParent();

  // Save the outer alloca block because the insertion iterator may get
  // invalidated and we still need this later.
  BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();

  // Vector to remember instructions we used only during the modeling but which
  // we want to delete at the end.
  SmallVector<Instruction *, 4> ToBeDeleted;

  // Change the location to the outer alloca insertion point to create and
  // initialize the allocas we pass into the parallel region.
  InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
  Builder.restoreIP(NewOuter);
  AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
  AllocaInst *ZeroAddrAlloca =
      Builder.CreateAlloca(Int32, nullptr, "zero.addr");
  Instruction *TIDAddr = TIDAddrAlloca;
  Instruction *ZeroAddr = ZeroAddrAlloca;
  if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
    // Add additional casts to enforce pointers in zero address space.
    TIDAddr = new AddrSpaceCastInst(
        TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
    TIDAddr->insertAfter(TIDAddrAlloca);
    ToBeDeleted.push_back(TIDAddr);
    ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
                                     PointerType::get(M.getContext(), 0),
                                     "zero.addr.ascast");
    ZeroAddr->insertAfter(ZeroAddrAlloca);
    ToBeDeleted.push_back(ZeroAddr);
  }

  // We only need TIDAddr and ZeroAddr for modeling purposes to get the
  // associated arguments in the outlined function, so we delete them later.
  ToBeDeleted.push_back(TIDAddrAlloca);
  ToBeDeleted.push_back(ZeroAddrAlloca);

  // Create an artificial insertion point that will also ensure the blocks we
  // are about to split are not degenerated.
  auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);

  BasicBlock *EntryBB = UI->getParent();
  BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
  BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
  BasicBlock *PRegPreFiniBB =
      PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
  BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");

  auto FiniCBWrapper = [&](InsertPointTy IP) {
    // Hide "open-ended" blocks from the given FiniCB by setting the right jump
    // target to the region exit block.
    if (IP.getBlock()->end() == IP.getPoint()) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      Instruction *I = Builder.CreateBr(PRegExitBB);
      IP = InsertPointTy(I->getParent(), I->getIterator());
    }
    assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
           IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
           "Unexpected insertion point for finalization call!");
    return FiniCB(IP);
  };

  FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});

  // Generate the privatization allocas in the block that will become the entry
  // of the outlined function.
  Builder.SetInsertPoint(PRegEntryBB->getTerminator());
  InsertPointTy InnerAllocaIP = Builder.saveIP();

  AllocaInst *PrivTIDAddr =
      Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
  Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");

  // Add some fake uses for OpenMP provided arguments.
  ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
  Instruction *ZeroAddrUse =
      Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
  ToBeDeleted.push_back(ZeroAddrUse);

  // EntryBB
  //   |
  //   V
  // PRegionEntryBB         <- Privatization allocas are placed here.
  //   |
  //   V
  // PRegionBodyBB          <- BodyGen is invoked here.
  //   |
  //   V
  // PRegPreFiniBB          <- The block we will start finalization from.
  //   |
  //   V
  // PRegionExitBB          <- A common exit to simplify block collection.
  //

  LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");

  // Let the caller create the body.
  assert(BodyGenCB && "Expected body generation callback!");
  InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
  BodyGenCB(InnerAllocaIP, CodeGenIP);

  LLVM_DEBUG(dbgs() << "After  body codegen: " << *OuterFn << "\n");

  OutlineInfo OI;
  if (Config.isTargetDevice()) {
    // Generate OpenMP target specific runtime call.
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
                             IfCondition, NumThreads, PrivTID, PrivTIDAddr,
                             ThreadID, ToBeDeletedVec);
    };
  } else {
    // Generate OpenMP host runtime call.
    OI.PostOutlineCB = [=, ToBeDeletedVec =
                               std::move(ToBeDeleted)](Function &OutlinedFn) {
      hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
                           PrivTID, PrivTIDAddr, ToBeDeletedVec);
    };
  }

  OI.OuterAllocaBB = OuterAllocaBlock;
  OI.EntryBB = PRegEntryBB;
  OI.ExitBB = PRegExitBB;

  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  // Ensure a single exit node for the outlined region by creating one.
  // We might have multiple incoming edges to the exit now due to finalizations,
  // e.g., cancel calls that cause the control flow to leave the region.
  BasicBlock *PRegOutlinedExitBB = PRegExitBB;
  PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
  PRegOutlinedExitBB->setName("omp.par.outlined.exit");
  Blocks.push_back(PRegOutlinedExitBB);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                          /* AggregateArgs */ false,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ OuterAllocaBlock,
                          /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

  // Find inputs to, outputs from the code region.
  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
  Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);

1547 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1548
1549 FunctionCallee TIDRTLFn =
1550 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1551
1552 auto PrivHelper = [&](Value &V) {
1553 if (&V == TIDAddr || &V == ZeroAddr) {
1554 OI.ExcludeArgsFromAggregate.push_back(&V);
1555 return;
1556 }
1557
1559 for (Use &U : V.uses())
1560 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1561 if (ParallelRegionBlockSet.count(UserI->getParent()))
1562 Uses.insert(&U);
1563
1564 // __kmpc_fork_call expects extra arguments as pointers. If the input
1565 // already has a pointer type, everything is fine. Otherwise, store the
1566 // value onto stack and load it back inside the to-be-outlined region. This
1567 // will ensure only the pointer will be passed to the function.
1568 // FIXME: if there are more than 15 trailing arguments, they must be
1569 // additionally packed in a struct.
1570 Value *Inner = &V;
1571 if (!V.getType()->isPointerTy()) {
1573 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1574
1575 Builder.restoreIP(OuterAllocaIP);
1576 Value *Ptr =
1577 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1578
1579 // Store to stack at end of the block that currently branches to the entry
1580 // block of the to-be-outlined region.
1581 Builder.SetInsertPoint(InsertBB,
1582 InsertBB->getTerminator()->getIterator());
1583 Builder.CreateStore(&V, Ptr);
1584
1585 // Load back next to allocations in the to-be-outlined region.
1586 Builder.restoreIP(InnerAllocaIP);
1587 Inner = Builder.CreateLoad(V.getType(), Ptr);
1588 }
1589
1590 Value *ReplacementValue = nullptr;
1591 CallInst *CI = dyn_cast<CallInst>(&V);
1592 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1593 ReplacementValue = PrivTID;
1594 } else {
1595 Builder.restoreIP(
1596 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue));
1597 InnerAllocaIP = {
1598 InnerAllocaIP.getBlock(),
1599 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1600
1601 assert(ReplacementValue &&
1602 "Expected copy/create callback to set replacement value!");
1603 if (ReplacementValue == &V)
1604 return;
1605 }
1606
1607 for (Use *UPtr : Uses)
1608 UPtr->set(ReplacementValue);
1609 };
1610
1611 // Reset the inner alloca insertion as it will be used for loading the values
1612 // wrapped into pointers before passing them into the to-be-outlined region.
1613 // Configure it to insert immediately after the fake use of zero address so
1614 // that they are available in the generated body and so that the
1615 // OpenMP-related values (thread ID and zero address pointers) remain leading
1616 // in the argument list.
1617 InnerAllocaIP = IRBuilder<>::InsertPoint(
1618 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1619
1620 // Reset the outer alloca insertion point to the entry of the relevant block
1621 // in case it was invalidated.
1622 OuterAllocaIP = IRBuilder<>::InsertPoint(
1623 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1624
1625 for (Value *Input : Inputs) {
1626 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1627 PrivHelper(*Input);
1628 }
1629 LLVM_DEBUG({
1630 for (Value *Output : Outputs)
1631 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1632 });
1633 assert(Outputs.empty() &&
1634 "OpenMP outlining should not produce live-out values!");
1635
1636 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1637 LLVM_DEBUG({
1638 for (auto *BB : Blocks)
1639 dbgs() << " PBR: " << BB->getName() << "\n";
1640 });
1641
1642 // Adjust the finalization stack, verify the adjustment, and call the
1643 // finalize function a last time to finalize values between the pre-fini
1644 // block and the exit block if we left the parallel "the normal way".
1645 auto FiniInfo = FinalizationStack.pop_back_val();
1646 (void)FiniInfo;
1647 assert(FiniInfo.DK == OMPD_parallel &&
1648 "Unexpected finalization stack state!");
1649
1650 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1651
1652 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1653 FiniCB(PreFiniIP);
1654
1655 // Register the outlined info.
1656 addOutlineInfo(std::move(OI));
1657
1658 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1659 UI->eraseFromParent();
1660
1661 return AfterIP;
1662}
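// Illustrative sketch (not upstream code): a frontend drives the
// privatization above through the PrivCB callback. A minimal callback that
// firstprivatizes a 32-bit capture could look like this; all names below are
// hypothetical.
// \code{c}
//   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
//                     Value &Orig, Value &Inner, Value *&ReplVal)
//       -> InsertPointTy {
//     Builder.restoreIP(AllocaIP);
//     // Thread-private copy, assuming the capture is an i32 slot.
//     Value *Priv = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
//                                        Orig.getName() + ".priv");
//     Builder.restoreIP(CodeGenIP);
//     // Copy-in for firstprivate semantics.
//     Builder.CreateStore(Builder.CreateLoad(Builder.getInt32Ty(), &Inner),
//                         Priv);
//     ReplVal = Priv;
//     return Builder.saveIP();
//   };
// \endcode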
1663
1664void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1665 // Build call void __kmpc_flush(ident_t *loc)
1666 uint32_t SrcLocStrSize;
1667 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1668 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1669
1670 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1671}
1672
1673void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1674 if (!updateToLocation(Loc))
1675 return;
1676 emitFlush(Loc);
1677}
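// For reference, on the host the two functions above reduce to one runtime
// call; assuming the source-location ident is the global @0, the IR emitted
// for a `#pragma omp flush` is simply:
// \code{c}
//   call void @__kmpc_flush(ptr @0)
// \endcode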
1678
1679void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1680 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1681 // global_tid);
1682 uint32_t SrcLocStrSize;
1683 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1684 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1685 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1686
1687 // Ignore return result until untied tasks are supported.
1688 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1689 Args);
1690}
1691
1692void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1693 if (!updateToLocation(Loc))
1694 return;
1695 emitTaskwaitImpl(Loc);
1696}
1697
1698void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1699 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1700 uint32_t SrcLocStrSize;
1701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1702 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1703 Constant *I32Null = ConstantInt::getNullValue(Int32);
1704 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1705
1706 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1707 Args);
1708}
1709
1710void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1711 if (!updateToLocation(Loc))
1712 return;
1713 emitTaskyieldImpl(Loc);
1714}
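// The taskwait/taskyield helpers above follow the same shape; assuming ident
// @0 and a cached thread id %gtid, the generated calls are:
// \code{c}
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @0)
//   call i32 @__kmpc_omp_taskwait(ptr @0, i32 %gtid)        ; result ignored
//   call i32 @__kmpc_omp_taskyield(ptr @0, i32 %gtid, i32 0)
// \endcode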
1715
1716// Processes the dependencies in Dependencies and does the following:
1717// - Allocates space on the stack for an array of DependInfo objects
1718// - Populates each DependInfo object with relevant information of
1719// the corresponding dependence.
1720// - All code is inserted in the entry block of the current function.
1721static Value *emitTaskDependencies(
1722 OpenMPIRBuilder &OMPBuilder,
1723 SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1724 // Early return if we have no dependencies to process
1725 if (Dependencies.empty())
1726 return nullptr;
1727
1728 // Given a vector of DependData objects, in this function we create an
1729 // array on the stack that holds kmp_dep_info objects corresponding
1730 // to each dependency. This is then passed to the OpenMP runtime.
1731 // For example, if there are 'n' dependencies then the following pseudo
1732 // code is generated. Assume the first dependence is on a variable 'a'
1733 //
1734 // \code{c}
1735 // DepArray = alloc(n x sizeof(kmp_depend_info));
1736 // idx = 0;
1737 // DepArray[idx].base_addr = ptrtoint(&a);
1738 // DepArray[idx].len = 8;
1739 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1740 // ++idx;
1741 // DepArray[idx].base_addr = ...;
1742 // \endcode
1743
1744 IRBuilderBase &Builder = OMPBuilder.Builder;
1745 Type *DependInfo = OMPBuilder.DependInfo;
1746 Module &M = OMPBuilder.M;
1747
1748 Value *DepArray = nullptr;
1749 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1750 Builder.SetInsertPoint(
1751 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1752
1753 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1754 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1755
1756 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1757 Value *Base =
1758 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1759 // Store the pointer to the variable
1760 Value *Addr = Builder.CreateStructGEP(
1761 DependInfo, Base,
1762 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1763 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1764 Builder.CreateStore(DepValPtr, Addr);
1765 // Store the size of the variable
1766 Value *Size = Builder.CreateStructGEP(
1767 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1768 Builder.CreateStore(
1769 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1770 Size);
1771 // Store the dependency kind
1772 Value *Flags = Builder.CreateStructGEP(
1773 DependInfo, Base,
1774 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1775 Builder.CreateStore(
1776 ConstantInt::get(Builder.getInt8Ty(),
1777 static_cast<unsigned int>(Dep.DepKind)),
1778 Flags);
1779 }
1780 Builder.restoreIP(OldIP);
1781 return DepArray;
1782}
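// The stores above mirror the runtime's dependence descriptor; a C sketch of
// the fields populated per dependency (field names follow
// RTLDependInfoFields; the runtime's exact struct may differ):
// \code{c}
//   struct kmp_depend_info {
//     intptr_t base_addr; // ptrtoint of the dependent variable's address
//     size_t len;         // store size of the variable's type, in bytes
//     uint8_t flags;      // dependence kind (in/out/inout/...)
//   };
// \endcode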
1783
1784OpenMPIRBuilder::InsertPointTy
1785OpenMPIRBuilder::createTask(const LocationDescription &Loc,
1786 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1787 bool Tied, Value *Final, Value *IfCondition,
1788 SmallVector<DependData> Dependencies) {
1789
1790 if (!updateToLocation(Loc))
1791 return InsertPointTy();
1792
1793 uint32_t SrcLocStrSize;
1794 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1795 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1796 // The current basic block is split into four basic blocks. After outlining,
1797 // they will be mapped as follows:
1798 // ```
1799 // def current_fn() {
1800 // current_basic_block:
1801 // br label %task.exit
1802 // task.exit:
1803 // ; instructions after task
1804 // }
1805 // def outlined_fn() {
1806 // task.alloca:
1807 // br label %task.body
1808 // task.body:
1809 // ret void
1810 // }
1811 // ```
1812 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1813 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1814 BasicBlock *TaskAllocaBB =
1815 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1816
1817 InsertPointTy TaskAllocaIP =
1818 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1819 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1820 BodyGenCB(TaskAllocaIP, TaskBodyIP);
1821
1822 OutlineInfo OI;
1823 OI.EntryBB = TaskAllocaBB;
1824 OI.OuterAllocaBB = AllocaIP.getBlock();
1825 OI.ExitBB = TaskExitBB;
1826
1827 // Add the thread ID argument.
1828 SmallVector<Instruction *, 4> ToBeDeleted;
1829 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1830 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1831
1832 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1833 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
1834 // Replace the stale call instruction with an appropriate RTL function call.
1835 assert(OutlinedFn.getNumUses() == 1 &&
1836 "there must be a single user for the outlined function");
1837 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1838
1839 // HasShareds is true if any variables are captured in the outlined region,
1840 // false otherwise.
1841 bool HasShareds = StaleCI->arg_size() > 1;
1842 Builder.SetInsertPoint(StaleCI);
1843
1844 // Gather the arguments for emitting the runtime call for
1845 // @__kmpc_omp_task_alloc
1846 Function *TaskAllocFn =
1847 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
1848
1850 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) - for the runtime
1851 // call.
1851 Value *ThreadID = getOrCreateThreadID(Ident);
1852
1853 // Argument - `flags`
1854 // Task is tied iff (Flags & 1) == 1.
1855 // Task is untied iff (Flags & 1) == 0.
1856 // Task is final iff (Flags & 2) == 2.
1857 // Task is not final iff (Flags & 2) == 0.
1858 // TODO: Handle the other flags.
1859 Value *Flags = Builder.getInt32(Tied);
1860 if (Final) {
1861 Value *FinalFlag =
1862 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
1863 Flags = Builder.CreateOr(FinalFlag, Flags);
1864 }
1865
1866 // Argument - `sizeof_kmp_task_t` (TaskSize)
1867 // TaskSize refers to the size in bytes of the kmp_task_t data structure
1868 // including private vars accessed in task.
1869 // TODO: add kmp_task_t_with_privates (privates)
1870 Value *TaskSize = Builder.getInt64(
1871 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
1872
1873 // Argument - `sizeof_shareds` (SharedsSize)
1874 // SharedsSize refers to the shareds array size in the kmp_task_t data
1875 // structure.
1876 Value *SharedsSize = Builder.getInt64(0);
1877 if (HasShareds) {
1878 AllocaInst *ArgStructAlloca =
1879 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
1880 assert(ArgStructAlloca &&
1881 "Unable to find the alloca instruction corresponding to arguments "
1882 "for extracted function");
1883 StructType *ArgStructType =
1884 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
1885 assert(ArgStructType && "Unable to find struct type corresponding to "
1886 "arguments for extracted function");
1887 SharedsSize =
1888 M.getDataLayout().getTypeStoreSize(ArgStructType);
1889 }
1890 // Emit the @__kmpc_omp_task_alloc runtime call
1891 // The runtime call returns a pointer to an area where the task captured
1892 // variables must be copied before the task is run (TaskData)
1893 CallInst *TaskData = Builder.CreateCall(
1894 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
1895 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
1896 /*task_func=*/&OutlinedFn});
1897
1898 // Copy the arguments for outlined function
1899 if (HasShareds) {
1900 Value *Shareds = StaleCI->getArgOperand(1);
1901 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
1902 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
1903 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
1904 SharedsSize);
1905 }
1906
1907 Value *DepArray = nullptr;
1908 if (Dependencies.size()) {
1909 InsertPointTy OldIP = Builder.saveIP();
1910 Builder.SetInsertPoint(
1911 &OldIP.getBlock()->getParent()->getEntryBlock().back());
1912
1913 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1914 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1915
1916 unsigned P = 0;
1917 for (const DependData &Dep : Dependencies) {
1918 Value *Base =
1919 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, P);
1920 // Store the pointer to the variable
1921 Value *Addr = Builder.CreateStructGEP(
1922 DependInfo, Base,
1923 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1924 Value *DepValPtr =
1925 Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1926 Builder.CreateStore(DepValPtr, Addr);
1927 // Store the size of the variable
1928 Value *Size = Builder.CreateStructGEP(
1929 DependInfo, Base,
1930 static_cast<unsigned int>(RTLDependInfoFields::Len));
1931 Builder.CreateStore(Builder.getInt64(M.getDataLayout().getTypeStoreSize(
1932 Dep.DepValueType)),
1933 Size);
1934 // Store the dependency kind
1935 Value *Flags = Builder.CreateStructGEP(
1936 DependInfo, Base,
1937 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1938 Builder.CreateStore(
1939 ConstantInt::get(Builder.getInt8Ty(),
1940 static_cast<unsigned int>(Dep.DepKind)),
1941 Flags);
1942 ++P;
1943 }
1944
1945 Builder.restoreIP(OldIP);
1946 }
1947
1948 // In the presence of the `if` clause, the following IR is generated:
1949 // ...
1950 // %data = call @__kmpc_omp_task_alloc(...)
1951 // br i1 %if_condition, label %then, label %else
1952 // then:
1953 // call @__kmpc_omp_task(...)
1954 // br label %exit
1955 // else:
1956 // ;; Wait for resolution of dependencies, if any, before
1957 // ;; beginning the task
1958 // call @__kmpc_omp_wait_deps(...)
1959 // call @__kmpc_omp_task_begin_if0(...)
1960 // call @outlined_fn(...)
1961 // call @__kmpc_omp_task_complete_if0(...)
1962 // br label %exit
1963 // exit:
1964 // ...
1965 if (IfCondition) {
1966 // `SplitBlockAndInsertIfThenElse` requires the block to have a
1967 // terminator.
1968 splitBB(Builder, /*CreateBranch=*/true, "if.end");
1969 Instruction *IfTerminator =
1970 Builder.GetInsertPoint()->getParent()->getTerminator();
1971 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
1972 Builder.SetInsertPoint(IfTerminator);
1973 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
1974 &ElseTI);
1975 Builder.SetInsertPoint(ElseTI);
1976
1977 if (Dependencies.size()) {
1978 Function *TaskWaitFn =
1979 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
1980 Builder.CreateCall(
1981 TaskWaitFn,
1982 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
1983 ConstantInt::get(Builder.getInt32Ty(), 0),
1984 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
1985 }
1986 Function *TaskBeginFn =
1987 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
1988 Function *TaskCompleteFn =
1989 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
1990 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
1991 CallInst *CI = nullptr;
1992 if (HasShareds)
1993 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
1994 else
1995 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
1996 CI->setDebugLoc(StaleCI->getDebugLoc());
1997 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
1998 Builder.SetInsertPoint(ThenTI);
1999 }
2000
2001 if (Dependencies.size()) {
2002 Function *TaskFn =
2003 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2004 Builder.CreateCall(
2005 TaskFn,
2006 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2007 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2008 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2009
2010 } else {
2011 // Emit the @__kmpc_omp_task runtime call to spawn the task
2012 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2013 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2014 }
2015
2016 StaleCI->eraseFromParent();
2017
2018 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2019 if (HasShareds) {
2020 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2021 OutlinedFn.getArg(1)->replaceUsesWithIf(
2022 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2023 }
2024
2025 for (Instruction *I : llvm::reverse(ToBeDeleted))
2026 I->eraseFromParent();
2027 };
2028
2029 addOutlineInfo(std::move(OI));
2030 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2031
2032 return Builder.saveIP();
2033}
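// Illustrative use (not upstream code): a frontend lowering `#pragma omp
// task` only supplies the body; outlining and the runtime calls above are
// handled here. Names below are hypothetical.
// \code{c}
//   auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the task body ...
//   };
//   Builder.restoreIP(OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB,
//                                           /*Tied=*/true, /*Final=*/nullptr,
//                                           /*IfCondition=*/nullptr,
//                                           /*Dependencies=*/{}));
// \endcode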
2034
2035OpenMPIRBuilder::InsertPointTy
2036OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2037 InsertPointTy AllocaIP,
2038 BodyGenCallbackTy BodyGenCB) {
2039 if (!updateToLocation(Loc))
2040 return InsertPointTy();
2041
2042 uint32_t SrcLocStrSize;
2043 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2044 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2045 Value *ThreadID = getOrCreateThreadID(Ident);
2046
2047 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2048 Function *TaskgroupFn =
2049 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2050 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2051
2052 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2053 BodyGenCB(AllocaIP, Builder.saveIP());
2054
2055 Builder.SetInsertPoint(TaskgroupExitBB);
2056 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2057 Function *EndTaskgroupFn =
2058 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2059 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2060
2061 return Builder.saveIP();
2062}
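// The net effect is a pair of runtime calls bracketing the body; assuming
// ident @0 and thread id %gtid:
// \code{c}
//   call void @__kmpc_taskgroup(ptr @0, i32 %gtid)
//   ; ... taskgroup body, typically spawning tasks ...
//   call void @__kmpc_end_taskgroup(ptr @0, i32 %gtid)
// \endcode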
2063
2064OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSections(
2065 const LocationDescription &Loc, InsertPointTy AllocaIP,
2066 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2067 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2068 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2069
2070 if (!updateToLocation(Loc))
2071 return Loc.IP;
2072
2073 auto FiniCBWrapper = [&](InsertPointTy IP) {
2074 if (IP.getBlock()->end() != IP.getPoint())
2075 return FiniCB(IP);
2076 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2077 // will fail because that function requires the Finalization Basic Block to
2078 // have a terminator, which is already removed by EmitOMPRegionBody.
2079 // IP is currently at the cancellation block.
2080 // We need to backtrack to the condition block to fetch
2081 // the exit block and create a branch from the cancellation
2082 // block to the exit block.
2083 IRBuilder<>::InsertPointGuard IPG(Builder);
2084 Builder.restoreIP(IP);
2085 auto *CaseBB = IP.getBlock()->getSinglePredecessor();
2086 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2087 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2088 Instruction *I = Builder.CreateBr(ExitBB);
2089 IP = InsertPointTy(I->getParent(), I->getIterator());
2090 return FiniCB(IP);
2091 };
2092
2093 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2094
2095 // Each section is emitted as a switch case
2096 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2097 // -> OMP.createSection() which generates the IR for each section
2098 // Iterate through all sections and emit a switch construct:
2099 // switch (IV) {
2100 // case 0:
2101 // <SectionStmt[0]>;
2102 // break;
2103 // ...
2104 // case <NumSection> - 1:
2105 // <SectionStmt[<NumSection> - 1]>;
2106 // break;
2107 // }
2108 // ...
2109 // section_loop.after:
2110 // <FiniCB>;
2111 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) {
2112 Builder.restoreIP(CodeGenIP);
2113 BasicBlock *Continue =
2114 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2115 Function *CurFn = Continue->getParent();
2116 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2117
2118 unsigned CaseNumber = 0;
2119 for (auto SectionCB : SectionCBs) {
2120 BasicBlock *CaseBB = BasicBlock::Create(
2121 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2122 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2123 Builder.SetInsertPoint(CaseBB);
2124 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2125 SectionCB(InsertPointTy(),
2126 {CaseEndBr->getParent(), CaseEndBr->getIterator()});
2127 CaseNumber++;
2128 }
2129 // remove the existing terminator from body BB since there can be no
2130 // terminators after switch/case
2131 };
2132 // Loop body ends here
2133 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2134 Type *I32Ty = Type::getInt32Ty(M.getContext());
2135 Value *LB = ConstantInt::get(I32Ty, 0);
2136 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2137 Value *ST = ConstantInt::get(I32Ty, 1);
2138 CanonicalLoopInfo *LoopInfo = createCanonicalLoop(
2139 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2140 InsertPointTy AfterIP =
2141 applyStaticWorkshareLoop(Loc.DL, LoopInfo, AllocaIP, !IsNowait);
2142
2143 // Apply the finalization callback in LoopAfterBB
2144 auto FiniInfo = FinalizationStack.pop_back_val();
2145 assert(FiniInfo.DK == OMPD_sections &&
2146 "Unexpected finalization stack state!");
2147 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2148 Builder.restoreIP(AfterIP);
2149 BasicBlock *FiniBB =
2150 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2151 CB(Builder.saveIP());
2152 AfterIP = {FiniBB, FiniBB->begin()};
2153 }
2154
2155 return AfterIP;
2156}
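// Illustrative use (not upstream code): each `#pragma omp section` becomes
// one callback in SectionCBs, and the loop body above dispatches on the
// induction variable. A hypothetical two-section setup:
// \code{c}
//   SmallVector<OpenMPIRBuilder::StorableBodyGenCallbackTy, 2> SectionCBs;
//   SectionCBs.push_back([&](InsertPointTy, InsertPointTy CodeGenIP) {
//     Builder.restoreIP(CodeGenIP); // ... emit section 0 ...
//   });
//   SectionCBs.push_back([&](InsertPointTy, InsertPointTy CodeGenIP) {
//     Builder.restoreIP(CodeGenIP); // ... emit section 1 ...
//   });
// \endcode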
2157
2158OpenMPIRBuilder::InsertPointTy
2159OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2160 BodyGenCallbackTy BodyGenCB,
2161 FinalizeCallbackTy FiniCB) {
2162 if (!updateToLocation(Loc))
2163 return Loc.IP;
2164
2165 auto FiniCBWrapper = [&](InsertPointTy IP) {
2166 if (IP.getBlock()->end() != IP.getPoint())
2167 return FiniCB(IP);
2168 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2169 // will fail because that function requires the Finalization Basic Block to
2170 // have a terminator, which is already removed by EmitOMPRegionBody.
2171 // IP is currently at the cancellation block.
2172 // We need to backtrack to the condition block to fetch
2173 // the exit block and create a branch from the cancellation
2174 // block to the exit block.
2175 IRBuilder<>::InsertPointGuard IPG(Builder);
2176 Builder.restoreIP(IP);
2177 auto *CaseBB = Loc.IP.getBlock();
2178 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2179 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2180 Instruction *I = Builder.CreateBr(ExitBB);
2181 IP = InsertPointTy(I->getParent(), I->getIterator());
2182 return FiniCB(IP);
2183 };
2184
2185 Directive OMPD = Directive::OMPD_sections;
2186 // Since we are using Finalization Callback here, HasFinalize
2187 // and IsCancellable have to be true
2188 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2189 /*Conditional*/ false, /*hasFinalize*/ true,
2190 /*IsCancellable*/ true);
2191}
2192
2193static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2194 BasicBlock::iterator IT(I);
2195 IT++;
2196 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2197}
2198
2199void OpenMPIRBuilder::emitUsed(StringRef Name,
2200 std::vector<WeakTrackingVH> &List) {
2201 if (List.empty())
2202 return;
2203
2204 // Convert List to what ConstantArray needs.
2205 SmallVector<Constant *, 8> UsedArray;
2206 UsedArray.resize(List.size());
2207 for (unsigned I = 0, E = List.size(); I != E; ++I)
2208 UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
2209 cast<Constant>(&*List[I]), Builder.getPtrTy());
2210
2211 if (UsedArray.empty())
2212 return;
2213 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
2214
2215 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
2216 ConstantArray::get(ATy, UsedArray), Name);
2217
2218 GV->setSection("llvm.metadata");
2219}
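// For example, with Name == "llvm.compiler.used" and two symbols in List,
// the resulting global is (sketch; symbol names hypothetical):
// \code{c}
//   @llvm.compiler.used = appending global [2 x ptr] [ptr @a, ptr @b],
//                         section "llvm.metadata"
// \endcode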
2220
2221Value *OpenMPIRBuilder::getGPUThreadID() {
2222 return Builder.CreateCall(
2223 getOrCreateRuntimeFunction(M,
2224 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2225 {});
2226}
2227
2228Value *OpenMPIRBuilder::getGPUWarpSize() {
2229 return Builder.CreateCall(
2230 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2231}
2232
2233Value *OpenMPIRBuilder::getNVPTXWarpID() {
2234 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2235 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2236}
2237
2238Value *OpenMPIRBuilder::getNVPTXLaneID() {
2239 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2240 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2241 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2242 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2243 "nvptx_lane_id");
2244}
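// With the default warp size of 32, LaneIDBits == 5 and LaneIDMask == 0x1f,
// so e.g. GPU thread 77 maps to warp 77 >> 5 == 2 and lane 77 & 0x1f == 13:
// \code{c}
//   unsigned WarpID = ThreadID >> 5;   // 77 -> 2
//   unsigned LaneID = ThreadID & 0x1f; // 77 -> 13
// \endcode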
2245
2246Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2247 Type *ToType) {
2248 Type *FromType = From->getType();
2249 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2250 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2251 assert(FromSize > 0 && "From size must be greater than zero");
2252 assert(ToSize > 0 && "To size must be greater than zero");
2253 if (FromType == ToType)
2254 return From;
2255 if (FromSize == ToSize)
2256 return Builder.CreateBitCast(From, ToType);
2257 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2258 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2259 InsertPointTy SaveIP = Builder.saveIP();
2260 Builder.restoreIP(AllocaIP);
2261 Value *CastItem = Builder.CreateAlloca(ToType);
2262 Builder.restoreIP(SaveIP);
2263
2264 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2265 CastItem, FromType->getPointerTo());
2266 Builder.CreateStore(From, ValCastItem);
2267 return Builder.CreateLoad(ToType, CastItem);
2268}
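// A sketch of the final fall-through path: converting a float to an i64
// (different store sizes, not both integers) round-trips through a stack
// slot:
// \code{c}
//   %cast.item = alloca i64
//   store float %from, ptr %cast.item ; reinterpret through memory
//   %res = load i64, ptr %cast.item
// \endcode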
2269
2270Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2271 Value *Element,
2272 Type *ElementType,
2273 Value *Offset) {
2274 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2275 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2276
2277 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2278 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2279 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2280 Value *WarpSize =
2281 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2282 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2283 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2284 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2285 Value *WarpSizeCast =
2286 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2287 Value *ShuffleCall =
2288 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2289 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2290}
2291
2292void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2293 Value *DstAddr, Type *ElemType,
2294 Value *Offset, Type *ReductionArrayTy) {
2295 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2296 // Create the loop over the big sized data.
2297 // ptr = (void*)Elem;
2298 // ptrEnd = (void*) Elem + 1;
2299 // Step = 8;
2300 // while (ptr + Step < ptrEnd)
2301 // shuffle((int64_t)*ptr);
2302 // Step = 4;
2303 // while (ptr + Step < ptrEnd)
2304 // shuffle((int32_t)*ptr);
2305 // ...
2306 Type *IndexTy = Builder.getIndexTy(
2307 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2308 Value *ElemPtr = DstAddr;
2309 Value *Ptr = SrcAddr;
2310 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2311 if (Size < IntSize)
2312 continue;
2313 Type *IntType = Builder.getIntNTy(IntSize * 8);
2314 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2315 Ptr, IntType->getPointerTo(), Ptr->getName() + ".ascast");
2316 Value *SrcAddrGEP =
2317 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2318 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2319 ElemPtr, IntType->getPointerTo(), ElemPtr->getName() + ".ascast");
2320
2321 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2322 if ((Size / IntSize) > 1) {
2323 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2324 SrcAddrGEP, Builder.getPtrTy());
2325 BasicBlock *PreCondBB =
2326 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2327 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2328 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2329 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2330 emitBlock(PreCondBB, CurFunc);
2331 PHINode *PhiSrc =
2332 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2333 PhiSrc->addIncoming(Ptr, CurrentBB);
2334 PHINode *PhiDest =
2335 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2336 PhiDest->addIncoming(ElemPtr, CurrentBB);
2337 Ptr = PhiSrc;
2338 ElemPtr = PhiDest;
2339 Value *PtrDiff = Builder.CreatePtrDiff(
2340 Builder.getInt8Ty(), PtrEnd,
2341 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2342 Builder.CreateCondBr(
2343 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2344 ExitBB);
2345 emitBlock(ThenBB, CurFunc);
2346 Value *Res = createRuntimeShuffleFunction(
2347 AllocaIP,
2348 Builder.CreateAlignedLoad(
2349 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2350 IntType, Offset);
2351 Builder.CreateAlignedStore(Res, ElemPtr,
2352 M.getDataLayout().getPrefTypeAlign(ElemType));
2353 Value *LocalPtr =
2354 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2355 Value *LocalElemPtr =
2356 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2357 PhiSrc->addIncoming(LocalPtr, ThenBB);
2358 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2359 emitBranch(PreCondBB);
2360 emitBlock(ExitBB, CurFunc);
2361 } else {
2362 Value *Res = createRuntimeShuffleFunction(
2363 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2364 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2365 Res->getType()->getScalarSizeInBits())
2366 Res = Builder.CreateTrunc(Res, ElemType);
2367 Builder.CreateStore(Res, ElemPtr);
2368 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2369 ElemPtr =
2370 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2371 }
2372 Size = Size % IntSize;
2373 }
2374}
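// For example, a 12-byte element is moved in two steps: at IntSize == 8 one
// __kmpc_shuffle_int64 transfers the first 8 bytes (Size / 8 == 1, so no
// loop is needed), leaving Size == 12 % 8 == 4; at IntSize == 4 a single
// __kmpc_shuffle_int32 then transfers the remainder.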
2375
2376void OpenMPIRBuilder::emitReductionListCopy(
2377 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2378 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2379 CopyOptionsTy CopyOptions) {
2380 Type *IndexTy = Builder.getIndexTy(
2381 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2382 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2383
2384 // Iterates, element by element, through the source Reduce list and
2385 // makes a copy.
2386 for (auto En : enumerate(ReductionInfos)) {
2387 const ReductionInfo &RI = En.value();
2388 Value *SrcElementAddr = nullptr;
2389 Value *DestElementAddr = nullptr;
2390 Value *DestElementPtrAddr = nullptr;
2391 // Should we shuffle in an element from a remote lane?
2392 bool ShuffleInElement = false;
2393 // Set to true to update the pointer in the dest Reduce list to a
2394 // newly created element.
2395 bool UpdateDestListPtr = false;
2396
2397 // Step 1.1: Get the address for the src element in the Reduce list.
2398 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2399 ReductionArrayTy, SrcBase,
2400 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2401 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2402
2403 // Step 1.2: Create a temporary to store the element in the destination
2404 // Reduce list.
2405 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2406 ReductionArrayTy, DestBase,
2407 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2408 switch (Action) {
2409 case CopyAction::RemoteLaneToThread: {
2410 InsertPointTy CurIP = Builder.saveIP();
2411 Builder.restoreIP(AllocaIP);
2412 AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
2413 ".omp.reduction.element");
2414 DestAlloca->setAlignment(
2415 M.getDataLayout().getPrefTypeAlign(RI.ElementType));
2416 DestElementAddr = DestAlloca;
2417 DestElementAddr =
2418 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2419 DestElementAddr->getName() + ".ascast");
2420 Builder.restoreIP(CurIP);
2421 ShuffleInElement = true;
2422 UpdateDestListPtr = true;
2423 break;
2424 }
2425 case CopyAction::ThreadCopy: {
2426 DestElementAddr =
2427 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2428 break;
2429 }
2430 }
2431
2432 // Now that all active lanes have read the element in the
2433 // Reduce list, shuffle over the value from the remote lane.
2434 if (ShuffleInElement) {
2435 shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
2436 RemoteLaneOffset, ReductionArrayTy);
2437 } else {
2438 switch (RI.EvaluationKind) {
2439 case EvalKind::Scalar: {
2440 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2441 // Store the source element value to the dest element address.
2442 Builder.CreateStore(Elem, DestElementAddr);
2443 break;
2444 }
2445 case EvalKind::Complex: {
2446 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2447 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2448 Value *SrcReal = Builder.CreateLoad(
2449 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2450 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2451 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2452 Value *SrcImg = Builder.CreateLoad(
2453 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2454
2455 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2456 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2457 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2458 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2459 Builder.CreateStore(SrcReal, DestRealPtr);
2460 Builder.CreateStore(SrcImg, DestImgPtr);
2461 break;
2462 }
2463 case EvalKind::Aggregate: {
2464 Value *SizeVal = Builder.getInt64(
2465 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2466 Builder.CreateMemCpy(
2467 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2468 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2469 SizeVal, false);
2470 break;
2471 }
2472 };
2473 }
2474
2475 // Step 3.1: Modify reference in dest Reduce list as needed.
2476 // Modifying the reference in Reduce list to point to the newly
2477 // created element. The element is live in the current function
2478 // scope and that of functions it invokes (i.e., reduce_function).
2479 // RemoteReduceData[i] = (void*)&RemoteElem
2480 if (UpdateDestListPtr) {
2481 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2482 DestElementAddr, Builder.getPtrTy(),
2483 DestElementAddr->getName() + ".ascast");
2484 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2485 }
2486 }
2487}
2488
2489Function *OpenMPIRBuilder::emitInterWarpCopyFunction(
2490 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2491 AttributeList FuncAttrs) {
2492 InsertPointTy SavedIP = Builder.saveIP();
2493 LLVMContext &Ctx = M.getContext();
2494 FunctionType *FuncTy = FunctionType::get(
2495 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2496 /* IsVarArg */ false);
2497 Function *WcFunc =
2499 "_omp_reduction_inter_warp_copy_func", &M);
2500 WcFunc->setAttributes(FuncAttrs);
2501 WcFunc->addParamAttr(0, Attribute::NoUndef);
2502 WcFunc->addParamAttr(1, Attribute::NoUndef);
2503 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2504 Builder.SetInsertPoint(EntryBB);
2505
2506 // ReduceList: thread local Reduce list.
2507 // At the stage of the computation when this function is called, partially
2508 // aggregated values reside in the first lane of every active warp.
2509 Argument *ReduceListArg = WcFunc->getArg(0);
2510 // NumWarps: number of warps active in the parallel region. This could
2511 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2512 Argument *NumWarpsArg = WcFunc->getArg(1);
2513
2514 // This array is used as a medium to transfer, one reduce element at a time,
2515 // the data from the first lane of every warp to lanes in the first warp
2516 // in order to perform the final step of a reduction in a parallel region
2517 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2518 // for reduced latency, as well as to have a distinct copy for concurrently
2519 // executing target regions. The array is declared with weak linkage so
2520 // as to be shared across compilation units.
2521 StringRef TransferMediumName =
2522 "__openmp_nvptx_data_transfer_temporary_storage";
2523 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2524 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2525 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2526 if (!TransferMedium) {
2527 TransferMedium = new GlobalVariable(
2528 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2529 UndefValue::get(ArrayTy), TransferMediumName,
2530 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2531 /*AddressSpace=*/3);
2532 }
2533
2534 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2535 Value *GPUThreadID = getGPUThreadID();
2536 // nvptx_lane_id = nvptx_id % warpsize
2537 Value *LaneID = getNVPTXLaneID();
2538 // nvptx_warp_id = nvptx_id / warpsize
2539 Value *WarpID = getNVPTXWarpID();
2540
2541 InsertPointTy AllocaIP =
2542 InsertPointTy(Builder.GetInsertBlock(),
2543 Builder.GetInsertBlock()->getFirstInsertionPt());
2544 Type *Arg0Type = ReduceListArg->getType();
2545 Type *Arg1Type = NumWarpsArg->getType();
2546 Builder.restoreIP(AllocaIP);
2547 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2548 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2549 AllocaInst *NumWarpsAlloca =
2550 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2551 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2552 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2553 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2554 NumWarpsAlloca, Arg1Type->getPointerTo(),
2555 NumWarpsAlloca->getName() + ".ascast");
2556 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2557 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2558 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2559 InsertPointTy CodeGenIP =
2560 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2561 Builder.restoreIP(CodeGenIP);
2562
2563 Value *ReduceList =
2564 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2565
2566 for (auto En : enumerate(ReductionInfos)) {
2567 //
2568 // Warp master copies reduce element to transfer medium in __shared__
2569 // memory.
2570 //
2571 const ReductionInfo &RI = En.value();
2572 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2573 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2574 Type *CType = Builder.getIntNTy(TySize * 8);
2575
2576 unsigned NumIters = RealTySize / TySize;
2577 if (NumIters == 0)
2578 continue;
2579 Value *Cnt = nullptr;
2580 Value *CntAddr = nullptr;
2581 BasicBlock *PrecondBB = nullptr;
2582 BasicBlock *ExitBB = nullptr;
2583 if (NumIters > 1) {
2584 CodeGenIP = Builder.saveIP();
2585 Builder.restoreIP(AllocaIP);
2586 CntAddr =
2587 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2588
2589 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2590 CntAddr->getName() + ".ascast");
2591 Builder.restoreIP(CodeGenIP);
2592 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2593 CntAddr,
2594 /*Volatile=*/false);
2595 PrecondBB = BasicBlock::Create(Ctx, "precond");
2596 ExitBB = BasicBlock::Create(Ctx, "exit");
2597 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2598 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2599 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2600 /*Volatile=*/false);
2601 Value *Cmp = Builder.CreateICmpULT(
2602 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2603 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2604 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2605 }
2606
2607 // kmpc_barrier.
2608 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2609 omp::Directive::OMPD_unknown,
2610 /* ForceSimpleCall */ false,
2611 /* CheckCancelFlag */ true);
2612 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2613 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2614 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2615
2616 // if (lane_id == 0)
2617 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2618 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2619 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2620
2621 // Reduce element = LocalReduceList[i]
2622 auto *RedListArrayTy =
2623 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2624 Type *IndexTy = Builder.getIndexTy(
2626 Value *ElemPtrPtr =
2627 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2628 {ConstantInt::get(IndexTy, 0),
2629 ConstantInt::get(IndexTy, En.index())});
2630 // elemptr = ((CopyType*)(elemptrptr)) + I
2631 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2632 if (NumIters > 1)
2633 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2634
2635 // Get pointer to location in transfer medium.
2636 // MediumPtr = &medium[warp_id]
2637 Value *MediumPtr = Builder.CreateInBoundsGEP(
2638 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2639 // elem = *elemptr
2640 //*MediumPtr = elem
2641 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2642 // Store the source element value to the dest element address.
2643 Builder.CreateStore(Elem, MediumPtr,
2644 /*IsVolatile*/ true);
2645 Builder.CreateBr(MergeBB);
2646
2647 // else
2648 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2649 Builder.CreateBr(MergeBB);
2650
2651 // endif
2652 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2653 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2654 omp::Directive::OMPD_unknown,
2655 /* ForceSimpleCall */ false,
2656 /* CheckCancelFlag */ true);
2657
2658 // Warp 0 copies reduce element from transfer medium
2659 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2660 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2661 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2662
2663 Value *NumWarpsVal =
2664 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2665 // Up to 32 threads in warp 0 are active.
2666 Value *IsActiveThread =
2667 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2668 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2669
2670 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2671
2672 // SrcMediumPtr = &medium[tid]
2673 // SrcMediumVal = *SrcMediumPtr
2674 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2675 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2676 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2677 Value *TargetElemPtrPtr =
2678 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2679 {ConstantInt::get(IndexTy, 0),
2680 ConstantInt::get(IndexTy, En.index())});
2681 Value *TargetElemPtrVal =
2682 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2683 Value *TargetElemPtr = TargetElemPtrVal;
2684 if (NumIters > 1)
2685 TargetElemPtr =
2686 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2687
2688 // *TargetElemPtr = SrcMediumVal;
2689 Value *SrcMediumValue =
2690 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2691 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2692 Builder.CreateBr(W0MergeBB);
2693
2694 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2695 Builder.CreateBr(W0MergeBB);
2696
2697 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2698
2699 if (NumIters > 1) {
2700 Cnt = Builder.CreateNSWAdd(
2701 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2702 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2703
2704 auto *CurFn = Builder.GetInsertBlock()->getParent();
2705 emitBranch(PrecondBB);
2706 emitBlock(ExitBB, CurFn);
2707 }
2708 RealTySize %= TySize;
2709 }
2710 }
2711
2712 Builder.CreateRetVoid();
2713 Builder.restoreIP(SavedIP);
2714
2715 return WcFunc;
2716}
2717
2718Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2719 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2720 AttributeList FuncAttrs) {
2721 LLVMContext &Ctx = M.getContext();
2722 FunctionType *FuncTy =
2723 FunctionType::get(Builder.getVoidTy(),
2724 {Builder.getPtrTy(), Builder.getInt16Ty(),
2725 Builder.getInt16Ty(), Builder.getInt16Ty()},
2726 /* IsVarArg */ false);
2727 Function *SarFunc =
2729 "_omp_reduction_shuffle_and_reduce_func", &M);
2730 SarFunc->setAttributes(FuncAttrs);
2731 SarFunc->addParamAttr(0, Attribute::NoUndef);
2732 SarFunc->addParamAttr(1, Attribute::NoUndef);
2733 SarFunc->addParamAttr(2, Attribute::NoUndef);
2734 SarFunc->addParamAttr(3, Attribute::NoUndef);
2735 SarFunc->addParamAttr(1, Attribute::SExt);
2736 SarFunc->addParamAttr(2, Attribute::SExt);
2737 SarFunc->addParamAttr(3, Attribute::SExt);
2738 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2739 Builder.SetInsertPoint(EntryBB);
2740
2741 // Thread local Reduce list used to host the values of data to be reduced.
2742 Argument *ReduceListArg = SarFunc->getArg(0);
2743 // Current lane id; could be logical.
2744 Argument *LaneIDArg = SarFunc->getArg(1);
2745 // Offset of the remote source lane relative to the current lane.
2746 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2747 // Algorithm version. This is expected to be known at compile time.
2748 Argument *AlgoVerArg = SarFunc->getArg(3);
2749
2750 Type *ReduceListArgType = ReduceListArg->getType();
2751 Type *LaneIDArgType = LaneIDArg->getType();
2752 Type *LaneIDArgPtrType = LaneIDArg->getType()->getPointerTo();
2753 Value *ReduceListAlloca = Builder.CreateAlloca(
2754 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2755 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2756 LaneIDArg->getName() + ".addr");
2757 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2758 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2759 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2760 AlgoVerArg->getName() + ".addr");
2761 ArrayType *RedListArrayTy =
2762 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2763
2764 // Create a local thread-private variable to host the Reduce list
2765 // from a remote lane.
2766 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2767 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2768
2769 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2770 ReduceListAlloca, ReduceListArgType,
2771 ReduceListAlloca->getName() + ".ascast");
2772 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2773 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2774 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2775 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2776 RemoteLaneOffsetAlloca->getName() + ".ascast");
2777 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2778 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2779 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2780 RemoteReductionListAlloca, Builder.getPtrTy(),
2781 RemoteReductionListAlloca->getName() + ".ascast");
2782
2783 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2784 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2785 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2786 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2787
2788 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2789 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2790 Value *RemoteLaneOffset =
2791 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2792 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2793
2794 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2795
2796 // This loop iterates through the list of reduce elements and copies,
2797 // element by element, from a remote lane in the warp to RemoteReduceList,
2798 // hosted on the thread's stack.
2799 emitReductionListCopy(
2800 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2801 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2802
2803 // The actions to be performed on the Remote Reduce list depend
2804 // on the algorithm version.
2805 //
2806 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2807 // LaneId % 2 == 0 && Offset > 0):
2808 // do the reduction value aggregation
2809 //
2810 // The thread local variable Reduce list is mutated in place to host the
2811 // reduced data, which is the aggregated value produced from local and
2812 // remote lanes.
2813 //
2814 // Note that AlgoVer is expected to be a constant integer known at compile
2815 // time.
2816 // When AlgoVer==0, the first conjunction evaluates to true, making
2817 // the entire predicate true during compile time.
2818 // When AlgoVer==1, the second conjunction has only the second part to be
2819 // evaluated during runtime. The other conjunctions evaluate to false
2820 // during compile time.
2821 // When AlgoVer==2, the third conjunction has only the second part to be
2822 // evaluated during runtime. The other conjunctions evaluate to false
2823 // during compile time.
2824 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2825 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2826 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2827 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2828 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2829 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2830 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2831 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2832 Value *RemoteOffsetComp =
2833 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2834 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
2835 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
2836 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
2837
2838 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2839 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2840 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2841
2842 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
2843 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2844 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2845 ReduceList, Builder.getPtrTy());
2846 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2847 RemoteListAddrCast, Builder.getPtrTy());
2848 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
2849 ->addFnAttr(Attribute::NoUnwind);
2850 Builder.CreateBr(MergeBB);
2851
2852 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2853 Builder.CreateBr(MergeBB);
2854
2855 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2856
2857 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2858 // Reduce list.
2859 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2860 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
2861 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
2862
2863 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
2864 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
2865 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
2866 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2867
2868 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
2869 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
2870 ReductionInfos, RemoteListAddrCast, ReduceList);
2871 Builder.CreateBr(CpyMergeBB);
2872
2873 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
2874 Builder.CreateBr(CpyMergeBB);
2875
2876 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
2877
2878 Builder.CreateRetVoid();
2879
2880 return SarFunc;
2881}
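// Concretely, for a reduction step with RemoteLaneOffset == 8 over 16 lanes:
// AlgoVer 0 makes every lane reduce (the first conjunct is always true);
// AlgoVer 1 makes lanes 0..7 reduce (LaneId < 8) while lanes 8..15 instead
// copy the remote list over their own (LaneId >= 8); AlgoVer 2 makes the
// even lanes reduce as long as the offset is positive.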
2882
2883Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
2885 AttributeList FuncAttrs) {
2886 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2887 LLVMContext &Ctx = M.getContext();
2888 auto *FuncTy = FunctionType::get(
2889 Builder.getVoidTy(),
2890 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
2891 /* IsVarArg */ false);
2892 Function *LtGCFunc =
2894 "_omp_reduction_list_to_global_copy_func", &M);
2895 LtGCFunc->setAttributes(FuncAttrs);
2896 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
2897 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
2898 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
2899
2900 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
2901 Builder.SetInsertPoint(EntryBlock);
2902
2903 // Buffer: global reduction buffer.
2904 Argument *BufferArg = LtGCFunc->getArg(0);
2905 // Idx: index of the buffer.
2906 Argument *IdxArg = LtGCFunc->getArg(1);
2907 // ReduceList: thread local Reduce list.
2908 Argument *ReduceListArg = LtGCFunc->getArg(2);
2909
2910 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
2911 BufferArg->getName() + ".addr");
2912 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
2913 IdxArg->getName() + ".addr");
2914 Value *ReduceListArgAlloca = Builder.CreateAlloca(
2915 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
2916 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 BufferArgAlloca, Builder.getPtrTy(),
2918 BufferArgAlloca->getName() + ".ascast");
2919 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2920 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
2921 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2922 ReduceListArgAlloca, Builder.getPtrTy(),
2923 ReduceListArgAlloca->getName() + ".ascast");
2924
2925 Builder.CreateStore(BufferArg, BufferArgAddrCast);
2926 Builder.CreateStore(IdxArg, IdxArgAddrCast);
2927 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
2928
2929 Value *LocalReduceList =
2930 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
2931 Value *BufferArgVal =
2932 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
2933 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
2934 Type *IndexTy = Builder.getIndexTy(
2935 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2936 for (auto En : enumerate(ReductionInfos)) {
2937 const ReductionInfo &RI = En.value();
2938 auto *RedListArrayTy =
2939 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2940 // Reduce element = LocalReduceList[i]
2941 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
2942 RedListArrayTy, LocalReduceList,
2943 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2944 // elemptr = ((CopyType*)(elemptrptr)) + I
2945 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2946
2947 // Global = Buffer.VD[Idx];
2948 Value *BufferVD =
2949 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
2950 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
2951 ReductionsBufferTy, BufferVD, 0, En.index());
2952
2953 switch (RI.EvaluationKind) {
2954 case EvalKind::Scalar: {
2955 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
2956 Builder.CreateStore(TargetElement, GlobVal);
2957 break;
2958 }
2959 case EvalKind::Complex: {
2960 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2961 RI.ElementType, ElemPtr, 0, 0, ".realp");
2962 Value *SrcReal = Builder.CreateLoad(
2963 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2964 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2965 RI.ElementType, ElemPtr, 0, 1, ".imagp");
2966 Value *SrcImg = Builder.CreateLoad(
2967 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2968
2969 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2970 RI.ElementType, GlobVal, 0, 0, ".realp");
2971 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2972 RI.ElementType, GlobVal, 0, 1, ".imagp");
2973 Builder.CreateStore(SrcReal, DestRealPtr);
2974 Builder.CreateStore(SrcImg, DestImgPtr);
2975 break;
2976 }
2977 case EvalKind::Aggregate: {
2978 Value *SizeVal =
2979 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
2980 Builder.CreateMemCpy(
2981 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
2982 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
2983 break;
2984 }
2985 }
2986 }
2987
2988 Builder.CreateRetVoid();
2989 Builder.restoreIP(OldIP);
2990 return LtGCFunc;
2991}
2992
2993Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
2994 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2995 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
2996 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2997 LLVMContext &Ctx = M.getContext();
2998 auto *FuncTy = FunctionType::get(
2999 Builder.getVoidTy(),
3000 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3001 /* IsVarArg */ false);
3002 Function *LtGRFunc =
3004 "_omp_reduction_list_to_global_reduce_func", &M);
3005 LtGRFunc->setAttributes(FuncAttrs);
3006 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3007 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3008 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3009
3010 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3011 Builder.SetInsertPoint(EntryBlock);
3012
3013 // Buffer: global reduction buffer.
3014 Argument *BufferArg = LtGRFunc->getArg(0);
3015 // Idx: index of the buffer.
3016 Argument *IdxArg = LtGRFunc->getArg(1);
3017 // ReduceList: thread local Reduce list.
3018 Argument *ReduceListArg = LtGRFunc->getArg(2);
3019
3020 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3021 BufferArg->getName() + ".addr");
3022 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3023 IdxArg->getName() + ".addr");
3024 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3025 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3026 auto *RedListArrayTy =
3027 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3028
3029 // 1. Build a list of reduction variables.
3030 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3031 Value *LocalReduceList =
3032 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3033
3034 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3035 BufferArgAlloca, Builder.getPtrTy(),
3036 BufferArgAlloca->getName() + ".ascast");
3037 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3038 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3039 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3040 ReduceListArgAlloca, Builder.getPtrTy(),
3041 ReduceListArgAlloca->getName() + ".ascast");
3042 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3043 LocalReduceList, Builder.getPtrTy(),
3044 LocalReduceList->getName() + ".ascast");
3045
3046 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3047 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3048 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3049
3050 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3051 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3052 Type *IndexTy = Builder.getIndexTy(
3053 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3054 for (auto En : enumerate(ReductionInfos)) {
3055 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3056 RedListArrayTy, LocalReduceListAddrCast,
3057 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3058 Value *BufferVD =
3059 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3060 // Global = Buffer.VD[Idx];
3061 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3062 ReductionsBufferTy, BufferVD, 0, En.index());
3063 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3064 }
3065
3066 // Call reduce_function(GlobalReduceList, ReduceList)
3067 Value *ReduceList =
3068 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3069 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3070 ->addFnAttr(Attribute::NoUnwind);
3071 Builder.CreateRetVoid();
3072 Builder.restoreIP(OldIP);
3073 return LtGRFunc;
3074}
3075
3076Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3077 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3078 AttributeList FuncAttrs) {
3079 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3080 LLVMContext &Ctx = M.getContext();
3081 auto *FuncTy = FunctionType::get(
3082 Builder.getVoidTy(),
3083 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3084 /* IsVarArg */ false);
3085 Function *LtGCFunc =
3087 "_omp_reduction_global_to_list_copy_func", &M);
3088 LtGCFunc->setAttributes(FuncAttrs);
3089 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3090 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3091 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3092
3093 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3094 Builder.SetInsertPoint(EntryBlock);
3095
3096 // Buffer: global reduction buffer.
3097 Argument *BufferArg = LtGCFunc->getArg(0);
3098 // Idx: index of the buffer.
3099 Argument *IdxArg = LtGCFunc->getArg(1);
3100 // ReduceList: thread local Reduce list.
3101 Argument *ReduceListArg = LtGCFunc->getArg(2);
3102
3103 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3104 BufferArg->getName() + ".addr");
3105 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3106 IdxArg->getName() + ".addr");
3107 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3108 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3109 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3110 BufferArgAlloca, Builder.getPtrTy(),
3111 BufferArgAlloca->getName() + ".ascast");
3112 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3113 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3114 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3115 ReduceListArgAlloca, Builder.getPtrTy(),
3116 ReduceListArgAlloca->getName() + ".ascast");
3117 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3118 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3119 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3120
3121 Value *LocalReduceList =
3122 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3123 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3124 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3125 Type *IndexTy = Builder.getIndexTy(
3126 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3127 for (auto En : enumerate(ReductionInfos)) {
3128 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3129 auto *RedListArrayTy =
3130 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3131 // Reduce element = LocalReduceList[i]
3132 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3133 RedListArrayTy, LocalReduceList,
3134 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3135 // elemptr = ((CopyType*)(elemptrptr)) + I
3136 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3137 // Global = Buffer.VD[Idx];
3138 Value *BufferVD =
3139 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3140 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3141 ReductionsBufferTy, BufferVD, 0, En.index());
3142
3143 switch (RI.EvaluationKind) {
3144 case EvalKind::Scalar: {
3145 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3146 Builder.CreateStore(TargetElement, ElemPtr);
3147 break;
3148 }
3149 case EvalKind::Complex: {
3150 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3151 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3152 Value *SrcReal = Builder.CreateLoad(
3153 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3154 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3155 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3156 Value *SrcImg = Builder.CreateLoad(
3157 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3158
3159 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3160 RI.ElementType, ElemPtr, 0, 0, ".realp");
3161 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3162 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3163 Builder.CreateStore(SrcReal, DestRealPtr);
3164 Builder.CreateStore(SrcImg, DestImgPtr);
3165 break;
3166 }
3167 case EvalKind::Aggregate: {
3168 Value *SizeVal =
3169 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3170 Builder.CreateMemCpy(
3171 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3172 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3173 SizeVal, false);
3174 break;
3175 }
3176 }
3177 }
3178
3179 Builder.CreateRetVoid();
3180 Builder.restoreIP(OldIP);
3181 return LtGCFunc;
3182}
3183
3184Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3185 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3186 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3187 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3188 LLVMContext &Ctx = M.getContext();
3189 auto *FuncTy = FunctionType::get(
3190 Builder.getVoidTy(),
3191 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3192 /* IsVarArg */ false);
3193 Function *LtGRFunc =
3194 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3195 "_omp_reduction_global_to_list_reduce_func", &M);
3196 LtGRFunc->setAttributes(FuncAttrs);
3197 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3198 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3199 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3200
3201 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3202 Builder.SetInsertPoint(EntryBlock);
3203
3204 // Buffer: global reduction buffer.
3205 Argument *BufferArg = LtGRFunc->getArg(0);
3206 // Idx: index of the buffer.
3207 Argument *IdxArg = LtGRFunc->getArg(1);
3208 // ReduceList: thread local Reduce list.
3209 Argument *ReduceListArg = LtGRFunc->getArg(2);
3210
3211 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3212 BufferArg->getName() + ".addr");
3213 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3214 IdxArg->getName() + ".addr");
3215 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3216 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3217 ArrayType *RedListArrayTy =
3218 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3219
3220 // 1. Build a list of reduction variables.
3221 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3222 Value *LocalReduceList =
3223 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3224
3225 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3226 BufferArgAlloca, Builder.getPtrTy(),
3227 BufferArgAlloca->getName() + ".ascast");
3228 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3229 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3230 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3231 ReduceListArgAlloca, Builder.getPtrTy(),
3232 ReduceListArgAlloca->getName() + ".ascast");
3233 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3234 LocalReduceList, Builder.getPtrTy(),
3235 LocalReduceList->getName() + ".ascast");
3236
3237 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3238 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3239 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3240
3241 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3242 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3243 Type *IndexTy = Builder.getIndexTy(
3244 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3245 for (auto En : enumerate(ReductionInfos)) {
3246 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3247 RedListArrayTy, ReductionList,
3248 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3249 // Global = Buffer.VD[Idx];
3250 Value *BufferVD =
3251 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3252 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3253 ReductionsBufferTy, BufferVD, 0, En.index());
3254 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3255 }
3256
3257 // Call reduce_function(ReduceList, GlobalReduceList)
3258 Value *ReduceList =
3259 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3260 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3261 ->addFnAttr(Attribute::NoUnwind);
3262 Builder.CreateRetVoid();
3263 Builder.restoreIP(OldIP);
3264 return LtGRFunc;
3265}
3266
3267std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3268 std::string Suffix =
3269 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3270 return (Name + Suffix).str();
3271}
3272
3273Function *OpenMPIRBuilder::createReductionFunction(
3274 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3275 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3276 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3277 {Builder.getPtrTy(), Builder.getPtrTy()},
3278 /* IsVarArg */ false);
3279 std::string Name = getReductionFuncName(ReducerName);
3280 Function *ReductionFunc =
3281 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
3282 ReductionFunc->setAttributes(FuncAttrs);
3283 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3284 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3285 BasicBlock *EntryBB =
3286 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3287 Builder.SetInsertPoint(EntryBB);
3288
3289 // Need to alloca memory here and deal with the pointers before getting
3290 // LHS/RHS pointers out
3291 Value *LHSArrayPtr = nullptr;
3292 Value *RHSArrayPtr = nullptr;
3293 Argument *Arg0 = ReductionFunc->getArg(0);
3294 Argument *Arg1 = ReductionFunc->getArg(1);
3295 Type *Arg0Type = Arg0->getType();
3296 Type *Arg1Type = Arg1->getType();
3297
3298 Value *LHSAlloca =
3299 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3300 Value *RHSAlloca =
3301 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3302 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3303 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3304 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3305 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3306 Builder.CreateStore(Arg0, LHSAddrCast);
3307 Builder.CreateStore(Arg1, RHSAddrCast);
3308 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3309 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3310
3311 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3312 Type *IndexTy = Builder.getIndexTy(
3313 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3314 SmallVector<Value *> LHSPtrs, RHSPtrs;
3315 for (auto En : enumerate(ReductionInfos)) {
3316 const ReductionInfo &RI = En.value();
3317 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3318 RedArrayTy, RHSArrayPtr,
3319 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3320 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3321 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3322 RHSI8Ptr, RI.PrivateVariable->getType(),
3323 RHSI8Ptr->getName() + ".ascast");
3324
3325 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3326 RedArrayTy, LHSArrayPtr,
3327 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3328 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3329 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3330 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3331
3332 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3333 LHSPtrs.emplace_back(LHSPtr);
3334 RHSPtrs.emplace_back(RHSPtr);
3335 } else {
3336 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3337 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3338 Value *Reduced;
3339 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3340 if (!Builder.GetInsertBlock())
3341 return ReductionFunc;
3342 Builder.CreateStore(Reduced, LHSPtr);
3343 }
3344 }
3345
3346 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3347 for (auto En : enumerate(ReductionInfos)) {
3348 unsigned Index = En.index();
3349 const ReductionInfo &RI = En.value();
3350 Value *LHSFixupPtr, *RHSFixupPtr;
3351 Builder.restoreIP(RI.ReductionGenClang(
3352 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3353
3354 // Fix the callback code generated to use the correct Values for the LHS
3355 // and RHS.
3356 LHSFixupPtr->replaceUsesWithIf(
3357 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3358 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3359 ReductionFunc;
3360 });
3361 RHSFixupPtr->replaceUsesWithIf(
3362 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3363 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3364 ReductionFunc;
3365 });
3366 }
3367
3368 Builder.CreateRetVoid();
3369 return ReductionFunc;
3370}
3371
3372static void
3373 checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3374 bool IsGPU) {
3375 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3376 (void)RI;
3377 assert(RI.Variable && "expected non-null variable");
3378 assert(RI.PrivateVariable && "expected non-null private variable");
3379 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3380 "expected non-null reduction generator callback");
3381 if (!IsGPU) {
3382 assert(
3383 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3384 "expected variables and their private equivalents to have the same "
3385 "type");
3386 }
3387 assert(RI.Variable->getType()->isPointerTy() &&
3388 "expected variables to be pointers");
3389 }
3390}
3391
3392 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductionsGPU(
3393 const LocationDescription &Loc, InsertPointTy AllocaIP,
3394 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3395 bool IsNoWait, bool IsTeamsReduction, bool HasDistribute,
3396 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3397 unsigned ReductionBufNum, Value *SrcLocInfo) {
3398 if (!updateToLocation(Loc))
3399 return InsertPointTy();
3400 Builder.restoreIP(CodeGenIP);
3401 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3402 LLVMContext &Ctx = M.getContext();
3403
3404 // Source location for the ident struct
3405 if (!SrcLocInfo) {
3406 uint32_t SrcLocStrSize;
3407 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3408 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3409 }
3410
3411 if (ReductionInfos.size() == 0)
3412 return Builder.saveIP();
3413
3414 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3415 AttributeList FuncAttrs;
3416 AttrBuilder AttrBldr(Ctx);
3417 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3418 AttrBldr.addAttribute(Attr);
3419 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3420 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3421
3422 Function *ReductionFunc = nullptr;
3423 CodeGenIP = Builder.saveIP();
3424 ReductionFunc =
3425 createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
3426 ReductionInfos, ReductionGenCBKind, FuncAttrs);
3427 Builder.restoreIP(CodeGenIP);
3428
3429 // Set the grid value in the config needed for lowering later on
3430 if (GridValue.has_value())
3431 Config.setGridValue(GridValue.value());
3432 else
3433 Config.setGridValue(getGridValue(T, ReductionFunc));
3434
3435 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3436 // RedList, shuffle_reduce_func, interwarp_copy_func);
3437 // or
3438 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3439 Value *Res;
3440
3441 // 1. Build a list of reduction variables.
3442 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3443 auto Size = ReductionInfos.size();
3444 Type *PtrTy = PointerType::getUnqual(Ctx);
3445 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3446 CodeGenIP = Builder.saveIP();
3447 Builder.restoreIP(AllocaIP);
3448 Value *ReductionListAlloca =
3449 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3450 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3451 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3452 Builder.restoreIP(CodeGenIP);
3453 Type *IndexTy = Builder.getIndexTy(
3454 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3455 for (auto En : enumerate(ReductionInfos)) {
3456 const ReductionInfo &RI = En.value();
3457 Value *ElemPtr = Builder.CreateInBoundsGEP(
3458 RedArrayTy, ReductionList,
3459 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3460 Value *CastElem =
3461 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3462 Builder.CreateStore(CastElem, ElemPtr);
3463 }
3464 CodeGenIP = Builder.saveIP();
3465 Function *SarFunc =
3466 emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
3467 Function *WcFunc = emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
3468 Builder.restoreIP(CodeGenIP);
3469
3470 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3471
3472 unsigned MaxDataSize = 0;
3473 SmallVector<Type *> ReductionTypeArgs;
3474 for (auto En : enumerate(ReductionInfos)) {
3475 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3476 if (Size > MaxDataSize)
3477 MaxDataSize = Size;
3478 ReductionTypeArgs.emplace_back(En.value().ElementType);
3479 }
3480 Value *ReductionDataSize =
3481 Builder.getInt64(MaxDataSize * ReductionInfos.size());
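// Illustrative sizing: for two reductions over i32 and double the store
// sizes are 4 and 8 bytes, so MaxDataSize = 8 and ReductionDataSize = 16.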
3482 if (!IsTeamsReduction) {
3483 Value *SarFuncCast =
3484 Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, PtrTy);
3485 Value *WcFuncCast =
3486 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, PtrTy);
3487 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3488 WcFuncCast};
3489 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3490 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3491 Res = Builder.CreateCall(Pv2Ptr, Args);
3492 } else {
3493 CodeGenIP = Builder.saveIP();
3494 StructType *ReductionsBufferTy = StructType::create(
3495 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3496 Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
3497 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3498 Function *LtGCFunc = emitListToGlobalCopyFunction(
3499 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3500 Function *LtGRFunc = emitListToGlobalReduceFunction(
3501 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3502 Function *GtLCFunc = emitGlobalToListCopyFunction(
3503 ReductionInfos, ReductionsBufferTy, FuncAttrs);
3504 Function *GtLRFunc = emitGlobalToListReduceFunction(
3505 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
3506 Builder.restoreIP(CodeGenIP);
3507
3508 Value *KernelTeamsReductionPtr = Builder.CreateCall(
3509 RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3510
3511 Value *Args3[] = {SrcLocInfo,
3512 KernelTeamsReductionPtr,
3513 Builder.getInt32(ReductionBufNum),
3514 ReductionDataSize,
3515 RL,
3516 SarFunc,
3517 WcFunc,
3518 LtGCFunc,
3519 LtGRFunc,
3520 GtLCFunc,
3521 GtLRFunc};
3522
3523 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3524 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3525 Res = Builder.CreateCall(TeamsReduceFn, Args3);
3526 }
3527
3528 // 5. Build if (res == 1)
3529 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3530 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3531 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3532 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3533
3534 // 6. Build then branch: where we have reduced values in the master
3535 // thread in each team.
3536 // __kmpc_end_reduce{_nowait}(<gtid>);
3537 // break;
3538 emitBlock(ThenBB, CurFunc);
3539
3540 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3541 for (auto En : enumerate(ReductionInfos)) {
3542 const ReductionInfo &RI = En.value();
3543 Value *LHS = RI.Variable;
3544 Value *RHS =
3545 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3546
3547 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3548 Value *LHSPtr, *RHSPtr;
3549 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3550 &LHSPtr, &RHSPtr, CurFunc));
3551
3552 // Fix the callback code generated to use the correct Values for the LHS
3553 // and RHS.
3554 LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
3555 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3556 ReductionFunc;
3557 });
3558 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3559 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3560 ReductionFunc;
3561 });
3562 } else {
3563 assert(false && "Unhandled ReductionGenCBKind");
3564 }
3565 }
3566 emitBlock(ExitBB, CurFunc);
3567
3568 Config.setEmitLLVMUsed();
3569
3570 return Builder.saveIP();
3571}
3572
3573 static Function *getFreshReductionFunc(Module &M) {
3574 Type *VoidTy = Type::getVoidTy(M.getContext());
3575 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3576 auto *FuncTy =
3577 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3578 return Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3579 ".omp.reduction.func", &M);
3580}
3581
3582 OpenMPIRBuilder::InsertPointTy
3583 OpenMPIRBuilder::createReductions(const LocationDescription &Loc,
3584 InsertPointTy AllocaIP,
3585 ArrayRef<ReductionInfo> ReductionInfos,
3586 ArrayRef<bool> IsByRef, bool IsNoWait) {
3587 assert(ReductionInfos.size() == IsByRef.size());
3588 for (const ReductionInfo &RI : ReductionInfos) {
3589 (void)RI;
3590 assert(RI.Variable && "expected non-null variable");
3591 assert(RI.PrivateVariable && "expected non-null private variable");
3592 assert(RI.ReductionGen && "expected non-null reduction generator callback");
3593 assert(RI.Variable->getType() == RI.PrivateVariable->getType() &&
3594 "expected variables and their private equivalents to have the same "
3595 "type");
3596 assert(RI.Variable->getType()->isPointerTy() &&
3597 "expected variables to be pointers");
3598 }
3599
3600 if (!updateToLocation(Loc))
3601 return InsertPointTy();
3602
3603 BasicBlock *InsertBlock = Loc.IP.getBlock();
3604 BasicBlock *ContinuationBlock =
3605 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3606 InsertBlock->getTerminator()->eraseFromParent();
3607
3608 // Create and populate array of type-erased pointers to private reduction
3609 // values.
3610 unsigned NumReductions = ReductionInfos.size();
3611 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3612 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3613 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3614
3615 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3616
3617 for (auto En : enumerate(ReductionInfos)) {
3618 unsigned Index = En.index();
3619 const ReductionInfo &RI = En.value();
3620 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3621 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3622 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3623 }
3624
3625 // Emit a call to the runtime function that orchestrates the reduction.
3626 // Declare the reduction function in the process.
3627 Function *Func = Builder.GetInsertBlock()->getParent();
3628 Module *Module = Func->getParent();
3629 uint32_t SrcLocStrSize;
3630 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3631 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3632 return RI.AtomicReductionGen;
3633 });
3634 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3635 CanGenerateAtomic
3636 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3637 : IdentFlag(0));
3638 Value *ThreadId = getOrCreateThreadID(Ident);
3639 Constant *NumVariables = Builder.getInt32(NumReductions);
3640 const DataLayout &DL = Module->getDataLayout();
3641 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3642 Constant *RedArraySize = Builder.getInt64(RedArrayByteSize);
3643 Function *ReductionFunc = getFreshReductionFunc(*Module);
3644 Value *Lock = getOMPCriticalRegionLock(".reduction");
3645 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3646 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3647 : RuntimeFunction::OMPRTL___kmpc_reduce);
3648 CallInst *ReduceCall =
3649 Builder.CreateCall(ReduceFunc,
3650 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3651 ReductionFunc, Lock},
3652 "reduce");
3653
3654 // Create final reduction entry blocks for the atomic and non-atomic case.
3655 // Emit IR that dispatches control flow to one of the blocks based on the
3656 // reduction supporting the atomic mode.
3657 BasicBlock *NonAtomicRedBlock =
3658 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3659 BasicBlock *AtomicRedBlock =
3660 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3661 SwitchInst *Switch =
3662 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3663 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3664 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
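// __kmpc_reduce returns 1 when this thread should perform the non-atomic
// elementwise reduction, 2 when the atomic path should be taken, and any
// other value (the switch default) when nothing is left to do.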
3665
3666 // Populate the non-atomic reduction using the elementwise reduction function.
3667 // This loads the elements from the global and private variables and reduces
3668 // them before storing back the result to the global variable.
3669 Builder.SetInsertPoint(NonAtomicRedBlock);
3670 for (auto En : enumerate(ReductionInfos)) {
3671 const ReductionInfo &RI = En.value();
3672 Type *ValueType = RI.ElementType;
3673 // We have one less load for by-ref case because that load is now inside of
3674 // the reduction region
3675 Value *RedValue = nullptr;
3676 if (!IsByRef[En.index()]) {
3677 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3678 "red.value." + Twine(En.index()));
3679 }
3680 Value *PrivateRedValue =
3681 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3682 "red.private.value." + Twine(En.index()));
3683 Value *Reduced;
3684 if (IsByRef[En.index()]) {
3685 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
3686 PrivateRedValue, Reduced));
3687 } else {
3688 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
3689 PrivateRedValue, Reduced));
3690 }
3691 if (!Builder.GetInsertBlock())
3692 return InsertPointTy();
3693 // for by-ref case, the load is inside of the reduction region
3694 if (!IsByRef[En.index()])
3695 Builder.CreateStore(Reduced, RI.Variable);
3696 }
3697 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3698 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3699 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3700 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3701 Builder.CreateBr(ContinuationBlock);
3702
3703 // Populate the atomic reduction using the atomic elementwise reduction
3704 // function. There are no loads/stores here because they will be happening
3705 // inside the atomic elementwise reduction.
3706 Builder.SetInsertPoint(AtomicRedBlock);
3707 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3708 for (const ReductionInfo &RI : ReductionInfos) {
3709 Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
3710 RI.Variable, RI.PrivateVariable));
3711 if (!Builder.GetInsertBlock())
3712 return InsertPointTy();
3713 }
3714 Builder.CreateBr(ContinuationBlock);
3715 } else {
3716 Builder.CreateUnreachable();
3717 }
3718
3719 // Populate the outlined reduction function using the elementwise reduction
3720 // function. Partial values are extracted from the type-erased array of
3721 // pointers to private variables.
3722 BasicBlock *ReductionFuncBlock =
3723 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3724 Builder.SetInsertPoint(ReductionFuncBlock);
3725 Value *LHSArrayPtr = ReductionFunc->getArg(0);
3726 Value *RHSArrayPtr = ReductionFunc->getArg(1);
3727
3728 for (auto En : enumerate(ReductionInfos)) {
3729 const ReductionInfo &RI = En.value();
3730 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3731 RedArrayTy, LHSArrayPtr, 0, En.index());
3732 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3733 Value *LHSPtr = Builder.CreateBitCast(LHSI8Ptr, RI.Variable->getType());
3734 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3735 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3736 RedArrayTy, RHSArrayPtr, 0, En.index());
3737 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3738 Value *RHSPtr =
3739 Builder.CreateBitCast(RHSI8Ptr, RI.PrivateVariable->getType());
3740 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3741 Value *Reduced;
3742 Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
3743 if (!Builder.GetInsertBlock())
3744 return InsertPointTy();
3745 // store is inside of the reduction region when using by-ref
3746 if (!IsByRef[En.index()])
3747 Builder.CreateStore(Reduced, LHSPtr);
3748 }
3749 Builder.CreateRetVoid();
3750
3751 Builder.SetInsertPoint(ContinuationBlock);
3752 return Builder.saveIP();
3753}
3754
3755 OpenMPIRBuilder::InsertPointTy
3756 OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
3757 BodyGenCallbackTy BodyGenCB,
3758 FinalizeCallbackTy FiniCB) {
3759
3760 if (!updateToLocation(Loc))
3761 return Loc.IP;
3762
3763 Directive OMPD = Directive::OMPD_master;
3764 uint32_t SrcLocStrSize;
3765 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3766 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3767 Value *ThreadId = getOrCreateThreadID(Ident);
3768 Value *Args[] = {Ident, ThreadId};
3769
3770 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
3771 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3772
3773 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
3774 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
3775
3776 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3777 /*Conditional*/ true, /*hasFinalize*/ true);
3778}
3779
3780 OpenMPIRBuilder::InsertPointTy
3781 OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
3782 BodyGenCallbackTy BodyGenCB,
3783 FinalizeCallbackTy FiniCB, Value *Filter) {
3784 if (!updateToLocation(Loc))
3785 return Loc.IP;
3786
3787 Directive OMPD = Directive::OMPD_masked;
3788 uint32_t SrcLocStrSize;
3789 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3790 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3791 Value *ThreadId = getOrCreateThreadID(Ident);
3792 Value *Args[] = {Ident, ThreadId, Filter};
3793 Value *ArgsEnd[] = {Ident, ThreadId};
3794
3795 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
3796 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
3797
3798 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
3799 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
3800
3801 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
3802 /*Conditional*/ true, /*hasFinalize*/ true);
3803}
3804
3805 CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
3806 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
3807 BasicBlock *PostInsertBefore, const Twine &Name) {
3808 Module *M = F->getParent();
3809 LLVMContext &Ctx = M->getContext();
3810 Type *IndVarTy = TripCount->getType();
3811
3812 // Create the basic block structure.
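// Sketch of the control flow built below (assertOK() checks these
// invariants):
//
//   preheader -> header -> cond --true--> body -> latch -> header
//                            \---false--> exit -> after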
3813 BasicBlock *Preheader =
3814 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
3815 BasicBlock *Header =
3816 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
3817 BasicBlock *Cond =
3818 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
3819 BasicBlock *Body =
3820 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
3821 BasicBlock *Latch =
3822 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
3823 BasicBlock *Exit =
3824 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
3825 BasicBlock *After =
3826 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
3827
3828 // Use specified DebugLoc for new instructions.
3829 Builder.SetCurrentDebugLocation(DL);
3830
3831 Builder.SetInsertPoint(Preheader);
3832 Builder.CreateBr(Header);
3833
3834 Builder.SetInsertPoint(Header);
3835 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
3836 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
3837 Builder.CreateBr(Cond);
3838
3839 Builder.SetInsertPoint(Cond);
3840 Value *Cmp =
3841 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
3842 Builder.CreateCondBr(Cmp, Body, Exit);
3843
3844 Builder.SetInsertPoint(Body);
3845 Builder.CreateBr(Latch);
3846
3847 Builder.SetInsertPoint(Latch);
3848 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
3849 "omp_" + Name + ".next", /*HasNUW=*/true);
3850 Builder.CreateBr(Header);
3851 IndVarPHI->addIncoming(Next, Latch);
3852
3853 Builder.SetInsertPoint(Exit);
3854 Builder.CreateBr(After);
3855
3856 // Remember and return the canonical control flow.
3857 LoopInfos.emplace_front();
3858 CanonicalLoopInfo *CL = &LoopInfos.front();
3859
3860 CL->Header = Header;
3861 CL->Cond = Cond;
3862 CL->Latch = Latch;
3863 CL->Exit = Exit;
3864
3865#ifndef NDEBUG
3866 CL->assertOK();
3867#endif
3868 return CL;
3869}
3870
3871 CanonicalLoopInfo *
3872 OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
3873 LoopBodyGenCallbackTy BodyGenCB,
3874 Value *TripCount, const Twine &Name) {
3875 BasicBlock *BB = Loc.IP.getBlock();
3876 BasicBlock *NextBB = BB->getNextNode();
3877
3878 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
3879 NextBB, NextBB, Name);
3880 BasicBlock *After = CL->getAfter();
3881
3882 // If location is not set, don't connect the loop.
3883 if (updateToLocation(Loc)) {
3884 // Split the loop at the insertion point: Branch to the preheader and move
3885 // every following instruction to after the loop (the After BB). Also, the
3886 // new successor is the loop's after block.
3887 spliceBB(Builder, After, /*CreateBranch=*/false);
3888 Builder.CreateBr(CL->getPreheader());
3889 }
3890
3891 // Emit the body content. We do it after connecting the loop to the CFG so
3892 // that the callback does not encounter degenerate BBs.
3893 BodyGenCB(CL->getBodyIP(), CL->getIndVar());
3894
3895#ifndef NDEBUG
3896 CL->assertOK();
3897#endif
3898 return CL;
3899}
3900
3901 CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
3902 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
3903 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
3904 InsertPointTy ComputeIP, const Twine &Name) {
3905
3906 // Consider the following difficulties (assuming 8-bit signed integers):
3907 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
3908 // DO I = 1, 100, 50
3909 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
3910 // DO I = 100, 0, -128
3911
3912 // Start, Stop and Step must be of the same integer type.
3913 auto *IndVarTy = cast<IntegerType>(Start->getType());
3914 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
3915 assert(IndVarTy == Step->getType() && "Step type mismatch");
3916
3917 LocationDescription ComputeLoc =
3918 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
3919 updateToLocation(ComputeLoc);
3920
3921 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
3922 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
3923
3924 // Like Step, but always positive.
3925 Value *Incr = Step;
3926
3927 // Distance between Start and Stop; always positive.
3928 Value *Span;
3929
3930 // Condition checking whether no iterations are executed at all, e.g.
3931 // because UB < LB.
3932 Value *ZeroCmp;
3933
3934 if (IsSigned) {
3935 // Ensure that increment is positive. If not, negate and invert LB and UB.
3936 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
3937 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
3938 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
3939 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
3940 Span = Builder.CreateSub(UB, LB, "", false, true);
3941 ZeroCmp = Builder.CreateICmp(
3942 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
3943 } else {
3944 Span = Builder.CreateSub(Stop, Start, "", true);
3945 ZeroCmp = Builder.CreateICmp(
3946 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
3947 }
3948
3949 Value *CountIfLooping;
3950 if (InclusiveStop) {
3951 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
3952 } else {
3953 // Avoid incrementing past stop since it could overflow.
3954 Value *CountIfTwo = Builder.CreateAdd(
3955 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
3956 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
3957 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
3958 }
3959 Value *TripCount = Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
3960 "omp_" + Name + ".tripcount");
3961
3962 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
3963 Builder.restoreIP(CodeGenIP);
3964 Value *Span = Builder.CreateMul(IV, Step);
3965 Value *IndVar = Builder.CreateAdd(Span, Start);
3966 BodyGenCB(Builder.saveIP(), IndVar);
3967 };
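// The wrapper above receives the normalized IV in [0, TripCount) and
// recovers the user-visible counter as Start + IV * Step before invoking
// the client's BodyGenCB.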
3968 LocationDescription LoopLoc = ComputeIP.isSet() ? Loc.IP : Builder.saveIP();
3969 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
3970}
3971
3972// Returns an LLVM function to call for initializing loop bounds using OpenMP
3973// static scheduling depending on `type`. Only i32 and i64 are supported by the
3974// runtime. Always interpret integers as unsigned similarly to
3975// CanonicalLoopInfo.
3977 OpenMPIRBuilder &OMPBuilder) {
3978 unsigned Bitwidth = Ty->getIntegerBitWidth();
3979 if (Bitwidth == 32)
3980 return OMPBuilder.getOrCreateRuntimeFunction(
3981 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
3982 if (Bitwidth == 64)
3983 return OMPBuilder.getOrCreateRuntimeFunction(
3984 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
3985 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
3986}
3987
3988 OpenMPIRBuilder::InsertPointTy
3989OpenMPIRBuilder::applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
3990 InsertPointTy AllocaIP,
3991 bool NeedsBarrier) {
3992 assert(CLI->isValid() && "Requires a valid canonical loop");
3993 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
3994 "Require dedicated allocate IP");
3995
3996 // Set up the source location value for OpenMP runtime.
3997 Builder.restoreIP(CLI->getPreheaderIP());
3998 Builder.SetCurrentDebugLocation(DL);
3999
4000 uint32_t SrcLocStrSize;
4001 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4002 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4003
4004 // Declare useful OpenMP runtime functions.
4005 Value *IV = CLI->getIndVar();
4006 Type *IVTy = IV->getType();
4007 FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
4008 FunctionCallee StaticFini =
4009 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4010
4011 // Allocate space for computed loop bounds as expected by the "init" function.
4012 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4013
4014 Type *I32Type = Type::getInt32Ty(M.getContext());
4015 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4016 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4017 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4018 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4019
4020 // At the end of the preheader, prepare for calling the "init" function by
4021 // storing the current loop bounds into the allocated space. A canonical loop
4022 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4023 // and produces an inclusive upper bound.
4024 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4025 Constant *Zero = ConstantInt::get(IVTy, 0);
4026 Constant *One = ConstantInt::get(IVTy, 1);
4027 Builder.CreateStore(Zero, PLowerBound);
4028 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4029 Builder.CreateStore(UpperBound, PUpperBound);
4030 Builder.CreateStore(One, PStride);
4031
4032 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4033
4034 Constant *SchedulingType = ConstantInt::get(
4035 I32Type, static_cast<int>(OMPScheduleType::UnorderedStatic));
4036
4037 // Call the "init" function and update the trip count of the loop with the
4038 // value it produced.
4039 Builder.CreateCall(StaticInit,
4040 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
4041 PUpperBound, PStride, One, Zero});
4042 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4043 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4044 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4045 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4046 CLI->setTripCount(TripCount);
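// E.g. with 100 iterations and four threads, the runtime may answer with
// [LB, UB] = [25, 49] for this thread; the canonical loop then runs 25
// times and the IV is shifted by LB in the mapIndVar rewrite below.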
4047
4048 // Update all uses of the induction variable except the one in the condition
4049 // block that compares it with the actual upper bound, and the increment in
4050 // the latch block.
4051
4052 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4053 Builder.SetInsertPoint(CLI->getBody(),
4054 CLI->getBody()->getFirstInsertionPt());
4055 Builder.SetCurrentDebugLocation(DL);
4056 return Builder.CreateAdd(OldIV, LowerBound);
4057 });
4058
4059 // In the "exit" block, call the "fini" function.
4060 Builder.SetInsertPoint(CLI->getExit(),
4061 CLI->getExit()->getTerminator()->getIterator());
4062 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4063
4064 // Add the barrier if requested.
4065 if (NeedsBarrier)
4066 createBarrier(LocationDescription(Builder.saveIP(), DL),
4067 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4068 /* CheckCancelFlag */ false);
4069
4070 InsertPointTy AfterIP = CLI->getAfterIP();
4071 CLI->invalidate();
4072
4073 return AfterIP;
4074}
4075
4076OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4077 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4078 bool NeedsBarrier, Value *ChunkSize) {
4079 assert(CLI->isValid() && "Requires a valid canonical loop");
4080 assert(ChunkSize && "Chunk size is required");
4081
4082 LLVMContext &Ctx = CLI->getFunction()->getContext();
4083 Value *IV = CLI->getIndVar();
4084 Value *OrigTripCount = CLI->getTripCount();
4085 Type *IVTy = IV->getType();
4086 assert(IVTy->getIntegerBitWidth() <= 64 &&
4087 "Max supported tripcount bitwidth is 64 bits");
4088 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4089 : Type::getInt64Ty(Ctx);
4090 Type *I32Type = Type::getInt32Ty(M.getContext());
4091 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4092 Constant *One = ConstantInt::get(InternalIVTy, 1);
4093
4094 // Declare useful OpenMP runtime functions.
4095 FunctionCallee StaticInit =
4096 getKmpcForStaticInitForType(InternalIVTy, M, *this);
4097 FunctionCallee StaticFini =
4098 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4099
4100 // Allocate space for computed loop bounds as expected by the "init" function.
4101 Builder.restoreIP(AllocaIP);
4102 Builder.SetCurrentDebugLocation(DL);
4103 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4104 Value *PLowerBound =
4105 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
4106 Value *PUpperBound =
4107 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
4108 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
4109
4110 // Set up the source location value for the OpenMP runtime.
4111 Builder.restoreIP(CLI->getPreheaderIP());
4112 Builder.SetCurrentDebugLocation(DL);
4113
4114 // TODO: Detect overflow in ubsan or max-out with current tripcount.
4115 Value *CastedChunkSize =
4116 Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
4117 Value *CastedTripCount =
4118 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
4119
4120 Constant *SchedulingType = ConstantInt::get(
4121 I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
4122 Builder.CreateStore(Zero, PLowerBound);
4123 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
4124 Builder.CreateStore(OrigUpperBound, PUpperBound);
4125 Builder.CreateStore(One, PStride);
4126
4127 // Call the "init" function and update the trip count of the loop with the
4128 // value it produced.
4129 uint32_t SrcLocStrSize;
4130 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4131 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4132 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4133 Builder.CreateCall(StaticInit,
4134 {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
4135 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
4136 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
4137 /*pstride=*/PStride, /*incr=*/One,
4138 /*chunk=*/CastedChunkSize});
4139
4140 // Load values written by the "init" function.
4141 Value *FirstChunkStart =
4142 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
4143 Value *FirstChunkStop =
4144 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
4145 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
4146 Value *ChunkRange =
4147 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
4148 Value *NextChunkStride =
4149 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
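// Illustrative values: tripcount 10, chunk 4 and 2 threads give thread 0
// lb=0, ub=3 and stride=8, i.e. it executes the chunks {0..3} and {8..9}.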
4150
4151 // Create outer "dispatch" loop for enumerating the chunks.
4152 BasicBlock *DispatchEnter = splitBB(Builder, true);
4153 Value *DispatchCounter;
4154 CanonicalLoopInfo *DispatchCLI = createCanonicalLoop(
4155 {Builder.saveIP(), DL},
4156 [&](InsertPointTy BodyIP, Value *Counter) { DispatchCounter = Counter; },
4157 FirstChunkStart, CastedTripCount, NextChunkStride,
4158 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
4159 "dispatch");
4160
4161 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
4162 // not have to preserve the canonical invariant.
4163 BasicBlock *DispatchBody = DispatchCLI->getBody();
4164 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
4165 BasicBlock *DispatchExit = DispatchCLI->getExit();
4166 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
4167 DispatchCLI->invalidate();
4168
4169 // Rewire the original loop to become the chunk loop inside the dispatch loop.
4170 redirectTo(DispatchAfter, CLI->getAfter(), DL);
4171 redirectTo(CLI->getExit(), DispatchLatch, DL);
4172 redirectTo(DispatchBody, DispatchEnter, DL);
4173
4174 // Prepare the prolog of the chunk loop.
4175 Builder.restoreIP(CLI->getPreheaderIP());
4176 Builder.SetCurrentDebugLocation(DL);
4177
4178 // Compute the number of iterations of the chunk loop.
4179 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4180 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
4181 Value *IsLastChunk =
4182 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
4183 Value *CountUntilOrigTripCount =
4184 Builder.CreateSub(CastedTripCount, DispatchCounter);
4185 Value *ChunkTripCount = Builder.CreateSelect(
4186 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
4187 Value *BackcastedChunkTC =
4188 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
4189 CLI->setTripCount(BackcastedChunkTC);
4190
4191 // Update all uses of the induction variable except the one in the condition
4192 // block that compares it with the actual upper bound, and the increment in
4193 // the latch block.
4194 Value *BackcastedDispatchCounter =
4195 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
4196 CLI->mapIndVar([&](Instruction *) -> Value * {
4197 Builder.restoreIP(CLI->getBodyIP());
4198 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
4199 });
4200
4201 // In the "exit" block, call the "fini" function.
4202 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
4203 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4204
4205 // Add the barrier if requested.
4206 if (NeedsBarrier)
4207 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
4208 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
4209
4210#ifndef NDEBUG
4211 // Even though we currently do not support applying additional methods to it,
4212 // the chunk loop should remain a canonical loop.
4213 CLI->assertOK();
4214#endif
4215
4216 return {DispatchAfter, DispatchAfter->getFirstInsertionPt()};
4217}
4218
4219// Returns an LLVM function to call for executing an OpenMP static worksharing
4220// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4221// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4222static FunctionCallee
4224 WorksharingLoopType LoopType) {
4225 unsigned Bitwidth = Ty->getIntegerBitWidth();
4226 Module &M = OMPBuilder->M;
4227 switch (LoopType) {
4228 case WorksharingLoopType::ForStaticLoop:
4229 if (Bitwidth == 32)
4230 return OMPBuilder->getOrCreateRuntimeFunction(
4231 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4232 if (Bitwidth == 64)
4233 return OMPBuilder->getOrCreateRuntimeFunction(
4234 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4235 break;
4236 case WorksharingLoopType::DistributeStaticLoop:
4237 if (Bitwidth == 32)
4238 return OMPBuilder->getOrCreateRuntimeFunction(
4239 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4240 if (Bitwidth == 64)
4241 return OMPBuilder->getOrCreateRuntimeFunction(
4242 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4243 break;
4244 case WorksharingLoopType::DistributeForStaticLoop:
4245 if (Bitwidth == 32)
4246 return OMPBuilder->getOrCreateRuntimeFunction(
4247 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4248 if (Bitwidth == 64)
4249 return OMPBuilder->getOrCreateRuntimeFunction(
4250 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4251 break;
4252 }
4253 if (Bitwidth != 32 && Bitwidth != 64) {
4254 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4255 }
4256 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4257}
4258
4259// Inserts a call to proper OpenMP Device RTL function which handles
4260// loop worksharing.
4262 OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType,
4263 BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg,
4264 Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn) {
4265 Type *TripCountTy = TripCount->getType();
4266 Module &M = OMPBuilder->M;
4267 IRBuilder<> &Builder = OMPBuilder->Builder;
4268 FunctionCallee RTLFn =
4269 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4270 SmallVector<Value *, 8> RealArgs;
4271 RealArgs.push_back(Ident);
4272 RealArgs.push_back(Builder.CreateBitCast(&LoopBodyFn, ParallelTaskPtr));
4273 RealArgs.push_back(LoopBodyArg);
4274 RealArgs.push_back(TripCount);
4275 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4276 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4277 Builder.CreateCall(RTLFn, RealArgs);
4278 return;
4279 }
4280 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
4281 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
4282 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4283 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
4284
4285 RealArgs.push_back(
4286 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
4287 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4288 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4289 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4290 }
4291
4292 Builder.CreateCall(RTLFn, RealArgs);
4293}
4294
4295static void
4296 workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder,
4297 CanonicalLoopInfo *CLI, Value *Ident,
4298 Function &OutlinedFn, Type *ParallelTaskPtr,
4299 const SmallVector<Instruction *, 4> &ToBeDeleted,
4300 WorksharingLoopType LoopType) {
4301 IRBuilder<> &Builder = OMPIRBuilder->Builder;
4302 BasicBlock *Preheader = CLI->getPreheader();
4303 Value *TripCount = CLI->getTripCount();
4304
4305 // After loop body outlining, the loop body contains only the setup of the
4306 // loop body argument structure and the call to the outlined
4307 // loop body function. First, we need to move the setup of the loop body
4308 // args into the loop preheader.
4309 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
4310 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
4311
4312 // The next step is to remove the whole loop. We do not need it anymore.
4313 // That's why we make an unconditional branch from the loop preheader to
4314 // the loop exit block.
4315 Builder.restoreIP({Preheader, Preheader->end()});
4316 Preheader->getTerminator()->eraseFromParent();
4317 Builder.CreateBr(CLI->getExit());
4318
4319 // Delete dead loop blocks
4320 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
4321 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
4322 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
4323 CleanUpInfo.EntryBB = CLI->getHeader();
4324 CleanUpInfo.ExitBB = CLI->getExit();
4325 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
4326 DeleteDeadBlocks(BlocksToBeRemoved);
4327
4328 // Find the instruction which corresponds to the loop body argument
4329 // structure and remove the call to the loop body function.
4330 Value *LoopBodyArg;
4331 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
4332 assert(OutlinedFnUser &&
4333 "Expected unique undroppable user of outlined function");
4334 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
4335 assert(OutlinedFnCallInstruction && "Expected outlined function call");
4336 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
4337 "Expected outlined function call to be located in loop preheader");
4338 // Check in case no argument structure has been passed.
4339 if (OutlinedFnCallInstruction->arg_size() > 1)
4340 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
4341 else
4342 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
4343 OutlinedFnCallInstruction->eraseFromParent();
4344
4345 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
4346 LoopBodyArg, ParallelTaskPtr, TripCount,
4347 OutlinedFn);
4348
4349 for (auto &ToBeDeletedItem : ToBeDeleted)
4350 ToBeDeletedItem->eraseFromParent();
4351 CLI->invalidate();
4352}
4353
4354 OpenMPIRBuilder::InsertPointTy
4355OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
4356 InsertPointTy AllocaIP,
4357 WorksharingLoopType LoopType) {
4358 uint32_t SrcLocStrSize;
4359 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4360 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4361
4362 OutlineInfo OI;
4363 OI.OuterAllocaBB = CLI->getPreheader();
4364 Function *OuterFn = CLI->getPreheader()->getParent();
4365
4366 // Instructions which need to be deleted at the end of code generation
4367 SmallVector<Instruction *, 4> ToBeDeleted;
4368
4369 OI.OuterAllocaBB = AllocaIP.getBlock();
4370
4371 // Mark the loop body as a region which needs to be extracted
4372 OI.EntryBB = CLI->getBody();
4373 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
4374 "omp.prelatch", true);
4375
4376 // Prepare loop body for extraction
4377 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
4378
4379 // Insert new loop counter variable which will be used only in loop
4380 // body.
4381 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
4382 Instruction *NewLoopCntLoad =
4383 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
4384 // New loop counter instructions are redundant in the loop preheader when
4385 // code generation for the workshare loop is finished. That's why we mark
4386 // them as ready for deletion.
4387 ToBeDeleted.push_back(NewLoopCntLoad);
4388 ToBeDeleted.push_back(NewLoopCnt);
4389
4390 // Analyse loop body region. Find all input variables which are used inside
4391 // loop body region.
4392 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
4393 SmallVector<BasicBlock *, 32> Blocks;
4394 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
4395 SmallVector<BasicBlock *, 32> BlocksT(ParallelRegionBlockSet.begin(),
4396 ParallelRegionBlockSet.end());
4397
4398 CodeExtractorAnalysisCache CEAC(*OuterFn);
4399 CodeExtractor Extractor(Blocks,
4400 /* DominatorTree */ nullptr,
4401 /* AggregateArgs */ true,
4402 /* BlockFrequencyInfo */ nullptr,
4403 /* BranchProbabilityInfo */ nullptr,
4404 /* AssumptionCache */ nullptr,
4405 /* AllowVarArgs */ true,
4406 /* AllowAlloca */ true,
4407 /* AllocationBlock */ CLI->getPreheader(),
4408 /* Suffix */ ".omp_wsloop",
4409 /* AggrArgsIn0AddrSpace */ true);
4410
4411 BasicBlock *CommonExit = nullptr;
4412 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
4413
4414 // Find allocas outside the loop body region which are used inside loop
4415 // body
4416 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
4417
4418 // We need to model the loop body region as the function f(cnt, loop_arg).
4419 // That's why we replace the loop induction variable with the new counter,
4420 // which will be one of the loop body function arguments
4421 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
4422 CLI->getIndVar()->user_end());
4423 for (auto Use : Users) {
4424 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
4425 if (ParallelRegionBlockSet.count(Inst->getParent())) {
4426 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
4427 }
4428 }
4429 }
4430 // Make sure that the loop counter variable is not merged into the loop body
4431 // function argument structure and that it is passed as a separate variable.
4432 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
4433
4434 // PostOutline CB is invoked when loop body function is outlined and
4435 // loop body is replaced by call to outlined function. We need to add
4436 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
4437 // function will handle loop control logic.
4438 //
4439 OI.PostOutlineCB = [=, ToBeDeletedVec =
4440 std::move(ToBeDeleted)](Function &OutlinedFn) {
4441 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ParallelTaskPtr,
4442 ToBeDeletedVec, LoopType);
4443 };
4444 addOutlineInfo(std::move(OI));
4445 return CLI->getAfterIP();
4446}
4447
4448 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoop(
4449 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4450 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
4451 bool HasSimdModifier, bool HasMonotonicModifier,
4452 bool HasNonmonotonicModifier, bool HasOrderedClause,
4453 WorksharingLoopType LoopType) {
4454 if (Config.isTargetDevice())
4455 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType);
4456 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
4457 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
4458 HasNonmonotonicModifier, HasOrderedClause);
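// E.g. schedule(guided, 8) maps to BaseGuidedChunked here and takes the
// applyDynamicWorkshareLoop path below, while plain schedule(static) stays
// on the applyStaticWorkshareLoop path.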
4459
4460 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
4461 OMPScheduleType::ModifierOrdered;
4462 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
4463 case OMPScheduleType::BaseStatic:
4464 assert(!ChunkSize && "No chunk size with static-chunked schedule");
4465 if (IsOrdered)
4466 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4467 NeedsBarrier, ChunkSize);
4468 // FIXME: Monotonicity ignored?
4469 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier);
4470
4471 case OMPScheduleType::BaseStaticChunked:
4472 if (IsOrdered)
4473 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4474 NeedsBarrier, ChunkSize);
4475 // FIXME: Monotonicity ignored?
4476 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
4477 ChunkSize);
4478
4479 case OMPScheduleType::BaseRuntime:
4480 case OMPScheduleType::BaseAuto:
4481 case OMPScheduleType::BaseGreedy:
4482 case OMPScheduleType::BaseBalanced:
4483 case OMPScheduleType::BaseSteal:
4484 case OMPScheduleType::BaseGuidedSimd:
4485 case OMPScheduleType::BaseRuntimeSimd:
4486 assert(!ChunkSize &&
4487 "schedule type does not support user-defined chunk sizes");
4488 [[fallthrough]];
4489 case OMPScheduleType::BaseDynamicChunked:
4490 case OMPScheduleType::BaseGuidedChunked:
4491 case OMPScheduleType::BaseGuidedIterativeChunked:
4492 case OMPScheduleType::BaseGuidedAnalyticalChunked:
4493 case OMPScheduleType::BaseStaticBalancedChunked:
4494 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
4495 NeedsBarrier, ChunkSize);
4496
4497 default:
4498 llvm_unreachable("Unknown/unimplemented schedule kind");
4499 }
4500}
4501
4502/// Returns an LLVM function to call for initializing loop bounds using OpenMP
4503/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4504/// the runtime. Always interpret integers as unsigned similarly to
4505/// CanonicalLoopInfo.
4506static FunctionCallee
4507 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4508 unsigned Bitwidth = Ty->getIntegerBitWidth();
4509 if (Bitwidth == 32)
4510 return OMPBuilder.getOrCreateRuntimeFunction(
4511 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
4512 if (Bitwidth == 64)
4513 return OMPBuilder.getOrCreateRuntimeFunction(
4514 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
4515 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4516}
4517
4518/// Returns an LLVM function to call for updating the next loop using OpenMP
4519/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
4520/// the runtime. Always interpret integers as unsigned similarly to
4521/// CanonicalLoopInfo.
4522static FunctionCallee
4523 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4524 unsigned Bitwidth = Ty->getIntegerBitWidth();
4525 if (Bitwidth == 32)
4526 return OMPBuilder.getOrCreateRuntimeFunction(
4527 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
4528 if (Bitwidth == 64)
4529 return OMPBuilder.getOrCreateRuntimeFunction(
4530 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
4531 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4532}
4533
4534/// Returns an LLVM function to call for finalizing the dynamic loop using
4535/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
4536/// interpret integers as unsigned similarly to CanonicalLoopInfo.
4537static FunctionCallee
4538 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
4539 unsigned Bitwidth = Ty->getIntegerBitWidth();
4540 if (Bitwidth == 32)
4541 return OMPBuilder.getOrCreateRuntimeFunction(
4542 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
4543 if (Bitwidth == 64)
4544 return OMPBuilder.getOrCreateRuntimeFunction(
4545 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
4546 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4547}
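// Illustrative example (not part of this file): for a canonical loop with an
// i32 induction variable, the three helpers above select the 4u entry points,
// so the generated code calls __kmpc_dispatch_init_4u, __kmpc_dispatch_next_4u
// and, for ordered loops, __kmpc_dispatch_fini_4u; an i64 induction variable
// selects the 8u variants. Any other bitwidth hits the llvm_unreachable.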
4548
4549OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyDynamicWorkshareLoop(
4550 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4551 OMPScheduleType SchedType, bool NeedsBarrier, Value *Chunk) {
4552 assert(CLI->isValid() && "Requires a valid canonical loop");
4553 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4554 "Require dedicated allocate IP");
4555 assert(isValidWorkshareLoopScheduleType(SchedType) &&
4556 "Require valid schedule type");
4557
4558 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
4559 OMPScheduleType::ModifierOrdered;
4560
4561 // Set up the source location value for OpenMP runtime.
4563 Builder.SetCurrentDebugLocation(DL);
4564 uint32_t SrcLocStrSize;
4565 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4566 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4567
4568 // Declare useful OpenMP runtime functions.
4569 Value *IV = CLI->getIndVar();
4570 Type *IVTy = IV->getType();
4571 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
4572 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
4573
4574 // Allocate space for computed loop bounds as expected by the "init" function.
4575 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4576 Type *I32Type = Type::getInt32Ty(M.getContext());
4577 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4578 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4579 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4580 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4581
4582 // At the end of the preheader, prepare for calling the "init" function by
4583 // storing the current loop bounds into the allocated space. A canonical loop
4584 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4585 // and produces an inclusive upper bound.
4586 BasicBlock *PreHeader = CLI->getPreheader();
4587 Builder.SetInsertPoint(PreHeader->getTerminator());
4588 Constant *One = ConstantInt::get(IVTy, 1);
4589 Builder.CreateStore(One, PLowerBound);
4590 Value *UpperBound = CLI->getTripCount();
4591 Builder.CreateStore(UpperBound, PUpperBound);
4592 Builder.CreateStore(One, PStride);
4593
4594 BasicBlock *Header = CLI->getHeader();
4595 BasicBlock *Exit = CLI->getExit();
4596 BasicBlock *Cond = CLI->getCond();
4597 BasicBlock *Latch = CLI->getLatch();
4598 InsertPointTy AfterIP = CLI->getAfterIP();
4599
4600 // The CLI will be "broken" in the code below, as the loop is no longer
4601 // a valid canonical loop.
4602
4603 if (!Chunk)
4604 Chunk = One;
4605
4606 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4607
4608 Constant *SchedulingType =
4609 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4610
4611 // Call the "init" function.
4612 Builder.CreateCall(DynamicInit,
4613 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
4614 UpperBound, /* step */ One, Chunk});
4615
4616 // An outer loop around the existing one.
4617 BasicBlock *OuterCond = BasicBlock::Create(
4618 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
4619 PreHeader->getParent());
4620 // The "next" call's result is always 32-bit, so the IVTy-typed One above cannot be reused for the comparison below.
4621 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
4622 Value *Res =
4623 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
4624 PLowerBound, PUpperBound, PStride});
4625 Constant *Zero32 = ConstantInt::get(I32Type, 0);
4626 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
4627 Value *LowerBound =
4628 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
4629 Builder.CreateCondBr(MoreWork, Header, Exit);
4630
4631 // Change PHI-node in loop header to use outer cond rather than preheader,
4632 // and set IV to the LowerBound.
4633 Instruction *Phi = &Header->front();
4634 auto *PI = cast<PHINode>(Phi);
4635 PI->setIncomingBlock(0, OuterCond);
4636 PI->setIncomingValue(0, LowerBound);
4637
4638 // Then set the pre-header to jump to the OuterCond
4639 Instruction *Term = PreHeader->getTerminator();
4640 auto *Br = cast<BranchInst>(Term);
4641 Br->setSuccessor(0, OuterCond);
4642
4643 // Modify the inner condition:
4644 // * Use the UpperBound returned from the DynamicNext call.
4645 // * Jump to the outer loop when done with one of the inner loops.
4646 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
4647 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
4648 Instruction *Comp = &*Builder.GetInsertPoint();
4649 auto *CI = cast<CmpInst>(Comp);
4650 CI->setOperand(1, UpperBound);
4651 // Redirect the inner exit to branch to outer condition.
4652 Instruction *Branch = &Cond->back();
4653 auto *BI = cast<BranchInst>(Branch);
4654 assert(BI->getSuccessor(1) == Exit);
4655 BI->setSuccessor(1, OuterCond);
4656
4657 // Call the "fini" function if "ordered" is present in wsloop directive.
4658 if (Ordered) {
4659 Builder.SetInsertPoint(&Latch->back());
4660 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
4661 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
4662 }
4663
4664 // Add the barrier if requested.
4665 if (NeedsBarrier) {
4666 Builder.SetInsertPoint(&Exit->back());
4667 createBarrier(LocationDescription(Builder.saveIP(), DL),
4668 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4669 /* CheckCancelFlag */ false);
4670 }
4671
4672 CLI->invalidate();
4673 return AfterIP;
4674}
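// Sketch of the control flow produced above (illustrative, names abbreviated):
//
//   preheader:
//     store 1 -> p.lowerbound; store tripcount -> p.upperbound; store 1 -> p.stride
//     __kmpc_dispatch_init(loc, tid, sched, /*lb=*/1, tripcount, /*step=*/1, chunk)
//     br outer.cond
//   outer.cond:
//     more = __kmpc_dispatch_next(loc, tid, p.lastiter, p.lowerbound,
//                                 p.upperbound, p.stride)
//     lb = load(p.lowerbound) - 1
//     br more != 0 ? header : exit
//   header ... cond ... latch:   ; original loop, IV seeded with lb, compared
//                                ; against the dispatched upper bound, exiting
//                                ; to outer.cond when the chunk is exhausted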
4675
4676/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
4677/// after this \p OldTarget will be orphaned.
4678static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
4679 BasicBlock *NewTarget, DebugLoc DL) {
4680 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
4681 redirectTo(Pred, NewTarget, DL);
4682}
4683
4684/// Determine which blocks in \p BBs are reachable from outside and remove
4685/// from the function the ones that are not.
4686static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
4687 SmallPtrSet<BasicBlock *, 6> BBsToErase{BBs.begin(), BBs.end()};
4688 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
4689 for (Use &U : BB->uses()) {
4690 auto *UseInst = dyn_cast<Instruction>(U.getUser());
4691 if (!UseInst)
4692 continue;
4693 if (BBsToErase.count(UseInst->getParent()))
4694 continue;
4695 return true;
4696 }
4697 return false;
4698 };
4699
4700 while (BBsToErase.remove_if(HasRemainingUses)) {
4701 // Try again if anything was removed.
4702 }
4703
4704 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
4705 DeleteDeadBlocks(BBVec);
4706}
4707
4708CanonicalLoopInfo *
4709OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4710 InsertPointTy ComputeIP) {
4711 assert(Loops.size() >= 1 && "At least one loop required");
4712 size_t NumLoops = Loops.size();
4713
4714 // Nothing to do if there is already just one loop.
4715 if (NumLoops == 1)
4716 return Loops.front();
4717
4718 CanonicalLoopInfo *Outermost = Loops.front();
4719 CanonicalLoopInfo *Innermost = Loops.back();
4720 BasicBlock *OrigPreheader = Outermost->getPreheader();
4721 BasicBlock *OrigAfter = Outermost->getAfter();
4722 Function *F = OrigPreheader->getParent();
4723
4724 // Loop control blocks that may become orphaned later.
4725 SmallVector<BasicBlock *, 12> OldControlBBs;
4726 OldControlBBs.reserve(6 * Loops.size());
4727 for (CanonicalLoopInfo *Loop : Loops)
4728 Loop->collectControlBlocks(OldControlBBs);
4729
4730 // Setup the IRBuilder for inserting the trip count computation.
4731 Builder.SetCurrentDebugLocation(DL);
4732 if (ComputeIP.isSet())
4733 Builder.restoreIP(ComputeIP);
4734 else
4735 Builder.restoreIP(Outermost->getPreheaderIP());
4736
4737 // Derive the collapsed loop's trip count.
4738 // TODO: Find common/largest indvar type.
4739 Value *CollapsedTripCount = nullptr;
4740 for (CanonicalLoopInfo *L : Loops) {
4741 assert(L->isValid() &&
4742 "All loops to collapse must be valid canonical loops");
4743 Value *OrigTripCount = L->getTripCount();
4744 if (!CollapsedTripCount) {
4745 CollapsedTripCount = OrigTripCount;
4746 continue;
4747 }
4748
4749 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
4750 CollapsedTripCount = Builder.CreateMul(CollapsedTripCount, OrigTripCount,
4751 {}, /*HasNUW=*/true);
4752 }
4753
4754 // Create the collapsed loop control flow.
4755 CanonicalLoopInfo *Result =
4756 createLoopSkeleton(DL, CollapsedTripCount, F,
4757 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
4758
4759 // Build the collapsed loop body code.
4760 // Start with deriving the input loop induction variables from the collapsed
4761 // one, using a divmod scheme. To preserve the original loops' order, the
4762 // innermost loop uses the least significant bits.
4763 Builder.restoreIP(Result->getBodyIP());
4764
4765 Value *Leftover = Result->getIndVar();
4766 SmallVector<Value *> NewIndVars;
4767 NewIndVars.resize(NumLoops);
4768 for (int i = NumLoops - 1; i >= 1; --i) {
4769 Value *OrigTripCount = Loops[i]->getTripCount();
4770
4771 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
4772 NewIndVars[i] = NewIndVar;
4773
4774 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
4775 }
4776 // Outermost loop gets all the remaining bits.
4777 NewIndVars[0] = Leftover;
4778
4779 // Construct the loop body control flow.
4780 // We progressively construct the branch structure following the direction of
4781 // the control flow, from the leading in-between code, through the loop nest
4782 // body, the trailing in-between code, and rejoining the collapsed loop's
4783 // latch. ContinueBlock and ContinuePred keep track of the source(s) of the
4784 // next edge. If ContinueBlock is set, continue with that block. If
4785 // ContinuePred, use its predecessors as sources.
4786 BasicBlock *ContinueBlock = Result->getBody();
4787 BasicBlock *ContinuePred = nullptr;
4788 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
4789 BasicBlock *NextSrc) {
4790 if (ContinueBlock)
4791 redirectTo(ContinueBlock, Dest, DL);
4792 else
4793 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
4794
4795 ContinueBlock = nullptr;
4796 ContinuePred = NextSrc;
4797 };
4798
4799 // The code before the nested loop of each level.
4800 // Because we are sinking it into the nest, it will be executed more often
4801 // than the original loop. More sophisticated schemes could keep track of what
4802 // the in-between code is and instantiate it only once per thread.
4803 for (size_t i = 0; i < NumLoops - 1; ++i)
4804 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
4805
4806 // Connect the loop nest body.
4807 ContinueWith(Innermost->getBody(), Innermost->getLatch());
4808
4809 // The code after the nested loop at each level.
4810 for (size_t i = NumLoops - 1; i > 0; --i)
4811 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
4812
4813 // Connect the finished loop to the collapsed loop latch.
4814 ContinueWith(Result->getLatch(), nullptr);
4815
4816 // Replace the input loops with the new collapsed loop.
4817 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
4818 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
4819
4820 // Replace the input loop indvars with the derived ones.
4821 for (size_t i = 0; i < NumLoops; ++i)
4822 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
4823
4824 // Remove unused parts of the input loops.
4825 removeUnusedBlocksFromParent(OldControlBBs);
4826
4827 for (CanonicalLoopInfo *L : Loops)
4828 L->invalidate();
4829
4830#ifndef NDEBUG
4831 Result->assertOK();
4832#endif
4833 return Result;
4834}
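// Worked example (illustrative): collapsing two loops with trip counts TC0
// (outer) and TC1 (inner) yields one loop of TC0 * TC1 iterations whose
// induction variable %iv is decomposed exactly as above:
//
//   %i1 = urem %iv, TC1    ; innermost gets the least significant part
//   %i0 = udiv %iv, TC1    ; outermost gets the remaining bits
//
// so the iteration order matches the original nest.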
4835
4836std::vector<CanonicalLoopInfo *>
4837OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
4838 ArrayRef<Value *> TileSizes) {
4839 assert(TileSizes.size() == Loops.size() &&
4840 "Must pass as many tile sizes as there are loops");
4841 int NumLoops = Loops.size();
4842 assert(NumLoops >= 1 && "At least one loop to tile required");
4843
4844 CanonicalLoopInfo *OutermostLoop = Loops.front();
4845 CanonicalLoopInfo *InnermostLoop = Loops.back();
4846 Function *F = OutermostLoop->getBody()->getParent();
4847 BasicBlock *InnerEnter = InnermostLoop->getBody();
4848 BasicBlock *InnerLatch = InnermostLoop->getLatch();
4849
4850 // Loop control blocks that may become orphaned later.
4851 SmallVector<BasicBlock *, 12> OldControlBBs;
4852 OldControlBBs.reserve(6 * Loops.size());
4853 for (CanonicalLoopInfo *Loop : Loops)
4854 Loop->collectControlBlocks(OldControlBBs);
4855
4856 // Collect original trip counts and induction variable to be accessible by
4857 // index. Also, the structure of the original loops is not preserved during
4858 // the construction of the tiled loops, so do it before we scavenge the BBs of
4859 // any original CanonicalLoopInfo.
4860 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
4861 for (CanonicalLoopInfo *L : Loops) {
4862 assert(L->isValid() && "All input loops must be valid canonical loops");
4863 OrigTripCounts.push_back(L->getTripCount());
4864 OrigIndVars.push_back(L->getIndVar());
4865 }
4866
4867 // Collect the code between loop headers. These may contain SSA definitions
4868 // that are used in the loop nest body. To be usable within the innermost
4869 // body, these BasicBlocks will be sunk into the loop nest body. That is,
4870 // these instructions may be executed more often than before the tiling.
4871 // TODO: It would be sufficient to only sink them into body of the
4872 // corresponding tile loop.
4873 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
4874 for (int i = 0; i < NumLoops - 1; ++i) {
4875 CanonicalLoopInfo *Surrounding = Loops[i];
4876 CanonicalLoopInfo *Nested = Loops[i + 1];
4877
4878 BasicBlock *EnterBB = Surrounding->getBody();
4879 BasicBlock *ExitBB = Nested->getHeader();
4880 InbetweenCode.emplace_back(EnterBB, ExitBB);
4881 }
4882
4883 // Compute the trip counts of the floor loops.
4884 Builder.SetCurrentDebugLocation(DL);
4885 Builder.restoreIP(OutermostLoop->getPreheaderIP());
4886 SmallVector<Value *, 4> FloorCount, FloorRems;
4887 for (int i = 0; i < NumLoops; ++i) {
4888 Value *TileSize = TileSizes[i];
4889 Value *OrigTripCount = OrigTripCounts[i];
4890 Type *IVType = OrigTripCount->getType();
4891
4892 Value *FloorTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
4893 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
4894
4895 // 0 if the tilesize divides the tripcount, 1 otherwise.
4896 // 1 means we need an additional iteration for a partial tile.
4897 //
4898 // Unfortunately we cannot just use the roundup-formula
4899 // (tripcount + tilesize - 1)/tilesize
4900 // because the summation might overflow. We do not want to introduce undefined
4901 // behavior when the untiled loop nest did not.
4902 Value *FloorTripOverflow =
4903 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
4904
4905 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
4906 FloorTripCount =
4907 Builder.CreateAdd(FloorTripCount, FloorTripOverflow,
4908 "omp_floor" + Twine(i) + ".tripcount", true);
4909
4910 // Remember some values for later use.
4911 FloorCount.push_back(FloorTripCount);
4912 FloorRems.push_back(FloorTripRem);
4913 }
4914
4915 // Generate the new loop nest, from the outermost to the innermost.
4916 std::vector<CanonicalLoopInfo *> Result;
4917 Result.reserve(NumLoops * 2);
4918
4919 // The basic block of the surrounding loop that enters the generated loop
4920 // nest.
4921 BasicBlock *Enter = OutermostLoop->getPreheader();
4922
4923 // The basic block of the surrounding loop where the inner code should
4924 // continue.
4925 BasicBlock *Continue = OutermostLoop->getAfter();
4926
4927 // Where the next loop basic block should be inserted.
4928 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
4929
4930 auto EmbeddNewLoop =
4931 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
4932 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
4933 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
4934 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
4935 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
4936 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
4937
4938 // Setup the position where the next embedded loop connects to this loop.
4939 Enter = EmbeddedLoop->getBody();
4940 Continue = EmbeddedLoop->getLatch();
4941 OutroInsertBefore = EmbeddedLoop->getLatch();
4942 return EmbeddedLoop;
4943 };
4944
4945 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
4946 const Twine &NameBase) {
4947 for (auto P : enumerate(TripCounts)) {
4948 CanonicalLoopInfo *EmbeddedLoop =
4949 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
4950 Result.push_back(EmbeddedLoop);
4951 }
4952 };
4953
4954 EmbeddNewLoops(FloorCount, "floor");
4955
4956 // Within the innermost floor loop, emit the code that computes the tile
4957 // sizes.
4958 Builder.SetInsertPoint(Enter->getTerminator());
4959 SmallVector<Value *, 4> TileCounts;
4960 for (int i = 0; i < NumLoops; ++i) {
4961 CanonicalLoopInfo *FloorLoop = Result[i];
4962 Value *TileSize = TileSizes[i];
4963
4964 Value *FloorIsEpilogue =
4965 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCount[i]);
4966 Value *TileTripCount =
4967 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
4968
4969 TileCounts.push_back(TileTripCount);
4970 }
4971
4972 // Create the tile loops.
4973 EmbeddNewLoops(TileCounts, "tile");
4974
4975 // Insert the inbetween code into the body.
4976 BasicBlock *BodyEnter = Enter;
4977 BasicBlock *BodyEntered = nullptr;
4978 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
4979 BasicBlock *EnterBB = P.first;
4980 BasicBlock *ExitBB = P.second;
4981
4982 if (BodyEnter)
4983 redirectTo(BodyEnter, EnterBB, DL);
4984 else
4985 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
4986
4987 BodyEnter = nullptr;
4988 BodyEntered = ExitBB;
4989 }
4990
4991 // Append the original loop nest body into the generated loop nest body.
4992 if (BodyEnter)
4993 redirectTo(BodyEnter, InnerEnter, DL);
4994 else
4995 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
4996 redirectTo(InnerLatch, Continue, DL);
4997
4998 // Replace the original induction variable with an induction variable computed
4999 // from the tile and floor induction variables.
5000 Builder.restoreIP(Result.back()->getBodyIP());
5001 for (int i = 0; i < NumLoops; ++i) {
5002 CanonicalLoopInfo *FloorLoop = Result[i];
5003 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5004 Value *OrigIndVar = OrigIndVars[i];
5005 Value *Size = TileSizes[i];
5006
5007 Value *Scale =
5008 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5009 Value *Shift =
5010 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5011 OrigIndVar->replaceAllUsesWith(Shift);
5012 }
5013
5014 // Remove unused parts of the original loops.
5015 removeUnusedBlocksFromParent(OldControlBBs);
5016
5017 for (CanonicalLoopInfo *L : Loops)
5018 L->invalidate();
5019
5020#ifndef NDEBUG
5021 for (CanonicalLoopInfo *GenL : Result)
5022 GenL->assertOK();
5023#endif
5024 return Result;
5025}
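// Worked example (illustrative): tiling a loop of trip count 10 with tile
// size 4 creates a floor loop of 10/4 + 1 = 3 iterations (the +1 accounts for
// the remainder 10 % 4 = 2) and a tile loop that runs the full tile size for
// complete tiles and the remainder for the final, partial tile. The original
// induction variable is then recomputed as
//
//   %orig_iv = %floor_iv * 4 + %tile_iv     ; the Scale and Shift above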
5026
5027/// Attach metadata \p Properties to the basic block described by \p BB. If the
5028/// basic block already has metadata, the basic block properties are appended.
5029static void addBasicBlockMetadata(BasicBlock *BB,
5030 ArrayRef<Metadata *> Properties) {
5031 // Nothing to do if no property to attach.
5032 if (Properties.empty())
5033 return;
5034
5035 LLVMContext &Ctx = BB->getContext();
5036 SmallVector<Metadata *> NewProperties;
5037 NewProperties.push_back(nullptr);
5038
5039 // If the basic block already has metadata, prepend it to the new metadata.
5040 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5041 if (Existing)
5042 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5043
5044 append_range(NewProperties, Properties);
5045 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5046 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5047
5048 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5049}
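// Illustrative result (not part of this file): attaching
// "llvm.loop.unroll.enable" to a latch produces self-referential loop
// metadata on its terminator, e.g.
//
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.enable"}
//
// The placeholder first operand is replaced with the node itself above to
// form the distinct loop ID.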
5050
5051/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5052/// loop already has metadata, the loop properties are appended.
5053static void addLoopMetadata(CanonicalLoopInfo *Loop,
5054 ArrayRef<Metadata *> Properties) {
5055 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5056
5057 // Attach metadata to the loop's latch
5058 BasicBlock *Latch = Loop->getLatch();
5059 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5060 addBasicBlockMetadata(Latch, Properties);
5061}
5062
5063/// Attach llvm.access.group metadata to the memref instructions of \p Block
5064static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5065 LoopInfo &LI) {
5066 for (Instruction &I : *Block) {
5067 if (I.mayReadOrWriteMemory()) {
5068 // TODO: This instruction may already have access group from
5069 // other pragmas e.g. #pragma clang loop vectorize. Append
5070 // so that the existing metadata is not overwritten.
5071 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5072 }
5073 }
5074}
5075
5076void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5077 LLVMContext &Ctx = Builder.getContext();
5078 addLoopMetadata(
5079 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5080 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5081}
5082
5083void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5084 LLVMContext &Ctx = Builder.getContext();
5085 addLoopMetadata(
5086 Loop, {
5087 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5088 });
5089}
5090
5091void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5092 Value *IfCond, ValueToValueMapTy &VMap,
5093 const Twine &NamePrefix) {
5094 Function *F = CanonicalLoop->getFunction();
5095
5096 // Define where the if branch should be inserted.
5097 Instruction *SplitBefore;
5098 if (Instruction::classof(IfCond)) {
5099 SplitBefore = dyn_cast<Instruction>(IfCond);
5100 } else {
5101 SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
5102 }
5103
5104 // TODO: We should not rely on pass manager. Currently we use pass manager
5105 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5106 // object. We should have a method which returns all blocks between
5107 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5108 FunctionAnalysisManager FAM;
5109 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5110 FAM.registerPass([]() { return LoopAnalysis(); });
5111 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5112
5113 // Get the loop which needs to be cloned
5114 LoopAnalysis LIA;
5115 LoopInfo &&LI = LIA.run(*F, FAM);
5116 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5117
5118 // Create additional blocks for the if statement
5119 BasicBlock *Head = SplitBefore->getParent();
5120 Instruction *HeadOldTerm = Head->getTerminator();
5121 llvm::LLVMContext &C = Head->getContext();
5122 llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
5123 C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
5124 llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
5125 C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
5126
5127 // Create if condition branch.
5128 Builder.SetInsertPoint(HeadOldTerm);
5129 Instruction *BrInstr =
5130 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5131 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5132 // Then block contains branch to omp loop which needs to be vectorized
5133 spliceBB(IP, ThenBlock, false);
5134 ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
5135
5136 Builder.SetInsertPoint(ElseBlock);
5137
5138 // Clone loop for the else branch
5139 SmallVector<BasicBlock *, 8> NewBlocks;
5140
5141 VMap[CanonicalLoop->getPreheader()] = ElseBlock;
5142 for (BasicBlock *Block : L->getBlocks()) {
5143 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5144 NewBB->moveBefore(CanonicalLoop->getExit());
5145 VMap[Block] = NewBB;
5146 NewBlocks.push_back(NewBB);
5147 }
5148 remapInstructionsInBlocks(NewBlocks, VMap);
5149 Builder.CreateBr(NewBlocks.front());
5150}
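// Schematic effect (illustrative): the preheader branch
//
//   head -> loop -> exit
//
// becomes
//
//   head: br i1 %ifcond, label %<prefix>.if.then, label %<prefix>.if.else
//   <prefix>.if.then: original loop (left eligible for vectorization)
//   <prefix>.if.else: cloned loop (the caller marks it non-vectorizable)
//
// with both versions rejoining at the canonical loop's exit block.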
5151
5152unsigned
5153OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5154 const StringMap<bool> &Features) {
5155 if (TargetTriple.isX86()) {
5156 if (Features.lookup("avx512f"))
5157 return 512;
5158 else if (Features.lookup("avx"))
5159 return 256;
5160 return 128;
5161 }
5162 if (TargetTriple.isPPC())
5163 return 128;
5164 if (TargetTriple.isWasm())
5165 return 128;
5166 return 0;
5167}
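// Illustrative example (not part of this file): an x86 target whose feature
// map contains "avx" but not "avx512f" yields 256, matching the widest
// available vector register width in bits; unknown targets return 0.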
5168
5169void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5170 MapVector<Value *, Value *> AlignedVars,
5171 Value *IfCond, OrderKind Order,
5172 ConstantInt *Simdlen, ConstantInt *Safelen) {
5173 LLVMContext &Ctx = Builder.getContext();
5174
5175 Function *F = CanonicalLoop->getFunction();
5176
5177 // TODO: We should not rely on pass manager. Currently we use pass manager
5178 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5179 // object. We should have a method which returns all blocks between
5180 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5181 FunctionAnalysisManager FAM;
5182 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5183 FAM.registerPass([]() { return LoopAnalysis(); });
5184 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5185
5186 LoopAnalysis LIA;
5187 LoopInfo &&LI = LIA.run(*F, FAM);
5188
5189 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5190 if (AlignedVars.size()) {
5191 InsertPointTy IP = Builder.saveIP();
5192 Builder.SetInsertPoint(CanonicalLoop->getPreheader()->getTerminator());
5193 for (auto &AlignedItem : AlignedVars) {
5194 Value *AlignedPtr = AlignedItem.first;
5195 Value *Alignment = AlignedItem.second;
5196 Builder.CreateAlignmentAssumption(F->getDataLayout(),
5197 AlignedPtr, Alignment);
5198 }
5199 Builder.restoreIP(IP);
5200 }
5201
5202 if (IfCond) {
5203 ValueToValueMapTy VMap;
5204 createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
5205 // Add metadata to the cloned loop which disables vectorization
5206 Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
5207 assert(MappedLatch &&
5208 "Cannot find value which corresponds to original loop latch");
5209 assert(isa<BasicBlock>(MappedLatch) &&
5210 "Cannot cast mapped latch block value to BasicBlock");
5211 BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
5212 ConstantAsMetadata *BoolConst =
5213 ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
5214 addBasicBlockMetadata(
5215 NewLatchBlock,
5216 {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
5217 BoolConst})});
5218 }
5219
5220 SmallSet<BasicBlock *, 8> Reachable;
5221
5222 // Get the basic blocks from the loop in which memref instructions
5223 // can be found.
5224 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5225 // preferably without running any passes.
5226 for (BasicBlock *Block : L->getBlocks()) {
5227 if (Block == CanonicalLoop->getCond() ||
5228 Block == CanonicalLoop->getHeader())
5229 continue;
5230 Reachable.insert(Block);
5231 }
5232
5233 SmallVector<Metadata *> LoopMDList;
5234
5235 // In presence of finite 'safelen', it may be unsafe to mark all
5236 // the memory instructions parallel, because loop-carried
5237 // dependences of 'safelen' iterations are possible.
5238 // If clause order(concurrent) is specified then the memory instructions
5239 // are marked parallel even if 'safelen' is finite.
5240 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5241 // Add access group metadata to memory-access instructions.
5242 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5243 for (BasicBlock *BB : Reachable)
5244 addSimdMetadata(BB, AccessGroup, LI);
5245 // TODO: If the loop has existing parallel access metadata, have
5246 // to combine two lists.
5247 LoopMDList.push_back(MDNode::get(
5248 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5249 }
5250
5251 // Use the above access group metadata to create loop level
5252 // metadata, which should be distinct for each loop.
5253 ConstantAsMetadata *BoolConst =
5254 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
5255 LoopMDList.push_back(MDNode::get(
5256 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
5257
5258 if (Simdlen || Safelen) {
5259 // If both simdlen and safelen clauses are specified, the value of the
5260 // simdlen parameter must be less than or equal to the value of the safelen
5261 // parameter. Therefore, use safelen only in the absence of simdlen.
5262 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
5263 LoopMDList.push_back(
5264 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
5265 ConstantAsMetadata::get(VectorizeWidth)}));
5266 }
5267
5268 addLoopMetadata(CanonicalLoop, LoopMDList);
5269}
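// Illustrative result (not part of this file): for `simdlen(8)` with no
// safelen and no order(concurrent) clause, the loop ends up with metadata
// along the lines of
//
//   !llvm.loop !0
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//   !4 = distinct !{}
//
// and every memory access in the loop body is tagged !llvm.access.group !4.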
5270
5271/// Create the TargetMachine object to query the backend for optimization
5272/// preferences.
5273///
5274/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
5275/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
5276/// needed for the LLVM pass pipeline. We use some default options to avoid
5277/// having to pass too many settings from the frontend that probably do not
5278/// matter.
5279///
5280/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
5281/// method. If we are going to use TargetMachine for more purposes, especially
5282/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
5283/// might become worth requiring front-ends to pass on their TargetMachine,
5284/// or at least cache it between methods. Note that while frontends such as Clang
5285/// have just a single main TargetMachine per translation unit, "target-cpu" and
5286/// "target-features" that determine the TargetMachine are per-function and can
5287/// be overridden using __attribute__((target("OPTIONS"))).
5288static std::unique_ptr<TargetMachine>
5290 Module *M = F->getParent();
5291
5292 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
5293 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
5294 const std::string &Triple = M->getTargetTriple();
5295
5296 std::string Error;
5297 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
5298 if (!TheTarget)
5299 return {};
5300
5301 llvm::TargetOptions Options;
5302 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
5303 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
5304 /*CodeModel=*/std::nullopt, OptLevel));
5305}
5306
5307/// Heuristically determine the best-performant unroll factor for \p CLI. This
5308/// depends on the target processor. We are re-using the same heuristics as the
5309/// LoopUnrollPass.
5310static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
5311 Function *F = CLI->getFunction();
5312
5313 // Assume the user requests the most aggressive unrolling, even if the rest of
5314 // the code is optimized using a lower setting.
5315 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
5316 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
5317
5319 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
5320 FAM.registerPass([]() { return AssumptionAnalysis(); });
5321 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5322 FAM.registerPass([]() { return LoopAnalysis(); });
5323 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
5324 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5325 TargetIRAnalysis TIRA;
5326 if (TM)
5327 TIRA = TargetIRAnalysis(
5328 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
5329 FAM.registerPass([&]() { return TIRA; });
5330
5331 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
5332 ScalarEvolutionAnalysis SEA;
5333 ScalarEvolution &&SE = SEA.run(*F, FAM);
5334 DominatorTreeAnalysis DTA;
5335 DominatorTree &&DT = DTA.run(*F, FAM);
5336 LoopAnalysis LIA;
5337 LoopInfo &&LI = LIA.run(*F, FAM);
5338 AssumptionAnalysis ACT;
5339 AssumptionCache &&AC = ACT.run(*F, FAM);
5340 OptimizationRemarkEmitter ORE{F};
5341
5342 Loop *L = LI.getLoopFor(CLI->getHeader());
5343 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
5344
5345 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
5346 L, SE, TTI,
5347 /*BlockFrequencyInfo=*/nullptr,
5348 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
5349 /*UserThreshold=*/std::nullopt,
5350 /*UserCount=*/std::nullopt,
5351 /*UserAllowPartial=*/true,
5352 /*UserAllowRuntime=*/true,
5353 /*UserUpperBound=*/std::nullopt,
5354 /*UserFullUnrollMaxCount=*/std::nullopt);
5355
5356 UP.Force = true;
5357
5358 // Account for additional optimizations taking place before the LoopUnrollPass
5359 // would unroll the loop.
5360 UP.Threshold *= UnrollThresholdFactor;
5361 UP.PartialThreshold *= UnrollThresholdFactor;
5362
5363 // Use normal unroll factors even if the rest of the code is optimized for
5364 // size.
5365 UP.OptSizeThreshold = UP.Threshold;
5366 UP.PartialOptSizeThreshold = UP.PartialThreshold;
5367
5368 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
5369 << " Threshold=" << UP.Threshold << "\n"
5370 << " PartialThreshold=" << UP.PartialThreshold << "\n"
5371 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
5372 << " PartialOptSizeThreshold="
5373 << UP.PartialOptSizeThreshold << "\n");
5374
5375 // Disable peeling.
5376 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
5377 L, SE, TTI,
5378 /*UserAllowPeeling=*/false,
5379 /*UserAllowProfileBasedPeeling=*/false,
5380 /*UnrollingSpecficValues=*/false);
5381
5382 SmallPtrSet<const Value *, 32> EphValues;
5383 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
5384
5385 // Assume that reads and writes to stack variables can be eliminated by
5386 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
5387 // size.
5388 for (BasicBlock *BB : L->blocks()) {
5389 for (Instruction &I : *BB) {
5390 Value *Ptr;
5391 if (auto *Load = dyn_cast<LoadInst>(&I)) {
5392 Ptr = Load->getPointerOperand();
5393 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
5394 Ptr = Store->getPointerOperand();
5395 } else
5396 continue;
5397
5398 Ptr = Ptr->stripPointerCasts();
5399
5400 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
5401 if (Alloca->getParent() == &F->getEntryBlock())
5402 EphValues.insert(&I);
5403 }
5404 }
5405 }
5406
5407 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
5408
5409 // Loop is not unrollable if the loop contains certain instructions.
5410 if (!UCE.canUnroll()) {
5411 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
5412 return 1;
5413 }
5414
5415 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
5416 << "\n");
5417
5418 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
5419 // be able to use it.
5420 int TripCount = 0;
5421 int MaxTripCount = 0;
5422 bool MaxOrZero = false;
5423 unsigned TripMultiple = 0;
5424
5425 bool UseUpperBound = false;
5426 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
5427 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
5428 UseUpperBound);
5429 unsigned Factor = UP.Count;
5430 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
5431
5432 // This function returns 1 to signal to not unroll a loop.
5433 if (Factor == 0)
5434 return 1;
5435 return Factor;
5436}
5437
5438void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
5439 int32_t Factor,
5440 CanonicalLoopInfo **UnrolledCLI) {
5441 assert(Factor >= 0 && "Unroll factor must not be negative");
5442
5443 Function *F = Loop->getFunction();
5444 LLVMContext &Ctx = F->getContext();
5445
5446 // If the unrolled loop is not used for another loop-associated directive, it
5447 // is sufficient to add metadata for the LoopUnrollPass.
5448 if (!UnrolledCLI) {
5449 SmallVector<Metadata *, 2> LoopMetadata;
5450 LoopMetadata.push_back(
5451 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
5452
5453 if (Factor >= 1) {
5454 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5455 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5456 LoopMetadata.push_back(MDNode::get(
5457 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
5458 }
5459
5460 addLoopMetadata(Loop, LoopMetadata);
5461 return;
5462 }
5463
5464 // Heuristically determine the unroll factor.
5465 if (Factor == 0)
5466 Factor = computeHeuristicUnrollFactor(Loop);
5467
5468 // No change required with unroll factor 1.
5469 if (Factor == 1) {
5470 *UnrolledCLI = Loop;
5471 return;
5472 }
5473
5474 assert(Factor >= 2 &&
5475 "unrolling only makes sense with a factor of 2 or larger");
5476
5477 Type *IndVarTy = Loop->getIndVarType();
5478
5479 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
5480 // unroll the inner loop.
5481 Value *FactorVal =
5482 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
5483 /*isSigned=*/false));
5484 std::vector<CanonicalLoopInfo *> LoopNest =
5485 tileLoops(DL, {Loop}, {FactorVal});
5486 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
5487 *UnrolledCLI = LoopNest[0];
5488 CanonicalLoopInfo *InnerLoop = LoopNest[1];
5489
5490 // LoopUnrollPass can only fully unroll loops with constant trip count.
5491 // Unroll by the unroll factor with a fallback epilog for the remainder
5492 // iterations if necessary.
5493 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
5494 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
5495 addLoopMetadata(
5496 InnerLoop,
5497 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5498 MDNode::get(
5499 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
5500
5501#ifndef NDEBUG
5502 (*UnrolledCLI)->assertOK();
5503#endif
5504}
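// Illustrative example: calling unrollLoopPartial with Factor = 4 and a
// non-null UnrolledCLI tiles the loop into a {floor, tile} pair with tile
// size 4, returns the floor loop through *UnrolledCLI, and tags the inner
// tile loop with "llvm.loop.unroll.enable" and "llvm.loop.unroll.count" = 4
// so that LoopUnrollPass later unrolls it completely, with the remainder
// iterations handled by the floor loop's partial tile.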
5505
5506OpenMPIRBuilder::InsertPointTy
5507OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
5508 llvm::Value *BufSize, llvm::Value *CpyBuf,
5509 llvm::Value *CpyFn, llvm::Value *DidIt) {
5510 if (!updateToLocation(Loc))
5511 return Loc.IP;
5512
5513 uint32_t SrcLocStrSize;
5514 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5515 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5516 Value *ThreadId = getOrCreateThreadID(Ident);
5517
5518 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
5519
5520 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
5521
5522 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
5523 Builder.CreateCall(Fn, Args);
5524
5525 return Builder.saveIP();
5526}
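// Illustrative call shape (not part of this file): for one copyprivate
// variable %x with copy helper @copy_fn, the code above emits
//
//   %did_it = load i32, ptr %DidIt
//   call void @__kmpc_copyprivate(ptr %ident, i32 %tid, i64 %bufsize,
//                                 ptr %x, ptr @copy_fn, i32 %did_it)
//
// where %did_it is 1 only in the thread that executed the single region.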
5527
5528OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createSingle(
5529 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5530 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
5531 ArrayRef<llvm::Function *> CPFuncs) {
5532
5533 if (!updateToLocation(Loc))
5534 return Loc.IP;
5535
5536 // If needed, allocate and initialize `DidIt` with 0.
5537 // DidIt: flag variable: 1=single thread; 0=not single thread.
5538 llvm::Value *DidIt = nullptr;
5539 if (!CPVars.empty()) {
5540 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
5541 Builder.CreateStore(Builder.getInt32(0), DidIt);
5542 }
5543
5544 Directive OMPD = Directive::OMPD_single;
5545 uint32_t SrcLocStrSize;
5546 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5547 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5548 Value *ThreadId = getOrCreateThreadID(Ident);
5549 Value *Args[] = {Ident, ThreadId};
5550
5551 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
5552 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5553
5554 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
5555 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5556
5557 auto FiniCBWrapper = [&](InsertPointTy IP) {
5558 FiniCB(IP);
5559
5560 // The thread that executes the single region must set `DidIt` to 1.
5561 // This is used by __kmpc_copyprivate, to know if the caller is the
5562 // single thread or not.
5563 if (DidIt)
5564 Builder.CreateStore(Builder.getInt32(1), DidIt);
5565 };
5566
5567 // generates the following:
5568 // if (__kmpc_single()) {
5569 // .... single region ...
5570 // __kmpc_end_single
5571 // }
5572 // __kmpc_copyprivate
5573 // __kmpc_barrier
5574
5575 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
5576 /*Conditional*/ true,
5577 /*hasFinalize*/ true);
5578
5579 if (DidIt) {
5580 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
5581 // NOTE BufSize is currently unused, so just pass 0.
5582 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
5583 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
5584 CPFuncs[I], DidIt);
5585 // NOTE __kmpc_copyprivate already inserts a barrier
5586 } else if (!IsNowait)
5587 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
5588 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
5589 /* CheckCancelFlag */ false);
5590 return Builder.saveIP();
5591}
5592
5593OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCritical(
5594 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5595 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
5596
5597 if (!updateToLocation(Loc))
5598 return Loc.IP;
5599
5600 Directive OMPD = Directive::OMPD_critical;
5601 uint32_t SrcLocStrSize;
5602 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5603 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5604 Value *ThreadId = getOrCreateThreadID(Ident);
5605 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
5606 Value *Args[] = {Ident, ThreadId, LockVar};
5607
5608 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
5609 Function *RTFn = nullptr;
5610 if (HintInst) {
5611 // Add Hint to entry Args and create call
5612 EnterArgs.push_back(HintInst);
5613 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
5614 } else {
5615 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
5616 }
5617 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
5618
5619 Function *ExitRTLFn =
5620 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
5621 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5622
5623 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5624 /*Conditional*/ false, /*hasFinalize*/ true);
5625}
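// Schematic result (illustrative): the region is bracketed by runtime calls,
//
//   call void @__kmpc_critical(ptr %ident, i32 %tid, ptr %lock)
//   ... critical region body ...
//   call void @__kmpc_end_critical(ptr %ident, i32 %tid, ptr %lock)
//
// with __kmpc_critical_with_hint(ptr %ident, i32 %tid, ptr %lock, i32 %hint)
// used on entry instead when a hint expression is provided.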
5626
5627OpenMPIRBuilder::InsertPointTy
5628OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
5629 InsertPointTy AllocaIP, unsigned NumLoops,
5630 ArrayRef<llvm::Value *> StoreValues,
5631 const Twine &Name, bool IsDependSource) {
5632 assert(
5633 llvm::all_of(StoreValues,
5634 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
5635 "OpenMP runtime requires depend vec with i64 type");
5636
5637 if (!updateToLocation(Loc))
5638 return Loc.IP;
5639
5640 // Allocate space for vector and generate alloc instruction.
5641 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
5642 Builder.restoreIP(AllocaIP);
5643 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
5644 ArgsBase->setAlignment(Align(8));
5645 Builder.restoreIP(Loc.IP);
5646
5647 // Store the index value with offset in depend vector.
5648 for (unsigned I = 0; I < NumLoops; ++I) {
5649 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
5650 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
5651 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
5652 STInst->setAlignment(Align(8));
5653 }
5654
5655 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
5656 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
5657
5658 uint32_t SrcLocStrSize;
5659 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5660 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5661 Value *ThreadId = getOrCreateThreadID(Ident);
5662 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
5663
5664 Function *RTLFn = nullptr;
5665 if (IsDependSource)
5666 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
5667 else
5668 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
5669 Builder.CreateCall(RTLFn, Args);
5670
5671 return Builder.saveIP();
5672}
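// Illustrative example: for `#pragma omp ordered depend(source)` in a
// doacross nest of two associated loops, NumLoops is 2 and StoreValues holds
// the two current iteration numbers as i64; they are packed into the stack
// vector above and handed to __kmpc_doacross_post, whereas a
// `depend(sink: ...)` construct takes the IsDependSource == false path and
// calls __kmpc_doacross_wait with the sink iteration vector.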
5673
5674OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createOrderedThreadsSimd(
5675 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
5676 FinalizeCallbackTy FiniCB, bool IsThreads) {
5677 if (!updateToLocation(Loc))
5678 return Loc.IP;
5679
5680 Directive OMPD = Directive::OMPD_ordered;
5681 Instruction *EntryCall = nullptr;
5682 Instruction *ExitCall = nullptr;
5683
5684 if (IsThreads) {
5685 uint32_t SrcLocStrSize;
5686 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5687 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5688 Value *ThreadId = getOrCreateThreadID(Ident);
5689 Value *Args[] = {Ident, ThreadId};
5690
5691 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
5692 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
5693
5694 Function *ExitRTLFn =
5695 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
5696 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
5697 }
5698
5699 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5700 /*Conditional*/ false, /*hasFinalize*/ true);
5701}
5702
5703OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
5704 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
5705 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
5706 bool HasFinalize, bool IsCancellable) {
5707
5708 if (HasFinalize)
5709 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
5710
5711 // Create inlined region's entry and body blocks, in preparation
5712 // for conditional creation
5713 BasicBlock *EntryBB = Builder.GetInsertBlock();
5714 Instruction *SplitPos = EntryBB->getTerminator();
5715 if (!isa_and_nonnull<BranchInst>(SplitPos))
5716 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
5717 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
5718 BasicBlock *FiniBB =
5719 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
5720
5721 Builder.SetInsertPoint(EntryBB->getTerminator());
5722 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
5723
5724 // generate body
5725 BodyGenCB(/* AllocaIP */ InsertPointTy(),
5726 /* CodeGenIP */ Builder.saveIP());
5727
5728 // emit exit call and do any needed finalization.
5729 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
5730 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
5731 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
5732 "Unexpected control flow graph state!!");
5733 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
5734 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
5735 "Unexpected Control Flow State!");
5736 MergeBlockIntoPredecessor(FiniBB);
5737
5738 // If we are skipping the region of a non-conditional, remove the exit
5739 // block, and clear the builder's insertion point.
5740 assert(SplitPos->getParent() == ExitBB &&
5741 "Unexpected Insertion point location!");
5742 auto merged = MergeBlockIntoPredecessor(ExitBB);
5743 BasicBlock *ExitPredBB = SplitPos->getParent();
5744 auto InsertBB = merged ? ExitPredBB : ExitBB;
5745 if (!isa_and_nonnull<BranchInst>(SplitPos))
5746 SplitPos->eraseFromParent();
5747 Builder.SetInsertPoint(InsertBB);
5748
5749 return Builder.saveIP();
5750}
5751
5752OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
5753 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
5754 // If there is nothing to do, return the current insertion point.
5755 if (!Conditional || !EntryCall)
5756 return Builder.saveIP();
5757
5758 BasicBlock *EntryBB = Builder.GetInsertBlock();
5759 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
5760 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
5761 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
5762
5763 // Emit thenBB and set the Builder's insertion point there for
5764 // body generation next. Place the block after the current block.
5765 Function *CurFn = EntryBB->getParent();
5766 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
5767
5768 // Move Entry branch to end of ThenBB, and replace with conditional
5769 // branch (If-stmt)
5770 Instruction *EntryBBTI = EntryBB->getTerminator();
5771 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
5772 EntryBBTI->removeFromParent();
5773 Builder.SetInsertPoint(UI);
5774 Builder.Insert(EntryBBTI);
5775 UI->eraseFromParent();
5776 Builder.SetInsertPoint(ThenBB->getTerminator());
5777
5778 // return an insertion point to ExitBB.
5779 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
5780}
5781
5782OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
5783 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
5784 bool HasFinalize) {
5785
5786 Builder.restoreIP(FinIP);
5787
5788 // If there is finalization to do, emit it before the exit call
5789 if (HasFinalize) {
5790 assert(!FinalizationStack.empty() &&
5791 "Unexpected finalization stack state!");
5792
5793 FinalizationInfo Fi = FinalizationStack.pop_back_val();
5794 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
5795
5796 Fi.FiniCB(FinIP);
5797
5798 BasicBlock *FiniBB = FinIP.getBlock();
5799 Instruction *FiniBBTI = FiniBB->getTerminator();
5800
5801 // set Builder IP for call creation
5802 Builder.SetInsertPoint(FiniBBTI);
5803 }
5804
5805 if (!ExitCall)
5806 return Builder.saveIP();
5807
5808 // Place the exit call as the last instruction before the finalization block terminator.
5809 ExitCall->removeFromParent();
5810 Builder.Insert(ExitCall);
5811
5812 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
5813 ExitCall->getIterator());
5814}
5815
5816OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
5817 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
5818 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
5819 if (!IP.isSet())
5820 return IP;
5821
5822 IRBuilder<>::InsertPointGuard IPG(Builder);
5823
5824 // creates the following CFG structure
5825 // OMP_Entry : (MasterAddr != PrivateAddr)?
5826 // F T
5827 // | \
5828 // | copyin.not.master
5829 // | /
5830 // v /
5831 // copyin.not.master.end
5832 // |
5833 // v
5834 // OMP.Entry.Next
5835
5836 BasicBlock *OMP_Entry = IP.getBlock();
5837 Function *CurFn = OMP_Entry->getParent();
5838 BasicBlock *CopyBegin =
5839 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
5840 BasicBlock *CopyEnd = nullptr;
5841
5842 // If entry block is terminated, split to preserve the branch to following
5843 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
5844 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
5845 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
5846 "copyin.not.master.end");
5847 OMP_Entry->getTerminator()->eraseFromParent();
5848 } else {
5849 CopyEnd =
5850 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
5851 }
5852
5853 Builder.SetInsertPoint(OMP_Entry);
5854 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
5855 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
5856 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
5857 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
5858
5859 Builder.SetInsertPoint(CopyBegin);
5860 if (BranchtoEnd)
5861 Builder.CreateBr(CopyEnd);
5862
5863 return Builder.saveIP();
5864}
5865
5866CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
5867 Value *Size, Value *Allocator,
5868 std::string Name) {
5869 IRBuilder<>::InsertPointGuard IPG(Builder);
5870 updateToLocation(Loc);
5871
5872 uint32_t SrcLocStrSize;
5873 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5874 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5875 Value *ThreadId = getOrCreateThreadID(Ident);
5876 Value *Args[] = {ThreadId, Size, Allocator};
5877
5878 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
5879
5880 return Builder.CreateCall(Fn, Args, Name);
5881}
5882
5883CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
5884 Value *Addr, Value *Allocator,
5885 std::string Name) {
5886 IRBuilder<>::InsertPointGuard IPG(Builder);
5887 updateToLocation(Loc);
5888
5889 uint32_t SrcLocStrSize;
5890 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5891 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5892 Value *ThreadId = getOrCreateThreadID(Ident);
5893 Value *Args[] = {ThreadId, Addr, Allocator};
5894 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
5895 return Builder.CreateCall(Fn, Args, Name);
5896}
5897
5898CallInst *OpenMPIRBuilder::createOMPInteropInit(
5899 const LocationDescription &Loc, Value *InteropVar,
5900 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
5901 Value *DependenceAddress, bool HaveNowaitClause) {
5902 IRBuilder<>::InsertPointGuard IPG(Builder);
5903 updateToLocation(Loc);
5904
5905 uint32_t SrcLocStrSize;
5906 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5907 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5908 Value *ThreadId = getOrCreateThreadID(Ident);
5909 if (Device == nullptr)
5911 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
5912 if (NumDependences == nullptr) {
5913 NumDependences = ConstantInt::get(Int32, 0);
5914 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5915 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5916 }
5917 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5918 Value *Args[] = {
5919 Ident, ThreadId, InteropVar, InteropTypeVal,
5920 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
5921
5922 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
5923
5924 return Builder.CreateCall(Fn, Args);
5925}
5926
5927CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
5928 const LocationDescription &Loc, Value *InteropVar, Value *Device,
5929 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
5930 IRBuilder<>::InsertPointGuard IPG(Builder);
5931 updateToLocation(Loc);
5932
5933 uint32_t SrcLocStrSize;
5934 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5935 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5936 Value *ThreadId = getOrCreateThreadID(Ident);
5937 if (Device == nullptr)
5939 if (NumDependences == nullptr) {
5940 NumDependences = ConstantInt::get(Int32, 0);
5941 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5942 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5943 }
5944 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5945 Value *Args[] = {
5946 Ident, ThreadId, InteropVar, Device,
5947 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5948
5949 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
5950
5951 return Builder.CreateCall(Fn, Args);
5952}
5953
5954CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
5955 Value *InteropVar, Value *Device,
5956 Value *NumDependences,
5957 Value *DependenceAddress,
5958 bool HaveNowaitClause) {
5959 IRBuilder<>::InsertPointGuard IPG(Builder);
5960 updateToLocation(Loc);
5961 uint32_t SrcLocStrSize;
5962 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5963 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5964 Value *ThreadId = getOrCreateThreadID(Ident);
5965 if (Device == nullptr)
5967 if (NumDependences == nullptr) {
5968 NumDependences = ConstantInt::get(Int32, 0);
5969 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
5970 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
5971 }
5972 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
5973 Value *Args[] = {
5974 Ident, ThreadId, InteropVar, Device,
5975 NumDependences, DependenceAddress, HaveNowaitClauseVal};
5976
5977 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
5978
5979 return Builder.CreateCall(Fn, Args);
5980}
5981
5982CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
5983 const LocationDescription &Loc, llvm::Value *Pointer,
5984 llvm::ConstantInt *Size, const llvm::Twine &Name) {
5985 IRBuilder<>::InsertPointGuard IPG(Builder);
5986 updateToLocation(Loc);
5987
5988 uint32_t SrcLocStrSize;
5989 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5990 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5991 Value *ThreadId = getOrCreateThreadID(Ident);
5992 Constant *ThreadPrivateCache =
5993 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
5994 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
5995
5996 Function *Fn =
5997 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
5998
5999 return Builder.CreateCall(Fn, Args);
6000}
6001
6002OpenMPIRBuilder::InsertPointTy
6003OpenMPIRBuilder::createTargetInit(const LocationDescription &Loc, bool IsSPMD,
6004 int32_t MinThreadsVal, int32_t MaxThreadsVal,
6005 int32_t MinTeamsVal, int32_t MaxTeamsVal) {
6006 if (!updateToLocation(Loc))
6007 return Loc.IP;
6008
6009 uint32_t SrcLocStrSize;
6010 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6011 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6012 Constant *IsSPMDVal = ConstantInt::getSigned(
6013 Int8, IsSPMD ? OMP_TGT_EXEC_MODE_SPMD : OMP_TGT_EXEC_MODE_GENERIC);
6014 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(Int8, !IsSPMD);
6015 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6016 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6017
6018 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6019 Function *Kernel = DebugKernelWrapper;
6020
6021 // We need to strip the debug prefix to get the correct kernel name.
6022 StringRef KernelName = Kernel->getName();
6023 const std::string DebugPrefix = "_debug__";
6024 if (KernelName.ends_with(DebugPrefix)) {
6025 KernelName = KernelName.drop_back(DebugPrefix.length());
6026 Kernel = M.getFunction(KernelName);
6027 assert(Kernel && "Expected the real kernel to exist");
6028 }
6029
6030 // Manifest the launch configuration in the metadata matching the kernel
6031 // environment.
6032 if (MinTeamsVal > 1 || MaxTeamsVal > 0)
6033 writeTeamsForKernel(T, *Kernel, MinTeamsVal, MaxTeamsVal);
6034
6035 // For max values, < 0 means unset, == 0 means set but unknown.
6036 if (MaxThreadsVal < 0)
6037 MaxThreadsVal = std::max(
6038 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), MinThreadsVal);
6039
6040 if (MaxThreadsVal > 0)
6041 writeThreadBoundsForKernel(T, *Kernel, MinThreadsVal, MaxThreadsVal);
6042
6043 Constant *MinThreads = ConstantInt::getSigned(Int32, MinThreadsVal);
6044 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
6045 Constant *MinTeams = ConstantInt::getSigned(Int32, MinTeamsVal);
6046 Constant *MaxTeams = ConstantInt::getSigned(Int32, MaxTeamsVal);
6047 Constant *ReductionDataSize = ConstantInt::getSigned(Int32, 0);
6048 Constant *ReductionBufferLength = ConstantInt::getSigned(Int32, 0);
6049
6050 Function *Fn = getOrCreateRuntimeFunctionPtr(
6051 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6052 const DataLayout &DL = Fn->getDataLayout();
6053
6054 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6055 Constant *DynamicEnvironmentInitializer =
6056 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6057 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6058 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6059 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6060 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6061 DL.getDefaultGlobalsAddressSpace());
6062 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6063
6064 Constant *DynamicEnvironment =
6065 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6066 ? DynamicEnvironmentGV
6067 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6068 DynamicEnvironmentPtr);
6069
6070 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6071 ConfigurationEnvironment, {
6072 UseGenericStateMachineVal,
6073 MayUseNestedParallelismVal,
6074 IsSPMDVal,
6075 MinThreads,
6076 MaxThreads,
6077 MinTeams,
6078 MaxTeams,
6079 ReductionDataSize,
6080 ReductionBufferLength,
6081 });
6082 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6083 KernelEnvironment, {
6084 ConfigurationEnvironmentInitializer,
6085 Ident,
6086 DynamicEnvironment,
6087 });
6088 std::string KernelEnvironmentName =
6089 (KernelName + "_kernel_environment").str();
6090 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6091 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6092 KernelEnvironmentInitializer, KernelEnvironmentName,
6093 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6094 DL.getDefaultGlobalsAddressSpace());
6095 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6096
6097 Constant *KernelEnvironment =
6098 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6099 ? KernelEnvironmentGV
6100 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6101 KernelEnvironmentPtr);
6102 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6103 CallInst *ThreadKind =
6104 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6105
6106 Value *ExecUserCode = Builder.CreateICmpEQ(
6107 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6108 "exec_user_code");
6109
6110 // ThreadKind = __kmpc_target_init(...)
6111 // if (ThreadKind == -1)
6112 // user_code
6113 // else
6114 // return;
6115
6116 auto *UI = Builder.CreateUnreachable();
6117 BasicBlock *CheckBB = UI->getParent();
6118 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6119
6120 BasicBlock *WorkerExitBB = BasicBlock::Create(
6121 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6122 Builder.SetInsertPoint(WorkerExitBB);
6123 Builder.CreateRetVoid();
6124
6125 auto *CheckBBTI = CheckBB->getTerminator();
6126 Builder.SetInsertPoint(CheckBBTI);
6127 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6128
6129 CheckBBTI->eraseFromParent();
6130 UI->eraseFromParent();
6131
6132 // Continue in the "user_code" block, see diagram above and in
6133 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6134 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6135}
6136
6137 void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
6138     int32_t TeamsReductionDataSize,
6139 int32_t TeamsReductionBufferLength) {
6140 if (!updateToLocation(Loc))
6141 return;
6142
6143 Function *Fn = getOrCreateRuntimeFunctionPtr(
6144     omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
6145
6146 Builder.CreateCall(Fn, {});
6147
6148 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
6149 return;
6150
6151 Function *Kernel = Builder.GetInsertBlock()->getParent();
6152 // We need to strip the "_debug__" suffix to get the real kernel name.
6153 StringRef KernelName = Kernel->getName();
6154 const std::string DebugPrefix = "_debug__";
6155 if (KernelName.ends_with(DebugPrefix))
6156 KernelName = KernelName.drop_back(DebugPrefix.length());
6157 auto *KernelEnvironmentGV =
6158 M.getNamedGlobal((KernelName + "_kernel_environment").str());
6159 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
6160 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
6161 auto *NewInitializer = ConstantFoldInsertValueInstruction(
6162 KernelEnvironmentInitializer,
6163 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
6164 NewInitializer = ConstantFoldInsertValueInstruction(
6165 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
6166 {0, 8});
6167 KernelEnvironmentGV->setInitializer(NewInitializer);
6168}
6169
6170 static MDNode *getNVPTXMDNode(Function &Kernel, StringRef Name) {
6171 Module &M = *Kernel.getParent();
6172 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6173 for (auto *Op : MD->operands()) {
6174 if (Op->getNumOperands() != 3)
6175 continue;
6176 auto *KernelOp = dyn_cast<ConstantAsMetadata>(Op->getOperand(0));
6177 if (!KernelOp || KernelOp->getValue() != &Kernel)
6178 continue;
6179 auto *Prop = dyn_cast<MDString>(Op->getOperand(1));
6180 if (!Prop || Prop->getString() != Name)
6181 continue;
6182 return Op;
6183 }
6184 return nullptr;
6185}
6186
6187 static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value,
6188     bool Min) {
6189 // Update the "maxntidx" metadata for NVIDIA, or add it.
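// Each matching operand of !nvvm.annotations has the shape
//   !{ptr @kernel, !"maxntidx", i32 <limit>}
// (see getNVPTXMDNode above); operand 2 holds the limit updated here.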
6190 MDNode *ExistingOp = getNVPTXMDNode(Kernel, Name);
6191 if (ExistingOp) {
6192 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6193 int32_t OldLimit = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6194 ExistingOp->replaceOperandWith(
6195 2, ConstantAsMetadata::get(ConstantInt::get(
6196 OldVal->getValue()->getType(),
6197 Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value))));
6198 } else {
6199 LLVMContext &Ctx = Kernel.getContext();
6200 Metadata *MDVals[] = {ConstantAsMetadata::get(&Kernel),
6201     MDString::get(Ctx, Name),
6202     ConstantAsMetadata::get(
6203         ConstantInt::get(Type::getInt32Ty(Ctx), Value))};
6204 // Append metadata to nvvm.annotations
6205 Module &M = *Kernel.getParent();
6206 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
6207 MD->addOperand(MDNode::get(Ctx, MDVals));
6208 }
6209}
6210
6211 std::pair<int32_t, int32_t>
6212 OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6213 int32_t ThreadLimit =
6214 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6215
6216 if (T.isAMDGPU()) {
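// The attribute value has the form "<min>,<max>"; e.g.
//   "amdgpu-flat-work-group-size"="1,256"
// parses to the bounds {1, 256} below (before any omp_target_thread_limit
// clamping).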
6217 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6218 if (!Attr.isValid() || !Attr.isStringAttribute())
6219 return {0, ThreadLimit};
6220 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6221 int32_t LB, UB;
6222 if (!llvm::to_integer(UBStr, UB, 10))
6223 return {0, ThreadLimit};
6224 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6225 if (!llvm::to_integer(LBStr, LB, 10))
6226 return {0, UB};
6227 return {LB, UB};
6228 }
6229
6230 if (MDNode *ExistingOp = getNVPTXMDNode(Kernel, "maxntidx")) {
6231 auto *OldVal = cast<ConstantAsMetadata>(ExistingOp->getOperand(2));
6232 int32_t UB = cast<ConstantInt>(OldVal->getValue())->getZExtValue();
6233 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6234 }
6235 return {0, ThreadLimit};
6236}
6237
6238 void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6239     Function &Kernel, int32_t LB,
6240 int32_t UB) {
6241 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6242
6243 if (T.isAMDGPU()) {
6244 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6245 llvm::utostr(LB) + "," + llvm::utostr(UB));
6246 return;
6247 }
6248
6249 updateNVPTXMetadata(Kernel, "maxntidx", UB, true);
6250}
6251
6252 std::pair<int32_t, int32_t>
6253 OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &T, Function &Kernel) {
6254 // TODO: Read from backend annotations if available.
6255 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
6256}
6257
6259 int32_t LB, int32_t UB) {
6260 if (T.isNVPTX())
6261 if (UB > 0)
6262 updateNVPTXMetadata(Kernel, "maxclusterrank", UB, true);
6263 if (T.isAMDGPU())
6264 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
6265
6266 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
6267}
6268
6269 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
6270 Function *OutlinedFn) {
6271 if (Config.isTargetDevice()) {
6272     OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
6273     // TODO: Determine if DSO local can be set to true.
6274     OutlinedFn->setDSOLocal(false);
6275     OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
6276     if (T.isAMDGCN())
6277       OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
6278   }
6279}
6280
6281 Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
6282 StringRef EntryFnIDName) {
6283 if (Config.isTargetDevice()) {
6284 assert(OutlinedFn && "The outlined function must exist if embedded");
6285 return OutlinedFn;
6286 }
6287
6288 return new GlobalVariable(
6289 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
6290 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
6291}
6292
6293Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
6294 StringRef EntryFnName) {
6295 if (OutlinedFn)
6296 return OutlinedFn;
6297
6298 assert(!M.getGlobalVariable(EntryFnName, true) &&
6299 "Named kernel already exists?");
6300 return new GlobalVariable(
6301 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
6302 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
6303}
6304
6305 void OpenMPIRBuilder::emitTargetRegionFunction(
6306     TargetRegionEntryInfo &EntryInfo,
6307 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
6308 Function *&OutlinedFn, Constant *&OutlinedFnID) {
6309
6310 SmallString<64> EntryFnName;
6311 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
6312
6313 OutlinedFn = Config.isTargetDevice() || !Config.openMPOffloadMandatory()
6314     ? GenerateFunctionCallback(EntryFnName)
6315 : nullptr;
6316
6317 // If this target outline function is not an offload entry, we don't need to
6318 // register it. This may be the case with a false 'if' clause, or if there are
6319 // no OpenMP targets.
6320 if (!IsOffloadEntry)
6321 return;
6322
6323 std::string EntryFnIDName =
6324     Config.isTargetDevice()
6325         ? std::string(EntryFnName)
6326 : createPlatformSpecificName({EntryFnName, "region_id"});
6327
6328 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
6329 EntryFnName, EntryFnIDName);
6330}
6331
6332 Constant *OpenMPIRBuilder::registerTargetRegionFunction(
6333     TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
6334 StringRef EntryFnName, StringRef EntryFnIDName) {
6335 if (OutlinedFn)
6336 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
6337 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
6338 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
6339 OffloadInfoManager.registerTargetRegionEntryInfo(
6340     EntryInfo, EntryAddr, OutlinedFnID,
6341     OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
6342 return OutlinedFnID;
6343}
6344
6345 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetData(
6346     const LocationDescription &Loc, InsertPointTy AllocaIP,
6347 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
6348 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
6349 omp::RuntimeFunction *MapperFunc,
6350 function_ref<InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)>
6351 BodyGenCB,
6352 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
6353 function_ref<Value *(unsigned int)> CustomMapperCB, Value *SrcLocInfo) {
6354 if (!updateToLocation(Loc))
6355 return InsertPointTy();
6356
6357 // Disable TargetData CodeGen on Device pass.
6358 if (Config.IsTargetDevice.value_or(false)) {
6359 if (BodyGenCB)
6360       Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
6361     return Builder.saveIP();
6362 }
6363
6364 Builder.restoreIP(CodeGenIP);
6365 bool IsStandAlone = !BodyGenCB;
6366 MapInfosTy *MapInfo;
6367 // Generate the code for the opening of the data environment. Capture all the
6368 // arguments of the runtime call by reference because they are used in the
6369 // closing of the region.
6370 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6371 MapInfo = &GenMapInfoCB(Builder.saveIP());
6372 emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info,
6373 /*IsNonContiguous=*/true, DeviceAddrCB,
6374 CustomMapperCB);
6375
6376 TargetDataRTArgs RTArgs;
6377     emitOffloadingArraysArgument(Builder, RTArgs, Info);
6378
6379 // Emit the number of elements in the offloading arrays.
6380 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6381
6382 // Source location for the ident struct
6383 if (!SrcLocInfo) {
6384 uint32_t SrcLocStrSize;
6385 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6386 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6387 }
6388
6389 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6390 PointerNum, RTArgs.BasePointersArray,
6391 RTArgs.PointersArray, RTArgs.SizesArray,
6392 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6393 RTArgs.MappersArray};
6394
6395 if (IsStandAlone) {
6396 assert(MapperFunc && "MapperFunc missing for standalone target data");
6397       Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
6398                          OffloadingArgs);
6399 } else {
6400 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
6401 omp::OMPRTL___tgt_target_data_begin_mapper);
6402
6403 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
6404
6405 for (auto DeviceMap : Info.DevicePtrInfoMap) {
6406 if (isa<AllocaInst>(DeviceMap.second.second)) {
6407 auto *LI =
6408 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
6409 Builder.CreateStore(LI, DeviceMap.second.second);
6410 }
6411 }
6412
6413 // If device pointer privatization is required, emit the body of the
6414 // region here. It will have to be duplicated: with and without
6415 // privatization.
6416       Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::Priv));
6417     }
6418 };
6419
6420 // If we need device pointer privatization, we need to emit the body of the
6421 // region with no privatization in the 'else' branch of the conditional.
6422 // Otherwise, we don't have to do anything.
6423 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6424     Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv));
6425   };
6426
6427 // Generate code for the closing of the data region.
6428 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
6429 TargetDataRTArgs RTArgs;
6430 Info.EmitDebug = !MapInfo->Names.empty();
6431 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
6432
6433 // Emit the number of elements in the offloading arrays.
6434 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
6435
6436 // Source location for the ident struct
6437 if (!SrcLocInfo) {
6438 uint32_t SrcLocStrSize;
6439 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6440 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6441 }
6442
6443 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
6444 PointerNum, RTArgs.BasePointersArray,
6445 RTArgs.PointersArray, RTArgs.SizesArray,
6446 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
6447 RTArgs.MappersArray};
6448 Function *EndMapperFunc =
6449 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
6450
6451 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
6452 };
6453
6454 // We don't have to do anything to close the region if the if clause evaluates
6455 // to false.
6456 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {};
6457
6458 if (BodyGenCB) {
6459 if (IfCond) {
6460 emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
6461 } else {
6462 BeginThenGen(AllocaIP, Builder.saveIP());
6463 }
6464
6465 // If we don't require privatization of device pointers, we emit the body in
6466 // between the runtime calls. This avoids duplicating the body code.
6467     Builder.restoreIP(BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv));
6468
6469 if (IfCond) {
6470 emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
6471 } else {
6472 EndThenGen(AllocaIP, Builder.saveIP());
6473 }
6474 } else {
6475 if (IfCond) {
6476 emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
6477 } else {
6478 BeginThenGen(AllocaIP, Builder.saveIP());
6479 }
6480 }
6481
6482 return Builder.saveIP();
6483}
6484
6485 FunctionCallee OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize,
6486     bool IVSigned,
6487     bool IsGPUDistribute) {
6488 assert((IVSize == 32 || IVSize == 64) &&
6489 "IV size is not compatible with the omp runtime");
6490 RuntimeFunction Name;
6491 if (IsGPUDistribute)
6492 Name = IVSize == 32
6493 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
6494 : omp::OMPRTL___kmpc_distribute_static_init_4u)
6495 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
6496 : omp::OMPRTL___kmpc_distribute_static_init_8u);
6497 else
6498 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
6499 : omp::OMPRTL___kmpc_for_static_init_4u)
6500 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
6501 : omp::OMPRTL___kmpc_for_static_init_8u);
6502
6503 return getOrCreateRuntimeFunction(M, Name);
6504 }
6505
6506 FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
6507     bool IVSigned) {
6508 assert((IVSize == 32 || IVSize == 64) &&
6509 "IV size is not compatible with the omp runtime");
6510 RuntimeFunction Name = IVSize == 32
6511 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
6512 : omp::OMPRTL___kmpc_dispatch_init_4u)
6513 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
6514 : omp::OMPRTL___kmpc_dispatch_init_8u);
6515
6516 return getOrCreateRuntimeFunction(M, Name);
6517 }
6518
6519 FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
6520     bool IVSigned) {
6521 assert((IVSize == 32 || IVSize == 64) &&
6522 "IV size is not compatible with the omp runtime");
6523 RuntimeFunction Name = IVSize == 32
6524 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
6525 : omp::OMPRTL___kmpc_dispatch_next_4u)
6526 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
6527 : omp::OMPRTL___kmpc_dispatch_next_8u);
6528
6529 return getOrCreateRuntimeFunction(M, Name);
6530 }
6531
6532 FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
6533     bool IVSigned) {
6534 assert((IVSize == 32 || IVSize == 64) &&
6535 "IV size is not compatible with the omp runtime");
6536 RuntimeFunction Name = IVSize == 32
6537 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
6538 : omp::OMPRTL___kmpc_dispatch_fini_4u)
6539 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
6540 : omp::OMPRTL___kmpc_dispatch_fini_8u);
6541
6542 return getOrCreateRuntimeFunction(M, Name);
6543 }
6544
6545 FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
6546 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
6547}
6548
6549 static Function *createOutlinedFunction(
6550     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName,
6551     SmallVectorImpl<Value *> &Inputs,
6552     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6553     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6554   SmallVector<Type *> ParameterTypes;
6555 if (OMPBuilder.Config.isTargetDevice()) {
6556 // Add the "implicit" runtime argument we use to provide launch specific
6557 // information for target devices.
6558 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
6559 ParameterTypes.push_back(Int8PtrTy);
6560
6561 // All parameters to target devices are passed as pointers
6562 // or i64. This assumes 64-bit address spaces/pointers.
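// E.g., a scalar 'float' input is passed as an i64 kernel parameter below,
// while pointer-typed inputs keep their pointer type.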
6563 for (auto &Arg : Inputs)
6564 ParameterTypes.push_back(Arg->getType()->isPointerTy()
6565 ? Arg->getType()
6566 : Type::getInt64Ty(Builder.getContext()));
6567 } else {
6568 for (auto &Arg : Inputs)
6569 ParameterTypes.push_back(Arg->getType());
6570 }
6571
6572 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
6573 /*isVarArg*/ false);
6574 auto Func = Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName,
6575 Builder.GetInsertBlock()->getModule());
6576
6577 // Save insert point.
6578 auto OldInsertPoint = Builder.saveIP();
6579
6580 // Generate the region into the function.
6581 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
6582 Builder.SetInsertPoint(EntryBB);
6583
6584 // Insert target init call in the device compilation pass.
6585 if (OMPBuilder.Config.isTargetDevice())
6586 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, /*IsSPMD*/ false));
6587
6588 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
6589
6590 // As we embed the user code in the middle of our target region after we
6591 // generate entry code, we must move what allocas we can into the entry
6592 // block to avoid possibly breaking optimizations for the device.
6593 if (OMPBuilder.Config.isTargetDevice())
6594     OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
6595
6596 // Insert target deinit call in the device compilation pass.
6597 Builder.restoreIP(CBFunc(Builder.saveIP(), Builder.saveIP()));
6598 if (OMPBuilder.Config.isTargetDevice())
6599 OMPBuilder.createTargetDeinit(Builder);
6600
6601 // Insert return instruction.
6602 Builder.CreateRetVoid();
6603
6604 // New Alloca IP at entry point of created device function.
6605 Builder.SetInsertPoint(EntryBB->getFirstNonPHI());
6606 auto AllocaIP = Builder.saveIP();
6607
6608 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
6609
6610 // Skip the artificial dyn_ptr on the device.
6611 const auto &ArgRange =
6612 OMPBuilder.Config.isTargetDevice()
6613 ? make_range(Func->arg_begin() + 1, Func->arg_end())
6614 : Func->args();
6615
6616 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
6617 // Things like GEP's can come in the form of Constants. Constants and
6618 // ConstantExpr's do not have access to the knowledge of what they're
6619 // contained in, so we must dig a little to find an instruction so we
6620 // can tell if they're used inside of the function we're outlining. We
6621 // also replace the original constant expression with a new instruction
6622 // equivalent, since an instruction allows easy modification in the
6623 // following loop, as we can now know the constant (instruction) is
6624 // owned by our target function and replaceUsesOfWith can now be invoked
6625 // on it (cannot do this with constants it seems). A brand new one also
6626 // allows us to be cautious as it is perhaps possible the old expression
6627 // was used inside of the function but exists and is used externally
6628 // (unlikely by the nature of a Constant, but still).
6629 // NOTE: We cannot remove dead constants that have been rewritten to
6630 // instructions at this stage, we run the risk of breaking later lowering
6631 // by doing so as we could still be in the process of lowering the module
6632 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
6633 // constants we have created rewritten versions of.
6634 if (auto *Const = dyn_cast<Constant>(Input))
6635 convertUsersOfConstantsToInstructions(Const, Func, false);
6636
6637 // Collect all the instructions
6638 for (User *User : make_early_inc_range(Input->users()))
6639 if (auto *Instr = dyn_cast<Instruction>(User))
6640 if (Instr->getFunction() == Func)
6641 Instr->replaceUsesOfWith(Input, InputCopy);
6642 };
6643
6644 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
6645
6646 // Rewrite uses of input values to parameters.
6647 for (auto InArg : zip(Inputs, ArgRange)) {
6648 Value *Input = std::get<0>(InArg);
6649 Argument &Arg = std::get<1>(InArg);
6650 Value *InputCopy = nullptr;
6651
6652 Builder.restoreIP(
6653 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP()));
6654
6655 // In certain cases a Global may be set up for replacement; however, this
6656 // Global may be used in multiple arguments to the kernel, just segmented
6657 // apart. For example, if we have a global array that is sectioned into
6658 // multiple mappings (technically not legal in OpenMP, but there is a case
6659 // in Fortran for Common Blocks where this is necessary), we will end up
6660 // with GEPs into this array inside the kernel that refer to the Global
6661 // but are technically separate arguments to the kernel for all intents and
6662 // purposes. If we have mapped a segment that requires a GEP into the 0-th
6663 // index, it will fold into a reference to the Global; if we then encounter
6664 // this folded GEP during replacement, all of the references to the
6665 // Global in the kernel will be replaced with the argument we have generated
6666 // that corresponds to it, including any other GEPs that refer to the
6667 // Global that may be other arguments. This would invalidate all of the
6668 // preceding mapped arguments that refer to the same global and that may be
6669 // separate segments. To prevent this, we defer global processing until all
6670 // other processing has been performed.
6671 if (llvm::isa<llvm::GlobalValue>(std::get<0>(InArg)) ||
6672 llvm::isa<llvm::GlobalObject>(std::get<0>(InArg)) ||
6673 llvm::isa<llvm::GlobalVariable>(std::get<0>(InArg))) {
6674 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
6675 continue;
6676 }
6677
6678 ReplaceValue(Input, InputCopy, Func);
6679 }
6680
6681 // Replace all of our deferred Input values, currently just Globals.
6682 for (auto Deferred : DeferredReplacement)
6683 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
6684
6685 // Restore insert point.
6686 Builder.restoreIP(OldInsertPoint);
6687
6688 return Func;
6689}
6690
6691/// Create an entry point for a target task with the following.
6692/// It'll have the following signature
6693/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
6694/// This function is called from emitTargetTask once the
6695/// code to launch the target kernel has been outlined already.
6696 static Function *emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder,
6697     IRBuilderBase &Builder,
6698 CallInst *StaleCI) {
6699 Module &M = OMPBuilder.M;
6700 // KernelLaunchFunction is the target launch function, i.e.
6701 // the function that sets up kernel arguments and calls
6702 // __tgt_target_kernel to launch the kernel on the device.
6703 //
6704 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
6705
6706 // StaleCI is the CallInst which is the call to the outlined
6707 // target kernel launch function. If there are values that the
6708 // outlined function uses then these are aggregated into a structure
6709 // which is passed as the second argument. If not, then there's
6710 // only one argument, the threadID. So, StaleCI can be
6711 //
6712 // %structArg = alloca { ptr, ptr }, align 8
6713 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
6714 // store ptr %20, ptr %gep_, align 8
6715 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
6716 // store ptr %21, ptr %gep_8, align 8
6717 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
6718 //
6719 // OR
6720 //
6721 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
6722 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
6723     StaleCI->getIterator());
6724 LLVMContext &Ctx = StaleCI->getParent()->getContext();
6725 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
6726 Type *TaskPtrTy = OMPBuilder.TaskPtr;
6727 Type *TaskTy = OMPBuilder.Task;
6728 auto ProxyFnTy =
6729 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
6730 /* isVarArg */ false);
6731 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
6732 ".omp_target_task_proxy_func",
6733 Builder.GetInsertBlock()->getModule());
6734 ProxyFn->getArg(0)->setName("thread.id");
6735 ProxyFn->getArg(1)->setName("task");
6736
6737 BasicBlock *EntryBB =
6738 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
6739 Builder.SetInsertPoint(EntryBB);
6740
6741 bool HasShareds = StaleCI->arg_size() > 1;
6742 // TODO: This is a temporary assert to prove to ourselves that
6743 // the outlined target launch function is always going to have
6744 // at most two arguments if there is any data shared between
6745 // host and device.
6746 assert((!HasShareds || (StaleCI->arg_size() == 2)) &&
6747 "StaleCI with shareds should have exactly two arguments.");
6748 if (HasShareds) {
6749 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6750 assert(ArgStructAlloca &&
6751 "Unable to find the alloca instruction corresponding to arguments "
6752 "for extracted function");
6753 auto *ArgStructType =
6754 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
6755
6756 AllocaInst *NewArgStructAlloca =
6757 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
6758 Value *TaskT = ProxyFn->getArg(1);
6759 Value *ThreadId = ProxyFn->getArg(0);
6760 Value *SharedsSize =
6761 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6762
6763 Value *Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
6764 LoadInst *LoadShared =
6765 Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
6766
6767 Builder.CreateMemCpy(
6768 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
6769 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
6770
6771 Builder.CreateCall(KernelLaunchFunction, {ThreadId, NewArgStructAlloca});
6772 }
6773 Builder.CreateRetVoid();
6774 return ProxyFn;
6775}
6776 static void emitTargetOutlinedFunction(
6777     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
6778 TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn,
6779 Constant *&OutlinedFnID, SmallVectorImpl<Value *> &Inputs,
6780     OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
6781     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
6782
6783 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
6784 [&OMPBuilder, &Builder, &Inputs, &CBFunc,
6785 &ArgAccessorFuncCB](StringRef EntryFnName) {
6786 return createOutlinedFunction(OMPBuilder, Builder, EntryFnName, Inputs,
6787 CBFunc, ArgAccessorFuncCB);
6788 };
6789
6790 OMPBuilder.emitTargetRegionFunction(EntryInfo, GenerateOutlinedFunction,
6791 IsOffloadEntry, OutlinedFn, OutlinedFnID);
6792}
6793 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetTask(
6794     Function *OutlinedFn, Value *OutlinedFnID,
6795 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
6796 Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP,
6797     SmallVector<OpenMPIRBuilder::DependData> Dependencies,
6798     bool HasNoWait) {
6799
6800 // When we arrive at this function, the target region itself has been
6801 // outlined into the function OutlinedFn.
6802 // So at this point, for
6803 // --------------------------------------------------
6804 // void user_code_that_offloads(...) {
6805 // omp target depend(..) map(from:a) map(to:b, c)
6806 // a = b + c
6807 // }
6808 //
6809 // --------------------------------------------------
6810 //
6811 // we have
6812 //
6813 // --------------------------------------------------
6814 //
6815 // void user_code_that_offloads(...) {
6816 // %.offload_baseptrs = alloca [3 x ptr], align 8
6817 // %.offload_ptrs = alloca [3 x ptr], align 8
6818 // %.offload_mappers = alloca [3 x ptr], align 8
6819 // ;; target region has been outlined and now we need to
6820 // ;; offload to it via a target task.
6821 // }
6822 // void outlined_device_function(ptr a, ptr b, ptr c) {
6823 // *a = *b + *c
6824 // }
6825 //
6826 // We have to now do the following
6827 // (i) Make an offloading call to outlined_device_function using the OpenMP
6828 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
6829 // emitted by emitKernelLaunch
6830 // (ii) Create a task entry point function that calls kernel_launch_function
6831 // and is the entry point for the target task. See
6832 // '@.omp_target_task_proxy_func in the pseudocode below.
6833 // (iii) Create a task with the task entry point created in (ii)
6834 //
6835 // That is we create the following
6836 //
6837 // void user_code_that_offloads(...) {
6838 // %.offload_baseptrs = alloca [3 x ptr], align 8
6839 // %.offload_ptrs = alloca [3 x ptr], align 8
6840 // %.offload_mappers = alloca [3 x ptr], align 8
6841 //
6842 // %structArg = alloca { ptr, ptr, ptr }, align 8
6843 // %structArg[0] = %.offload_baseptrs
6844 // %structArg[1] = %.offload_ptrs
6845 // %structArg[2] = %.offload_mappers
6846 // proxy_target_task = @__kmpc_omp_task_alloc(...,
6847 // @.omp_target_task_proxy_func)
6848 // memcpy(proxy_target_task->shareds, %structArg, sizeof(structArg))
6849 // dependencies_array = ...
6850 // ;; if nowait not present
6851 // call @__kmpc_omp_wait_deps(..., dependencies_array)
6852 // call @__kmpc_omp_task_begin_if0(...)
6853 // call @.omp_target_task_proxy_func(i32 thread_id, ptr %proxy_target_task)
6854 // call @__kmpc_omp_task_complete_if0(...)
6855 // }
6856 //
6857 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
6858 // ptr %task) {
6859 // %structArg = alloca {ptr, ptr, ptr}
6860 // %shared_data = load (getelementptr %task, 0, 0)
6861 // memcpy(%structArg, %shared_data, sizeof(structArg))
6862 // kernel_launch_function(%thread.id, %structArg)
6863 // }
6864 //
6865 // We need the proxy function because the signature of the task entry point
6866 // expected by kmpc_omp_task is always the same and will be different from
6867 // that of the kernel_launch function.
6868 //
6869 // kernel_launch_function is generated by emitKernelLaunch and has the
6870 // always_inline attribute.
6871 // void kernel_launch_function(thread_id,
6872 // structArg) alwaysinline {
6873 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
6874 // offload_baseptrs = load(getelementptr structArg, 0, 0)
6875 // offload_ptrs = load(getelementptr structArg, 0, 1)
6876 // offload_mappers = load(getelementptr structArg, 0, 2)
6877 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
6878 // ; offload_mappers
6879 // call i32 @__tgt_target_kernel(...,
6880 // outlined_device_function,
6881 // ptr %kernel_args)
6882 // }
6883 // void outlined_device_function(ptr a, ptr b, ptr c) {
6884 // *a = *b + *c
6885 // }
6886 //
6887 BasicBlock *TargetTaskBodyBB =
6888 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
6889 BasicBlock *TargetTaskAllocaBB =
6890 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
6891
6892 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
6893 TargetTaskAllocaBB->begin());
6894 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
6895
6896 OutlineInfo OI;
6897 OI.EntryBB = TargetTaskAllocaBB;
6898 OI.OuterAllocaBB = AllocaIP.getBlock();
6899
6900 // Add the thread ID argument.
6901 SmallVector<Instruction *, 4> ToBeDeleted;
6902 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
6903     Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
6904
6905 Builder.restoreIP(TargetTaskBodyIP);
6906
6907 if (OutlinedFnID) {
6908 // emitKernelLaunch makes the necessary runtime call to offload the kernel.
6909 // We then outline all that code into a separate function
6910 // ('kernel_launch_function' in the pseudo code above). This function is
6911 // then called by the target task proxy function (see
6912 // '@.omp_target_task_proxy_func' in the pseudo code above)
6913 // "@.omp_target_task_proxy_func' is generated by
6914 // emitTargetTaskProxyFunction.
6915 Builder.restoreIP(emitKernelLaunch(Builder, OutlinedFn, OutlinedFnID,
6916 EmitTargetCallFallbackCB, Args, DeviceID,
6917 RTLoc, TargetTaskAllocaIP));
6918 } else {
6919 // When OutlinedFnID is set to nullptr, then it's not an offloading call. In
6920 // this case, we execute the host implementation directly.
6921 Builder.restoreIP(EmitTargetCallFallbackCB(Builder.saveIP()));
6922 }
6923
6924 OI.ExitBB = Builder.saveIP().getBlock();
6925 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies,
6926 HasNoWait](Function &OutlinedFn) mutable {
6927 assert(OutlinedFn.getNumUses() == 1 &&
6928 "there must be a single user for the outlined function");
6929
6930 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
6931 bool HasShareds = StaleCI->arg_size() > 1;
6932
6933 Function *ProxyFn = emitTargetTaskProxyFunction(*this, Builder, StaleCI);
6934
6935 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
6936 << "\n");
6937
6938 Builder.SetInsertPoint(StaleCI);
6939
6940 // Gather the arguments for emitting the runtime call.
6941 uint32_t SrcLocStrSize;
6942 Constant *SrcLocStr =
6943         getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
6944     Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6945
6946 // @__kmpc_omp_task_alloc
6947 Function *TaskAllocFn =
6948 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
6949
6950 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
6951 // call.
6952 Value *ThreadID = getOrCreateThreadID(Ident);
6953
6954 // Argument - `sizeof_kmp_task_t` (TaskSize)
6955 // Tasksize refers to the size in bytes of kmp_task_t data structure
6956 // including private vars accessed in task.
6957 // TODO: add kmp_task_t_with_privates (privates)
6958 Value *TaskSize =
6959         Builder.getInt64(M.getDataLayout().getTypeStoreSize(Task));
6960
6961 // Argument - `sizeof_shareds` (SharedsSize)
6962 // SharedsSize refers to the shareds array size in the kmp_task_t data
6963 // structure.
6964 Value *SharedsSize = Builder.getInt64(0);
6965 if (HasShareds) {
6966 auto *ArgStructAlloca = dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
6967 assert(ArgStructAlloca &&
6968 "Unable to find the alloca instruction corresponding to arguments "
6969 "for extracted function");
6970 auto *ArgStructType =
6971 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
6972 assert(ArgStructType && "Unable to find struct type corresponding to "
6973 "arguments for extracted function");
6974 SharedsSize =
6975           Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
6976     }
6977
6978 // Argument - `flags`
6979 // Task is tied iff (Flags & 1) == 1.
6980 // Task is untied iff (Flags & 1) == 0.
6981 // Task is final iff (Flags & 2) == 2.
6982 // Task is not final iff (Flags & 2) == 0.
6983 // A target task is not final and is untied.
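    // Both bits are therefore left clear in the flags word passed to
    // __kmpc_omp_task_alloc below.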
6984     Value *Flags = Builder.getInt32(0);
6985
6986 // Emit the @__kmpc_omp_task_alloc runtime call
6987 // The runtime call returns a pointer to an area where the task captured
6988 // variables must be copied before the task is run (TaskData)
6989 CallInst *TaskData = Builder.CreateCall(
6990 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
6991 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
6992 /*task_func=*/ProxyFn});
6993
6994 if (HasShareds) {
6995 Value *Shareds = StaleCI->getArgOperand(1);
6996 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
6997 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
6998 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
6999 SharedsSize);
7000 }
7001
7002 Value *DepArray = emitTaskDependencies(*this, Dependencies);
7003
7004 // ---------------------------------------------------------------
7005 // V5.2 13.8 target construct
7006 // If the nowait clause is present, execution of the target task
7007 // may be deferred. If the nowait clause is not present, the target task is
7008 // an included task.
7009 // ---------------------------------------------------------------
7010 // The above means that the lack of a nowait on the target construct
7011 // translates to '#pragma omp task if(0)'
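    // For example, '#pragma omp target depend(in: x)' without nowait waits on
    // the dependence array and then runs the proxy function inline between
    // the task_begin_if0/task_complete_if0 pair emitted below.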
7012 if (!HasNoWait) {
7013 if (DepArray) {
7014 Function *TaskWaitFn =
7015 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
7016         Builder.CreateCall(
7017             TaskWaitFn,
7018 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
7019 /*ndeps=*/Builder.getInt32(Dependencies.size()),
7020 /*dep_list=*/DepArray,
7021 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
7022 /*noalias_dep_list=*/
7023              ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7024     }
7025 // Included task.
7026 Function *TaskBeginFn =
7027 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
7028 Function *TaskCompleteFn =
7029 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
7030 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
7031 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
7032 CI->setDebugLoc(StaleCI->getDebugLoc());
7033 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
7034 } else if (DepArray) {
7035 // HasNoWait - meaning the task may be deferred. Call
7036 // __kmpc_omp_task_with_deps if there are dependencies,
7037 // else call __kmpc_omp_task
7038 Function *TaskFn =
7039 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
7040       Builder.CreateCall(
7041           TaskFn,
7042 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
7043 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
7044            ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
7045     } else {
7046 // Emit the @__kmpc_omp_task runtime call to spawn the task
7047 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
7048 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
7049 }
7050
7051 StaleCI->eraseFromParent();
7052 for (Instruction *I : llvm::reverse(ToBeDeleted))
7053 I->eraseFromParent();
7054 };
7055 addOutlineInfo(std::move(OI));
7056
7057 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
7058 << *(Builder.GetInsertBlock()) << "\n");
7059 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
7060                     << *(Builder.GetInsertBlock()->getModule())
7061                     << "\n");
7062 return Builder.saveIP();
7063}
7064
7065 void OpenMPIRBuilder::emitOffloadingArraysAndArgs(
7066     InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
7067 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous,
7068 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7069 function_ref<Value *(unsigned int)> CustomMapperCB) {
7070 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous,
7071 DeviceAddrCB, CustomMapperCB);
7072 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
7073}
7074
7075static void emitTargetCall(
7076 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7077 OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn,
7078 Constant *OutlinedFnID, ArrayRef<int32_t> NumTeams, int32_t NumThreads,
7079     SmallVectorImpl<Value *> &Args,
7080     OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
7081     SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies = {}) {
7082 // Generate a function call to the host fallback implementation of the target
7083 // region. This is called by the host when no offload entry was generated for
7084 // the target region and when the offloading call fails at runtime.
7085 auto &&EmitTargetCallFallbackCB =
7086       [&](OpenMPIRBuilder::InsertPointTy IP) -> OpenMPIRBuilder::InsertPointTy {
7087         Builder.restoreIP(IP);
7088 Builder.CreateCall(OutlinedFn, Args);
7089 return Builder.saveIP();
7090 };
7091
7092 bool HasNoWait = false;
7093 bool HasDependencies = Dependencies.size() > 0;
7094 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
7095
7096 // If we don't have an ID for the target region, it means an offload entry
7097 // wasn't created. In this case we just run the host fallback directly.
7098 if (!OutlinedFnID) {
7099 if (RequiresOuterTargetTask) {
7100 // Arguments that are intended to be directly forwarded to an
7101       // emitKernelLaunch call are passed as nullptr, since OutlinedFnID=nullptr
7102       // results in that call not being done.
7103       OpenMPIRBuilder::TargetKernelArgs KArgs;
7104       Builder.restoreIP(OMPBuilder.emitTargetTask(
7105 OutlinedFn, /*OutlinedFnID=*/nullptr, EmitTargetCallFallbackCB, KArgs,
7106 /*DeviceID=*/nullptr, /*RTLoc=*/nullptr, AllocaIP, Dependencies,
7107 HasNoWait));
7108 } else {
7109 Builder.restoreIP(EmitTargetCallFallbackCB(Builder.saveIP()));
7110 }
7111 return;
7112 }
7113
7114   OpenMPIRBuilder::TargetDataInfo Info(
7115       /*RequiresDevicePointerInfo=*/false,
7116 /*SeparateBeginEndCalls=*/true);
7117
7118 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
7119   OpenMPIRBuilder::TargetDataRTArgs RTArgs;
7120   OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info,
7121 RTArgs, MapInfo,
7122 /*IsNonContiguous=*/true,
7123 /*ForEndCall=*/false);
7124
7125 SmallVector<Value *, 3> NumTeamsC;
7126 for (auto V : NumTeams)
7127 NumTeamsC.push_back(llvm::ConstantInt::get(Builder.getInt32Ty(), V));
7128
7129 unsigned NumTargetItems = Info.NumberOfPtrs;
7130 // TODO: Use correct device ID
7131 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
7132 Value *NumThreadsVal = Builder.getInt32(NumThreads);
7133 uint32_t SrcLocStrSize;
7134 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
7135 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
7136 llvm::omp::IdentFlag(0), 0);
7137 // TODO: Use correct NumIterations
7138 Value *NumIterations = Builder.getInt64(0);
7139 // TODO: Use correct DynCGGroupMem
7140 Value *DynCGGroupMem = Builder.getInt32(0);
7141
7142 OpenMPIRBuilder::TargetKernelArgs KArgs(NumTargetItems, RTArgs, NumIterations,
7143 NumTeamsC, NumThreadsVal,
7144 DynCGGroupMem, HasNoWait);
7145
7146 // The presence of certain clauses on the target directive require the
7147 // explicit generation of the target task.
7148 if (RequiresOuterTargetTask) {
7149 Builder.restoreIP(OMPBuilder.emitTargetTask(
7150 OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs, DeviceID,
7151 RTLoc, AllocaIP, Dependencies, HasNoWait));
7152 } else {
7153 Builder.restoreIP(OMPBuilder.emitKernelLaunch(
7154 Builder, OutlinedFn, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
7155 DeviceID, RTLoc, AllocaIP));
7156 }
7157}
7158
7159 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTarget(
7160     const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
7161 InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo,
7162 ArrayRef<int32_t> NumTeams, int32_t NumThreads,
7163     SmallVectorImpl<Value *> &Args, GenMapInfoCallbackTy GenMapInfoCB,
7164     OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
7165     OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
7166     SmallVector<DependData> Dependencies) {
7167
7168 if (!updateToLocation(Loc))
7169 return InsertPointTy();
7170
7171 Builder.restoreIP(CodeGenIP);
7172
7173 Function *OutlinedFn;
7174 Constant *OutlinedFnID = nullptr;
7175 // The target region is outlined into its own function. The LLVM IR for
7176 // the target region itself is generated using the callbacks CBFunc
7177 // and ArgAccessorFuncCB
7178 emitTargetOutlinedFunction(*this, Builder, IsOffloadEntry, EntryInfo,
7179 OutlinedFn, OutlinedFnID, Args, CBFunc,
7180 ArgAccessorFuncCB);
7181
7182 // If we are not on the target device, then we need to generate code
7183 // to make a remote call (offload) to the previously outlined function
7184 // that represents the target region. Do that now.
7185 if (!Config.isTargetDevice())
7186 emitTargetCall(*this, Builder, AllocaIP, OutlinedFn, OutlinedFnID, NumTeams,
7187                    NumThreads, Args, GenMapInfoCB, Dependencies);
7188 return Builder.saveIP();
7189}
7190
7191std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
7192 StringRef FirstSeparator,
7193 StringRef Separator) {
7194 SmallString<128> Buffer;
7195   llvm::raw_svector_ostream OS(Buffer);
7196   StringRef Sep = FirstSeparator;
7197 for (StringRef Part : Parts) {
7198 OS << Sep << Part;
7199 Sep = Separator;
7200 }
7201 return OS.str().str();
7202}
7203
7204 std::string
7205 OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
7206   return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
7207 Config.separator());
7208}
7209
7210 GlobalVariable *
7211 OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
7212     unsigned AddressSpace) {
7213 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
7214 if (Elem.second) {
7215 assert(Elem.second->getValueType() == Ty &&
7216 "OMP internal variable has different type than requested");
7217 } else {
7218 // TODO: investigate the appropriate linkage type used for the global
7219 // variable for possibly changing that to internal or private, or maybe
7220 // create different versions of the function for different OMP internal
7221 // variables.
7222 auto Linkage = this->M.getTargetTriple().rfind("wasm32") == 0
7223                        ? GlobalValue::ExternalLinkage
7224                        : GlobalValue::CommonLinkage;
7225     auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
7226 Constant::getNullValue(Ty), Elem.first(),
7227 /*InsertBefore=*/nullptr,
7228                                   GlobalValue::NotThreadLocal, AddressSpace);
7229     const DataLayout &DL = M.getDataLayout();
7230 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
7231 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
7232 GV->setAlignment(std::max(TypeAlign, PtrAlign));
7233 Elem.second = GV;
7234 }
7235
7236 return Elem.second;
7237}
7238
7239Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
7240 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
7241 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
7242 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
7243}
7244
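// Computes a 'sizeof' value without a DataLayout query by folding
// 'getelementptr T, ptr null, i32 1' into a constant and casting the
// resulting pointer to an i64 (the classic null-pointer GEP idiom).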
7245 Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
7246   LLVMContext &Ctx = M.getContext();
7247   Value *Null =
7248 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
7249 Value *SizeGep =
7250 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
7251 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
7252 return SizePtrToInt;
7253}
7254
7255 GlobalVariable *
7256 OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
7257     std::string VarName) {
7258 llvm::Constant *MaptypesArrayInit =
7259       llvm::ConstantDataArray::get(M.getContext(), Mappings);
7260   auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
7261 M, MaptypesArrayInit->getType(),
7262 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
7263 VarName);
7264 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
7265 return MaptypesArrayGlobal;
7266}
7267
7268 void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
7269     InsertPointTy AllocaIP,
7270 unsigned NumOperands,
7271 struct MapperAllocas &MapperAllocas) {
7272 if (!updateToLocation(Loc))
7273 return;
7274
7275 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7276 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7277 Builder.restoreIP(AllocaIP);
7278 AllocaInst *ArgsBase = Builder.CreateAlloca(
7279 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
7280 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
7281 ".offload_ptrs");
7282 AllocaInst *ArgSizes = Builder.CreateAlloca(
7283 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
7284 Builder.restoreIP(Loc.IP);
7285 MapperAllocas.ArgsBase = ArgsBase;
7286 MapperAllocas.Args = Args;
7287 MapperAllocas.ArgSizes = ArgSizes;
7288}
7289
7290 void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
7291     Function *MapperFunc, Value *SrcLocInfo,
7292 Value *MaptypesArg, Value *MapnamesArg,
7293     struct MapperAllocas &MapperAllocas,
7294     int64_t DeviceID, unsigned NumOperands) {
7295 if (!updateToLocation(Loc))
7296 return;
7297
7298 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
7299 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
7300   Value *ArgsBaseGEP =
7301       Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
7302                                 {Builder.getInt32(0), Builder.getInt32(0)});
7303   Value *ArgsGEP =
7304       Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
7305                                 {Builder.getInt32(0), Builder.getInt32(0)});
7306   Value *ArgSizesGEP =
7307       Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
7308                                 {Builder.getInt32(0), Builder.getInt32(0)});
7309 Value *NullPtr =
7310 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
7311 Builder.CreateCall(MapperFunc,
7312 {SrcLocInfo, Builder.getInt64(DeviceID),
7313 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
7314 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
7315}
7316
7317 void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
7318     TargetDataRTArgs &RTArgs,
7319 TargetDataInfo &Info,
7320 bool ForEndCall) {
7321 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
7322 "expected region end call to runtime only when end call is separate");
7323 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
7324 auto VoidPtrTy = UnqualPtrTy;
7325 auto VoidPtrPtrTy = UnqualPtrTy;
7326 auto Int64Ty = Type::getInt64Ty(M.getContext());
7327 auto Int64PtrTy = UnqualPtrTy;
7328
7329 if (!Info.NumberOfPtrs) {
7330 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7331 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7332 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
7333 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
7334 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7335 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7336 return;
7337 }
7338
7339   RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
7340       ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
7341       Info.RTArgs.BasePointersArray,
7342       /*Idx0=*/0, /*Idx1=*/0);
7343   RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
7344       ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
7345       /*Idx0=*/0,
7346       /*Idx1=*/0);
7347   RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
7348       ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7349       /*Idx0=*/0, /*Idx1=*/0);
7350   RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
7351       ArrayType::get(Int64Ty, Info.NumberOfPtrs),
7352       ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
7353                                                  : Info.RTArgs.MapTypesArray,
7354       /*Idx0=*/0,
7355       /*Idx1=*/0);
7356
7357 // Only emit the mapper information arrays if debug information is
7358 // requested.
7359 if (!Info.EmitDebug)
7360 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
7361 else
7362     RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
7363         ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
7364 /*Idx0=*/0,
7365 /*Idx1=*/0);
7366 // If there is no user-defined mapper, set the mapper array to nullptr to
7367 // avoid an unnecessary data privatization
7368 if (!Info.HasMapper)
7369 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
7370 else
7371 RTArgs.MappersArray =
7372 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
7373}
7374
7375 void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
7376     InsertPointTy CodeGenIP,
7377 MapInfosTy &CombinedInfo,
7378 TargetDataInfo &Info) {
7379   MapInfosTy::StructNonContiguousInfo &NonContigInfo =
7380       CombinedInfo.NonContigInfo;
7381
7382 // Build an array of struct descriptor_dim and then assign it to
7383 // offload_args.
7384 //
7385 // struct descriptor_dim {
7386 // uint64_t offset;
7387 // uint64_t count;
7388 // uint64_t stride
7389 // };
7390 Type *Int64Ty = Builder.getInt64Ty();
7391   StructType *DimTy = StructType::create(
7392       M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
7393 "struct.descriptor_dim");
7394
7395 enum { OffsetFD = 0, CountFD, StrideFD };
7396   // We need two index variables here since the size of "Dims" is the same as
7397   // the size of Components; however, the size of offset, count, and stride is
7398   // equal to the size of the base declaration that is non-contiguous.
7399 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
7400 // Skip emitting ir if dimension size is 1 since it cannot be
7401 // non-contiguous.
7402 if (NonContigInfo.Dims[I] == 1)
7403 continue;
7404 Builder.restoreIP(AllocaIP);
7405 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
7406 AllocaInst *DimsAddr =
7407 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
7408 Builder.restoreIP(CodeGenIP);
7409 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
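      // Note RevIdx: the recorded per-dimension offsets/counts/strides are
      // written into 'dims' in reverse order.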
7410 unsigned RevIdx = EE - II - 1;
7411 Value *DimsLVal = Builder.CreateInBoundsGEP(
7412 DimsAddr->getAllocatedType(), DimsAddr,
7413 {Builder.getInt64(0), Builder.getInt64(II)});
7414 // Offset
7415 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
7416       Builder.CreateAlignedStore(
7417           NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
7418 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
7419 // Count
7420 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
7421       Builder.CreateAlignedStore(
7422           NonContigInfo.Counts[L][RevIdx], CountLVal,
7423 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7424 // Stride
7425 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
7426       Builder.CreateAlignedStore(
7427           NonContigInfo.Strides[L][RevIdx], StrideLVal,
7428 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
7429 }
7430 // args[I] = &dims
7431 Builder.restoreIP(CodeGenIP);
7432     Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
7433         DimsAddr, Builder.getPtrTy());
7434     Value *P = Builder.CreateConstInBoundsGEP2_32(
7435         ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
7436         Info.RTArgs.PointersArray, 0, I);
7437     Builder.CreateAlignedStore(
7438         DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
7439     ++L;
7440 }
7441}
7442
7443 void OpenMPIRBuilder::emitOffloadingArrays(
7444     InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
7445 TargetDataInfo &Info, bool IsNonContiguous,
7446 function_ref<void(unsigned int, Value *)> DeviceAddrCB,
7447 function_ref<Value *(unsigned int)> CustomMapperCB) {
7448
7449 // Reset the array information.
7450 Info.clearArrayInfo();
7451 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
7452
7453 if (Info.NumberOfPtrs == 0)
7454 return;
7455
7456 Builder.restoreIP(AllocaIP);
7457 // Detect if we have any capture size requiring runtime evaluation of the
7458 // size so that a constant array could be eventually used.
7459 ArrayType *PointerArrayType =
7460 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
7461
7462 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
7463 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
7464
7465 Info.RTArgs.PointersArray = Builder.CreateAlloca(
7466 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
7467 AllocaInst *MappersArray = Builder.CreateAlloca(
7468 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
7469 Info.RTArgs.MappersArray = MappersArray;
7470
7471 // If we don't have any VLA types or other types that require runtime
7472 // evaluation, we can use a constant array for the map sizes, otherwise we
7473 // need to fill up the arrays as we do for the pointers.
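// For example, a fixed-size 'int a[4]' map contributes a compile-time
// constant 16 here, while a VLA section such as 'a[0:n]' leaves a runtime
// size expression and forces the filled-array path below.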
7474 Type *Int64Ty = Builder.getInt64Ty();
7475 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
7476 ConstantInt::get(Int64Ty, 0));
7477 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
7478 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
7479 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
7480 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
7481 if (IsNonContiguous &&
7482 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7483 CombinedInfo.Types[I] &
7484 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
7485 ConstSizes[I] =
7486 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
7487 else
7488 ConstSizes[I] = CI;
7489 continue;
7490 }
7491 }
7492 RuntimeSizes.set(I);
7493 }
7494
7495 if (RuntimeSizes.all()) {
7496 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7497 Info.RTArgs.SizesArray = Builder.CreateAlloca(
7498 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7499 Builder.restoreIP(CodeGenIP);
7500 } else {
7501 auto *SizesArrayInit = ConstantArray::get(
7502 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
7503 std::string Name = createPlatformSpecificName({"offload_sizes"});
7504 auto *SizesArrayGbl =
7505 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
7506 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
7507 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
7508
7509 if (!RuntimeSizes.any()) {
7510 Info.RTArgs.SizesArray = SizesArrayGbl;
7511 } else {
7512 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7513 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
7514 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
7515       AllocaInst *Buffer = Builder.CreateAlloca(
7516           SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
7517 Buffer->setAlignment(OffloadSizeAlign);
7518 Builder.restoreIP(CodeGenIP);
7519       Builder.CreateMemCpy(
7520           Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
7521           SizesArrayGbl, OffloadSizeAlign,
7522           Builder.getIntN(
7523               IndexSize,
7524 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
7525
7526 Info.RTArgs.SizesArray = Buffer;
7527 }
7528 Builder.restoreIP(CodeGenIP);
7529 }
7530
7531 // The map types are always constant so we don't need to generate code to
7532 // fill arrays. Instead, we create an array constant.
7533   SmallVector<uint64_t, 4> Mapping;
7534   for (auto mapFlag : CombinedInfo.Types)
7535 Mapping.push_back(
7536 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7537 mapFlag));
7538 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
7539 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7540 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
7541
7542 // The information types are only built if provided.
7543 if (!CombinedInfo.Names.empty()) {
7544 std::string MapnamesName = createPlatformSpecificName({"offload_mapnames"});
7545 auto *MapNamesArrayGbl =
7546 createOffloadMapnames(CombinedInfo.Names, MapnamesName);
7547 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
7548 Info.EmitDebug = true;
7549 } else {
7550 Info.RTArgs.MapNamesArray =
7551         Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
7552     Info.EmitDebug = false;
7553 }
7554
7555 // If there's a present map type modifier, it must not be applied to the end
7556 // of a region, so generate a separate map type array in that case.
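// E.g., for 'map(present, to: x)' the entry call keeps OMP_MAP_PRESENT so
// the runtime can diagnose a missing mapping, but the bit is cleared in the
// array used by the matching end call.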
7557 if (Info.separateBeginEndCalls()) {
7558 bool EndMapTypesDiffer = false;
7559 for (uint64_t &Type : Mapping) {
7560 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7561 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
7562 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
7563 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
7564 EndMapTypesDiffer = true;
7565 }
7566 }
7567 if (EndMapTypesDiffer) {
7568 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
7569 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
7570 }
7571 }
7572
7573 PointerType *PtrTy = Builder.getPtrTy();
7574 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
7575 Value *BPVal = CombinedInfo.BasePointers[I];
7576     Value *BP = Builder.CreateConstInBoundsGEP2_32(
7577         ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
7578 0, I);
7579 Builder.CreateAlignedStore(BPVal, BP,
7580                                M.getDataLayout().getPrefTypeAlign(PtrTy));
7581
7582 if (Info.requiresDevicePointerInfo()) {
7583 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
7584 CodeGenIP = Builder.saveIP();
7585 Builder.restoreIP(AllocaIP);
7586 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
7587 Builder.restoreIP(CodeGenIP);
7588 if (DeviceAddrCB)
7589 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
7590 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
7591 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
7592 if (DeviceAddrCB)
7593 DeviceAddrCB(I, BP);
7594 }
7595 }
7596
7597 Value *PVal = CombinedInfo.Pointers[I];
7598 Value *P = Builder.CreateConstInBoundsGEP2_32(
7599 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
7600 I);
7601 // TODO: Check that the alignment is correct.
7602 Builder.CreateAlignedStore(PVal, P,
7603 M.getDataLayout().getPrefTypeAlign(PtrTy));
7604
7605 if (RuntimeSizes.test(I)) {
7606 Value *S = Builder.CreateConstInBoundsGEP2_32(
7607 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
7608 /*Idx0=*/0,
7609 /*Idx1=*/I);
7610 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
7611 Int64Ty,
7612 /*isSigned=*/true),
7613 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
7614 }
7615 // Fill up the mapper array.
7616 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
7617 Value *MFunc = ConstantPointerNull::get(PtrTy);
7618 if (CustomMapperCB)
7619 if (Value *CustomMFunc = CustomMapperCB(I))
7620 MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy);
7621 Value *MAddr = Builder.CreateInBoundsGEP(
7622 MappersArray->getAllocatedType(), MappersArray,
7623 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
7624 Builder.CreateAlignedStore(
7625 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
7626 }
7627
7628 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
7629 Info.NumberOfPtrs == 0)
7630 return;
7631 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
7632}
7633
7634void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
7635 BasicBlock *CurBB = Builder.GetInsertBlock();
7636
7637 if (!CurBB || CurBB->getTerminator()) {
7638 // If there is no insert point or the previous block is already
7639 // terminated, don't touch it.
7640 } else {
7641 // Otherwise, create a fall-through branch.
7642 Builder.CreateBr(Target);
7643 }
7644
7645 Builder.ClearInsertionPoint();
7646}
7647
7648void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
7649 bool IsFinished) {
7650 BasicBlock *CurBB = Builder.GetInsertBlock();
7651
7652 // Fall out of the current block (if necessary).
7653 emitBranch(BB);
7654
7655 if (IsFinished && BB->use_empty()) {
7656 BB->eraseFromParent();
7657 return;
7658 }
7659
7660 // Place the block after the current block, if possible, or else at
7661 // the end of the function.
7662 if (CurBB && CurBB->getParent())
7663 CurFn->insert(std::next(CurBB->getIterator()), BB);
7664 else
7665 CurFn->insert(CurFn->end(), BB);
7666 Builder.SetInsertPoint(BB);
7667}
7668
7669void OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
7670 BodyGenCallbackTy ElseGen,
7671 InsertPointTy AllocaIP) {
7672 // If the condition constant folds and can be elided, try to avoid emitting
7673 // the condition and the dead arm of the if/else.
7674 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
7675 auto CondConstant = CI->getSExtValue();
7676 if (CondConstant)
7677 ThenGen(AllocaIP, Builder.saveIP());
7678 else
7679 ElseGen(AllocaIP, Builder.saveIP());
7680 return;
7681 }
7682
7683 Function *CurFn = Builder.GetInsertBlock()->getParent();
7684
7685 // Otherwise, the condition did not fold, or we couldn't elide it. Just
7686 // emit the conditional branch.
7687 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
7688 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
7689 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
7690 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
7691 // Emit the 'then' code.
7692 emitBlock(ThenBlock, CurFn);
7693 ThenGen(AllocaIP, Builder.saveIP());
7694 emitBranch(ContBlock);
7695 // Emit the 'else' code if present.
7696 // There is no need to emit line number for unconditional branch.
7697 emitBlock(ElseBlock, CurFn);
7698 ElseGen(AllocaIP, Builder.saveIP());
7699 // There is no need to emit line number for unconditional branch.
7700 emitBranch(ContBlock);
7701 // Emit the continuation block for code after the if.
7702 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
7703}
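// Rough sketch of the CFG this produces for a non-constant condition (block
// names as created above; instructions illustrative):
// ```
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// omp_if.then:   ; filled in by ThenGen
//   br label %omp_if.end
// omp_if.else:   ; filled in by ElseGen
//   br label %omp_if.end
// omp_if.end:    ; code following the if
// ```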
7704
7705bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
7706 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
7709 "Unexpected Atomic Ordering.");
7710
7711 bool Flush = false;
7712 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
7713
7714 switch (AK) {
7715 case Read:
7716 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
7717 AO == AtomicOrdering::SequentiallyConsistent) {
7718 FlushAO = AtomicOrdering::Acquire;
7719 Flush = true;
7720 }
7721 break;
7722 case Write:
7723 case Compare:
7724 case Update:
7725 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
7726 AO == AtomicOrdering::SequentiallyConsistent) {
7727 FlushAO = AtomicOrdering::Release;
7728 Flush = true;
7729 }
7730 break;
7731 case Capture:
7732 switch (AO) {
7733 case AtomicOrdering::Acquire:
7734 FlushAO = AtomicOrdering::Acquire;
7735 Flush = true;
7736 break;
7737 case AtomicOrdering::Release:
7738 FlushAO = AtomicOrdering::Release;
7739 Flush = true;
7740 break;
7741 case AtomicOrdering::AcquireRelease:
7742 case AtomicOrdering::SequentiallyConsistent:
7743 FlushAO = AtomicOrdering::AcquireRelease;
7744 Flush = true;
7745 break;
7746 default:
7747 // do nothing - leave silently.
7748 break;
7749 }
7750 }
7751
7752 if (Flush) {
7753 // Currently the flush runtime call does not take a memory ordering
7754 // argument. Until it does, we still resolve here which atomic ordering
7755 // the flush should use, but issue an ordering-agnostic flush call.
7756 // TODO: pass `FlushAO` after memory ordering support is added
7757 (void)FlushAO;
7758 emitFlush(Loc);
7759 }
7760
7761 // For AO == AtomicOrdering::Monotonic and all other combinations of
7762 // ordering and atomic kind, do nothing.
7763 return Flush;
7764}
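// For instance, `#pragma omp atomic read seq_cst` reaches this function with
// AK == Read and AO == SequentiallyConsistent, so a flush with (eventual)
// acquire semantics is emitted after the load, whereas a relaxed atomic
// (AO == Monotonic) emits no flush at all.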
7765
7766OpenMPIRBuilder::InsertPointTy
7767OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
7768 AtomicOpValue &X, AtomicOpValue &V,
7769 AtomicOrdering AO) {
7770 if (!updateToLocation(Loc))
7771 return Loc.IP;
7772
7773 assert(X.Var->getType()->isPointerTy() &&
7774 "OMP Atomic expects a pointer to target memory");
7775 Type *XElemTy = X.ElemTy;
7776 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7777 XElemTy->isPointerTy()) &&
7778 "OMP atomic read expected a scalar type");
7779
7780 Value *XRead = nullptr;
7781
7782 if (XElemTy->isIntegerTy()) {
7783 LoadInst *XLD =
7784 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
7785 XLD->setAtomic(AO);
7786 XRead = cast<Value>(XLD);
7787 } else {
7788 // We need to perform atomic op as integer
7789 IntegerType *IntCastTy =
7790 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7791 LoadInst *XLoad =
7792 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
7793 XLoad->setAtomic(AO);
7794 if (XElemTy->isFloatingPointTy()) {
7795 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
7796 } else {
7797 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
7798 }
7799 }
7800 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
7801 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
7802 return Builder.saveIP();
7803}
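// Sketch of the IR emitted for a float atomic read `v = x` (types and value
// names illustrative only): the load is performed on an integer of equal
// width and the result bitcast back to the element type.
// ```
// %omp.atomic.load = load atomic i32, ptr %x monotonic, align 4
// %atomic.flt.cast = bitcast i32 %omp.atomic.load to float
// store float %atomic.flt.cast, ptr %v
// ```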
7804
7805OpenMPIRBuilder::InsertPointTy
7806OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
7807 AtomicOpValue &X, Value *Expr,
7808 AtomicOrdering AO) {
7809 if (!updateToLocation(Loc))
7810 return Loc.IP;
7811
7812 assert(X.Var->getType()->isPointerTy() &&
7813 "OMP Atomic expects a pointer to target memory");
7814 Type *XElemTy = X.ElemTy;
7815 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7816 XElemTy->isPointerTy()) &&
7817 "OMP atomic write expected a scalar type");
7818
7819 if (XElemTy->isIntegerTy()) {
7820 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
7821 XSt->setAtomic(AO);
7822 } else {
7823 // We need to bitcast and perform atomic op as integers
7824 IntegerType *IntCastTy =
7825 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7826 Value *ExprCast =
7827 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
7828 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
7829 XSt->setAtomic(AO);
7830 }
7831
7832 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
7833 return Builder.saveIP();
7834}
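// Mirror image of the read case: a non-integer atomic write is bitcast to an
// integer of equal width first (illustrative sketch):
// ```
// %atomic.src.int.cast = bitcast float %expr to i32
// store atomic i32 %atomic.src.int.cast, ptr %x release, align 4
// ```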
7835
7836OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicUpdate(
7837 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7838 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7839 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr) {
7840 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
7841 if (!updateToLocation(Loc))
7842 return Loc.IP;
7843
7844 LLVM_DEBUG({
7845 Type *XTy = X.Var->getType();
7846 assert(XTy->isPointerTy() &&
7847 "OMP Atomic expects a pointer to target memory");
7848 Type *XElemTy = X.ElemTy;
7849 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
7850 XElemTy->isPointerTy()) &&
7851 "OMP atomic update expected a scalar type");
7852 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
7853 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
7854 "OpenMP atomic does not support LT or GT operations");
7855 });
7856
7857 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp,
7858 X.IsVolatile, IsXBinopExpr);
7859 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
7860 return Builder.saveIP();
7861}
7862
7863// FIXME: Duplicating AtomicExpand
7864Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
7865 AtomicRMWInst::BinOp RMWOp) {
7866 switch (RMWOp) {
7867 case AtomicRMWInst::Add:
7868 return Builder.CreateAdd(Src1, Src2);
7869 case AtomicRMWInst::Sub:
7870 return Builder.CreateSub(Src1, Src2);
7871 case AtomicRMWInst::And:
7872 return Builder.CreateAnd(Src1, Src2);
7873 case AtomicRMWInst::Nand:
7874 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
7875 case AtomicRMWInst::Or:
7876 return Builder.CreateOr(Src1, Src2);
7877 case AtomicRMWInst::Xor:
7878 return Builder.CreateXor(Src1, Src2);
7879 case AtomicRMWInst::Xchg:
7880 case AtomicRMWInst::FAdd:
7881 case AtomicRMWInst::FSub:
7882 case AtomicRMWInst::BAD_BINOP:
7883 case AtomicRMWInst::Max:
7884 case AtomicRMWInst::Min:
7885 case AtomicRMWInst::UMax:
7886 case AtomicRMWInst::UMin:
7887 case AtomicRMWInst::FMax:
7888 case AtomicRMWInst::FMin:
7889 case AtomicRMWInst::UIncWrap:
7890 case AtomicRMWInst::UDecWrap:
7891 llvm_unreachable("Unsupported atomic update operation");
7892 }
7893 llvm_unreachable("Unsupported atomic update operation");
7894}
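// Usage note: emitAtomicUpdate below uses this helper to recompute, with
// ordinary instructions, the value that an `atomicrmw` has already stored,
// e.g. for `atomicrmw add ptr %x, i32 %e` the captured new value is simply
// `add i32 %old, %e`. Only the bitwise and additive operations have such a
// direct instruction form; min/max and the floating-point variants do not.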
7895
7896std::pair<Value *, Value *> OpenMPIRBuilder::emitAtomicUpdate(
7897 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
7898 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
7899 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr) {
7900 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
7901 // or a complex datatype.
7902 bool emitRMWOp = false;
7903 switch (RMWOp) {
7904 case AtomicRMWInst::Add:
7905 case AtomicRMWInst::And:
7906 case AtomicRMWInst::Nand:
7907 case AtomicRMWInst::Or:
7908 case AtomicRMWInst::Xor:
7909 case AtomicRMWInst::Xchg:
7910 emitRMWOp = XElemTy;
7911 break;
7912 case AtomicRMWInst::Sub:
7913 emitRMWOp = (IsXBinopExpr && XElemTy);
7914 break;
7915 default:
7916 emitRMWOp = false;
7917 }
7918 emitRMWOp &= XElemTy->isIntegerTy();
7919
7920 std::pair<Value *, Value *> Res;
7921 if (emitRMWOp) {
7922 Res.first = Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
7923 // Res.second is not needed except in case of postfix captures. Generate it
7924 // anyway for consistency with the else part; any DCE pass will remove it.
7925 // AtomicRMWInst::Xchg does not have a corresponding instruction.
7926 if (RMWOp == AtomicRMWInst::Xchg)
7927 Res.second = Res.first;
7928 else
7929 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
7930 } else {
7931 IntegerType *IntCastTy =
7932 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
7933 LoadInst *OldVal =
7934 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
7935 OldVal->setAtomic(AO);
7936 // CurBB
7937 // | /---\
7938 // ContBB |
7939 // | \---/
7940 // ExitBB
7941 BasicBlock *CurBB = Builder.GetInsertBlock();
7942 Instruction *CurBBTI = CurBB->getTerminator();
7943 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
7944 BasicBlock *ExitBB =
7945 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
7946 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
7947 X->getName() + ".atomic.cont");
7948 ContBB->getTerminator()->eraseFromParent();
7949 Builder.restoreIP(AllocaIP);
7950 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
7951 NewAtomicAddr->setName(X->getName() + "x.new.val");
7952 Builder.SetInsertPoint(ContBB);
7953 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
7954 PHI->addIncoming(OldVal, CurBB);
7955 bool IsIntTy = XElemTy->isIntegerTy();
7956 Value *OldExprVal = PHI;
7957 if (!IsIntTy) {
7958 if (XElemTy->isFloatingPointTy()) {
7959 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
7960 X->getName() + ".atomic.fltCast");
7961 } else {
7962 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
7963 X->getName() + ".atomic.ptrCast");
7964 }
7965 }
7966
7967 Value *Upd = UpdateOp(OldExprVal, Builder);
7968 Builder.CreateStore(Upd, NewAtomicAddr);
7969 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
7970 AtomicOrdering Failure =
7971 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
7972 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
7973 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
7974 Result->setVolatile(VolatileX);
7975 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
7976 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
7977 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
7978 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
7979
7980 Res.first = OldExprVal;
7981 Res.second = Upd;
7982
7983 // Set the insertion point in the exit block.
7984 if (UnreachableInst *ExitTI =
7985 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
7986 CurBBTI->eraseFromParent();
7987 Builder.SetInsertPoint(ExitBB);
7988 } else {
7989 Builder.SetInsertPoint(ExitTI);
7990 }
7991 }
7992
7993 return Res;
7994}
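// Rough shape of the compare-exchange loop generated by the non-atomicrmw
// path, e.g. for a float `x = x + expr` update (names illustrative):
// ```
// %x.atomic.load = load atomic i32, ptr %x monotonic, align 4
// br label %x.atomic.cont
// x.atomic.cont:
//   %old = phi i32 [ %x.atomic.load, %entry ], [ %prev, %x.atomic.cont ]
//   ; UpdateOp computes the new value; it is stored to and reloaded from
//   ; a temporary alloca to obtain it as an integer
//   %pair = cmpxchg ptr %x, i32 %old, i32 %desired monotonic monotonic
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %x.atomic.exit, label %x.atomic.cont
// x.atomic.exit:
// ```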
7995
7996OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCapture(
7997 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
7998 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
7999 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
8000 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr) {
8001 if (!updateToLocation(Loc))
8002 return Loc.IP;
8003
8004 LLVM_DEBUG({
8005 Type *XTy = X.Var->getType();
8006 assert(XTy->isPointerTy() &&
8007 "OMP Atomic expects a pointer to target memory");
8008 Type *XElemTy = X.ElemTy;
8009 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
8010 XElemTy->isPointerTy()) &&
8011 "OMP atomic capture expected a scalar type");
8012 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
8013 "OpenMP atomic does not support LT or GT operations");
8014 });
8015
8016 // If UpdateExpr is false, i.e. 'x' is assigned some 'expr' that is not
8017 // based on 'x', then 'x' is simply atomically rewritten with 'expr'.
8018 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
8019 std::pair<Value *, Value *> Result =
8020 emitAtomicUpdate(AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp,
8021 X.IsVolatile, IsXBinopExpr);
8022
8023 Value *CapturedVal = (IsPostfixUpdate ? Result.first : Result.second);
8024 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
8025
8026 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
8027 return Builder.saveIP();
8028}
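// Worked example of the capture semantics: for `v = x++;` (postfix update)
// Result.first, the value of 'x' before the update, is stored to 'v'; for
// `v = ++x;` Result.second, the updated value, is stored instead.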
8029
8030OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8031 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8032 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8033 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8034 bool IsFailOnly) {
8035
8036 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
8037 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
8038 IsPostfixUpdate, IsFailOnly, Failure);
8039}
8040
8041OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
8042 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
8043 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
8044 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
8045 bool IsFailOnly, AtomicOrdering Failure) {
8046
8047 if (!updateToLocation(Loc))
8048 return Loc.IP;
8049
8050 assert(X.Var->getType()->isPointerTy() &&
8051 "OMP atomic expects a pointer to target memory");
8052 // compare capture
8053 if (V.Var) {
8054 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
8055 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
8056 }
8057
8058 bool IsInteger = E->getType()->isIntegerTy();
8059
8060 if (Op == OMPAtomicCompareOp::EQ) {
8061 AtomicCmpXchgInst *Result = nullptr;
8062 if (!IsInteger) {
8063 IntegerType *IntCastTy =
8064 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
8065 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
8066 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
8067 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
8068 AO, Failure);
8069 } else {
8070 Result =
8071 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
8072 }
8073
8074 if (V.Var) {
8075 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
8076 if (!IsInteger)
8077 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
8078 assert(OldValue->getType() == V.ElemTy &&
8079 "OldValue and V must be of same type");
8080 if (IsPostfixUpdate) {
8081 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
8082 } else {
8083 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8084 if (IsFailOnly) {
8085 // CurBB----
8086 // | |
8087 // v |
8088 // ContBB |
8089 // | |
8090 // v |
8091 // ExitBB <-
8092 //
8093 // where ContBB only contains the store of old value to 'v'.
8094 BasicBlock *CurBB = Builder.GetInsertBlock();
8095 Instruction *CurBBTI = CurBB->getTerminator();
8096 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
8097 BasicBlock *ExitBB = CurBB->splitBasicBlock(
8098 CurBBTI, X.Var->getName() + ".atomic.exit");
8099 BasicBlock *ContBB = CurBB->splitBasicBlock(
8100 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
8101 ContBB->getTerminator()->eraseFromParent();
8102 CurBB->getTerminator()->eraseFromParent();
8103
8104 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
8105
8106 Builder.SetInsertPoint(ContBB);
8107 Builder.CreateStore(OldValue, V.Var);
8108 Builder.CreateBr(ExitBB);
8109
8110 if (UnreachableInst *ExitTI =
8111 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
8112 CurBBTI->eraseFromParent();
8113 Builder.SetInsertPoint(ExitBB);
8114 } else {
8115 Builder.SetInsertPoint(ExitTI);
8116 }
8117 } else {
8118 Value *CapturedValue =
8119 Builder.CreateSelect(SuccessOrFail, E, OldValue);
8120 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8121 }
8122 }
8123 }
8124 // The comparison result has to be stored.
8125 if (R.Var) {
8126 assert(R.Var->getType()->isPointerTy() &&
8127 "r.var must be of pointer type");
8128 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
8129
8130 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
8131 Value *ResultCast = R.IsSigned
8132 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
8133 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
8134 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
8135 }
8136 } else {
8137 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
8138 "Op should be either max or min at this point");
8139 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
8140
8141 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
8142 // Let's take max as example.
8143 // OpenMP form:
8144 // x = x > expr ? expr : x;
8145 // LLVM form:
8146 // *ptr = *ptr > val ? *ptr : val;
8147 // We need to transform to LLVM form.
8148 // x = x <= expr ? x : expr;
8149 AtomicRMWInst::BinOp NewOp;
8150 if (IsXBinopExpr) {
8151 if (IsInteger) {
8152 if (X.IsSigned)
8153 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
8154 : AtomicRMWInst::Max;
8155 else
8156 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
8157 : AtomicRMWInst::UMax;
8158 } else {
8159 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
8160 : AtomicRMWInst::FMax;
8161 }
8162 } else {
8163 if (IsInteger) {
8164 if (X.IsSigned)
8165 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
8166 : AtomicRMWInst::Min;
8167 else
8168 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
8169 : AtomicRMWInst::UMin;
8170 } else {
8171 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
8172 : AtomicRMWInst::FMin;
8173 }
8174 }
8175
8176 AtomicRMWInst *OldValue =
8177 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
8178 if (V.Var) {
8179 Value *CapturedValue = nullptr;
8180 if (IsPostfixUpdate) {
8181 CapturedValue = OldValue;
8182 } else {
8183 CmpInst::Predicate Pred;
8184 switch (NewOp) {
8185 case AtomicRMWInst::Max:
8186 Pred = CmpInst::ICMP_SGT;
8187 break;
8188 case AtomicRMWInst::UMax:
8189 Pred = CmpInst::ICMP_UGT;
8190 break;
8191 case AtomicRMWInst::FMax:
8192 Pred = CmpInst::FCMP_OGT;
8193 break;
8194 case AtomicRMWInst::Min:
8195 Pred = CmpInst::ICMP_SLT;
8196 break;
8197 case AtomicRMWInst::UMin:
8198 Pred = CmpInst::ICMP_ULT;
8199 break;
8200 case AtomicRMWInst::FMin:
8201 Pred = CmpInst::FCMP_OLT;
8202 break;
8203 default:
8204 llvm_unreachable("unexpected comparison op");
8205 }
8206 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
8207 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
8208 }
8209 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
8210 }
8211 }
8212
8213 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
8214
8215 return Builder.saveIP();
8216}
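// Worked example of the reversal above: `x = x > e ? e : x` is the OpenMP
// max form but actually clamps 'x' downwards, so for a signed integer with
// IsXBinopExpr set it lowers to (sketch):
// ```
// %old = atomicrmw min ptr %x, i32 %e monotonic
// ```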
8217
8218OpenMPIRBuilder::InsertPointTy
8219OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
8220 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
8221 Value *NumTeamsUpper, Value *ThreadLimit,
8222 Value *IfExpr) {
8223 if (!updateToLocation(Loc))
8224 return InsertPointTy();
8225
8226 uint32_t SrcLocStrSize;
8227 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8228 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8229 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
8230
8231 // Outer allocation basicblock is the entry block of the current function.
8232 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
8233 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
8234 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
8235 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
8236 }
8237
8238 // The current basic block is split into four basic blocks. After outlining,
8239 // they will be mapped as follows:
8240 // ```
8241 // def current_fn() {
8242 // current_basic_block:
8243 // br label %teams.exit
8244 // teams.exit:
8245 // ; instructions after teams
8246 // }
8247 //
8248 // def outlined_fn() {
8249 // teams.alloca:
8250 // br label %teams.body
8251 // teams.body:
8252 // ; instructions within teams body
8253 // }
8254 // ```
8255 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
8256 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
8257 BasicBlock *AllocaBB =
8258 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
8259
8260 bool SubClausesPresent =
8261 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
8262 // Push num_teams
8263 if (!Config.isTargetDevice() && SubClausesPresent) {
8264 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
8265 "if lowerbound is non-null, then upperbound must also be non-null "
8266 "for bounds on num_teams");
8267
8268 if (NumTeamsUpper == nullptr)
8269 NumTeamsUpper = Builder.getInt32(0);
8270
8271 if (NumTeamsLower == nullptr)
8272 NumTeamsLower = NumTeamsUpper;
8273
8274 if (IfExpr) {
8275 assert(IfExpr->getType()->isIntegerTy() &&
8276 "argument to if clause must be an integer value");
8277
8278 // upper = ifexpr ? upper : 1
8279 if (IfExpr->getType() != Int1)
8280 IfExpr = Builder.CreateICmpNE(IfExpr,
8281 ConstantInt::get(IfExpr->getType(), 0));
8282 NumTeamsUpper = Builder.CreateSelect(
8283 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
8284
8285 // lower = ifexpr ? lower : 1
8286 NumTeamsLower = Builder.CreateSelect(
8287 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
8288 }
8289
8290 if (ThreadLimit == nullptr)
8291 ThreadLimit = Builder.getInt32(0);
8292
8293 Value *ThreadNum = getOrCreateThreadID(Ident);
8294 Builder.CreateCall(
8295 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
8296 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
8297 }
8298 // Generate the body of teams.
8299 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
8300 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
8301 BodyGenCB(AllocaIP, CodeGenIP);
8302
8303 OutlineInfo OI;
8304 OI.EntryBB = AllocaBB;
8305 OI.ExitBB = ExitBB;
8306 OI.OuterAllocaBB = &OuterAllocaBB;
8307
8308 // Insert fake values for global tid and bound tid.
8309 SmallVector<Instruction *, 8> ToBeDeleted;
8310 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
8311 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8312 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
8313 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8314 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
8315
8316 auto HostPostOutlineCB = [this, Ident,
8317 ToBeDeleted](Function &OutlinedFn) mutable {
8318 // The stale call instruction will be replaced with a new call instruction
8319 // for runtime call with the outlined function.
8320
8321 assert(OutlinedFn.getNumUses() == 1 &&
8322 "there must be a single user for the outlined function");
8323 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8324 ToBeDeleted.push_back(StaleCI);
8325
8326 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
8327 "Outlined function must have two or three arguments only");
8328
8329 bool HasShared = OutlinedFn.arg_size() == 3;
8330
8331 OutlinedFn.getArg(0)->setName("global.tid.ptr");
8332 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
8333 if (HasShared)
8334 OutlinedFn.getArg(2)->setName("data");
8335
8336 // Call to the runtime function for teams in the current function.
8337 assert(StaleCI && "Error while outlining - no CallInst user found for the "
8338 "outlined function.");
8339 Builder.SetInsertPoint(StaleCI);
8340 SmallVector<Value *> Args = {
8341 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
8342 if (HasShared)
8343 Args.push_back(StaleCI->getArgOperand(2));
8344 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
8345 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
8346 Args);
8347
8348 for (Instruction *I : llvm::reverse(ToBeDeleted))
8349 I->eraseFromParent();
8350 };
8351
8352 if (!Config.isTargetDevice())
8353 OI.PostOutlineCB = HostPostOutlineCB;
8354
8355 addOutlineInfo(std::move(OI));
8356
8357 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
8358
8359 return Builder.saveIP();
8360}
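// On the host, the net effect is a pair of runtime calls around the outlined
// body (sketch, assuming num_teams/thread_limit clauses are present):
// ```
// call void @__kmpc_push_num_teams_51(ptr %ident, i32 %tid, i32 %lb, i32 %ub,
//                                     i32 %thread_limit)
// call void @__kmpc_fork_teams(ptr %ident, i32 %nargs, ptr @outlined_fn, ...)
// ```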
8361
8362GlobalVariable *
8363OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
8364 std::string VarName) {
8365 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
8366 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
8367 Names.size()),
8368 Names);
8369 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
8370 M, MapNamesArrayInit->getType(),
8371 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
8372 VarName);
8373 return MapNamesArrayGlobal;
8374}
8375
8376// Create all simple and struct types exposed by the runtime and remember
8377// the llvm::PointerTypes of them for easy access later.
8378void OpenMPIRBuilder::initializeTypes(Module &M) {
8379 LLVMContext &Ctx = M.getContext();
8380 StructType *T;
8381#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
8382#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
8383 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
8384 VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
8385#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
8386 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
8387 VarName##Ptr = PointerType::getUnqual(VarName);
8388#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
8389 T = StructType::getTypeByName(Ctx, StructName); \
8390 if (!T) \
8391 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
8392 VarName = T; \
8393 VarName##Ptr = PointerType::getUnqual(T);
8394#include "llvm/Frontend/OpenMP/OMPKinds.def"
8395}
8396
8397void OpenMPIRBuilder::OutlineInfo::collectBlocks(
8398 SmallPtrSetImpl<BasicBlock *> &BlockSet,
8399 SmallVectorImpl<BasicBlock *> &BlockVector) {
8400 SmallVector<BasicBlock *, 32> Worklist;
8401 BlockSet.insert(EntryBB);
8402 BlockSet.insert(ExitBB);
8403
8404 Worklist.push_back(EntryBB);
8405 while (!Worklist.empty()) {
8406 BasicBlock *BB = Worklist.pop_back_val();
8407 BlockVector.push_back(BB);
8408 for (BasicBlock *SuccBB : successors(BB))
8409 if (BlockSet.insert(SuccBB).second)
8410 Worklist.push_back(SuccBB);
8411 }
8412}
8413
8414void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
8415 uint64_t Size, int32_t Flags,
8416 GlobalValue::LinkageTypes,
8417 StringRef Name) {
8418 if (!Config.isGPU()) {
8419 llvm::offloading::emitOffloadingEntry(
8420 M, ID, Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0,
8421 "omp_offloading_entries");
8422 return;
8423 }
8424 // TODO: Add support for global variables on the device after declare target
8425 // support.
8426 Function *Fn = dyn_cast<Function>(Addr);
8427 if (!Fn)
8428 return;
8429
8430 Module &M = *(Fn->getParent());
8431 LLVMContext &Ctx = M.getContext();
8432
8433 // Get "nvvm.annotations" metadata node.
8434 NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
8435
8436 Metadata *MDVals[] = {
8437 ConstantAsMetadata::get(Fn), MDString::get(Ctx, "kernel"),
8438 ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Ctx), 1))};
8439 // Append metadata to nvvm.annotations.
8440 MD->addOperand(MDNode::get(Ctx, MDVals));
8441
8442 // Add a function attribute for the kernel.
8443 Fn->addFnAttr(Attribute::get(Ctx, "kernel"));
8444 if (T.isAMDGCN())
8445 Fn->addFnAttr("uniform-work-group-size", "true");
8446 Fn->addFnAttr(Attribute::MustProgress);
8447}
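// For a kernel function on an NVPTX target this amounts to (illustrative
// names; the function symbol is hypothetical):
// ```
// !nvvm.annotations = !{!0}
// !0 = !{ptr @__omp_offloading_xxx_foo_l7, !"kernel", i32 1}
// ```
// plus the "kernel" function attribute (and, on AMDGCN, a
// "uniform-work-group-size"="true" attribute).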
8448
8449 // We only generate metadata for functions that contain target regions.
8450void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
8451 EmitMetadataErrorReportFunctionTy &ErrorFn) {
8452
8453 // If there are no entries, we don't need to do anything.
8454 if (OffloadInfoManager.empty())
8455 return;
8456
8457 LLVMContext &C = M.getContext();
8458 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
8459 TargetRegionEntryInfo>,
8460 16>
8461 OrderedEntries(OffloadInfoManager.size());
8462
8463 // Auxiliary methods to create metadata values and strings.
8464 auto &&GetMDInt = [this](unsigned V) {
8465 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
8466 };
8467
8468 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
8469
8470 // Create the offloading info metadata node.
8471 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
8472 auto &&TargetRegionMetadataEmitter =
8473 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
8474 const TargetRegionEntryInfo &EntryInfo,
8475 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
8476 // Generate metadata for target regions. Each entry of this metadata
8477 // contains:
8478 // - Entry 0 -> Kind of this type of metadata (0).
8479 // - Entry 1 -> Device ID of the file where the entry was identified.
8480 // - Entry 2 -> File ID of the file where the entry was identified.
8481 // - Entry 3 -> Mangled name of the function where the entry was
8482 // identified.
8483 // - Entry 4 -> Line in the file where the entry was identified.
8484 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
8485 // - Entry 6 -> Order the entry was created.
8486 // The first element of the metadata node is the kind.
8487 Metadata *Ops[] = {
8488 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
8489 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
8490 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
8491 GetMDInt(E.getOrder())};
8492
8493 // Save this entry in the right position of the ordered entries array.
8494 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
8495
8496 // Add metadata to the named metadata node.
8497 MD->addOperand(MDNode::get(C, Ops));
8498 };
8499
8500 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
8501
8502 // Create a function that emits metadata for each device global variable
8502 // entry.
8503 auto &&DeviceGlobalVarMetadataEmitter =
8504 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
8505 StringRef MangledName,
8506 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
8507 // Generate metadata for global variables. Each entry of this metadata
8508 // contains:
8509 // - Entry 0 -> Kind of this type of metadata (1).
8510 // - Entry 1 -> Mangled name of the variable.
8511 // - Entry 2 -> Declare target kind.
8512 // - Entry 3 -> Order the entry was created.
8513 // The first element of the metadata node is the kind.
8514 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
8515 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
8516
8517 // Save this entry in the right position of the ordered entries array.
8518 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
8519 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
8520
8521 // Add metadata to the named metadata node.
8522 MD->addOperand(MDNode::get(C, Ops));
8523 };
8524
8525 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
8526 DeviceGlobalVarMetadataEmitter);
8527
8528 for (const auto &E : OrderedEntries) {
8529 assert(E.first && "All ordered entries must exist!");
8530 if (const auto *CE =
8531 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
8532 E.first)) {
8533 if (!CE->getID() || !CE->getAddress()) {
8534 // Do not blame the entry if the parent function is not emitted.
8535 TargetRegionEntryInfo EntryInfo = E.second;
8536 StringRef FnName = EntryInfo.ParentName;
8537 if (!M.getNamedValue(FnName))
8538 continue;
8539 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
8540 continue;
8541 }
8542 createOffloadEntry(CE->getID(), CE->getAddress(),
8543 /*Size=*/0, CE->getFlags(),
8544 GlobalValue::WeakAnyLinkage);
8545 } else if (const auto *CE = dyn_cast<
8546 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
8547 E.first)) {
8548 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
8549 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
8550 CE->getFlags());
8551 switch (Flags) {
8552 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
8553 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
8554 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
8555 continue;
8556 if (!CE->getAddress()) {
8557 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
8558 continue;
8559 }
8560 // The variable has no definition - no need to add the entry.
8561 if (CE->getVarSize() == 0)
8562 continue;
8563 break;
8564 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
8565 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
8566 (!Config.isTargetDevice() && CE->getAddress())) &&
8567 "Declare target link address is set.");
8568 if (Config.isTargetDevice())
8569 continue;
8570 if (!CE->getAddress()) {
8571 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
8572 continue;
8573 }
8574 break;
8575 default:
8576 break;
8577 }
8578
8579 // Hidden or internal symbols on the device are not externally visible.
8580 // We should not attempt to register them by creating an offloading
8581 // entry. Indirect variables are handled separately on the device.
8582 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
8583 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
8584 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
8585 continue;
8586
8587 // Indirect globals need to use a special name that doesn't match the name
8588 // of the associated host global.
8589 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
8590 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8591 Flags, CE->getLinkage(), CE->getVarName());
8592 else
8593 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
8594 Flags, CE->getLinkage());
8595
8596 } else {
8597 llvm_unreachable("Unsupported entry kind.");
8598 }
8599 }
8600
8601 // Emit requires directive globals to a special entry so the runtime can
8602 // register them when the device image is loaded.
8603 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
8604 // entries should be redesigned to better suit this use-case.
8605 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
8606 offloading::emitOffloadingEntry(
8607 M, Constant::getNullValue(PointerType::getUnqual(M.getContext())),
8608 /*Name=*/"",
8609 /*Size=*/0, OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
8610 Config.getRequiresFlags(), "omp_offloading_entries");
8611}
8612
8613void TargetRegionEntryInfo::getTargetRegionEntryFnName(
8614 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
8615 unsigned FileID, unsigned Line, unsigned Count) {
8616 raw_svector_ostream OS(Name);
8617 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
8618 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
8619 if (Count)
8620 OS << "_" << Count;
8621}
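// For example (hypothetical values), DeviceID 0x10302, FileID 0xabcdef,
// parent function "foo" and line 42 produce:
//   __omp_offloading_10302_abcdef_foo_l42
// and a second region at the same location (Count == 1) gets the suffix "_1".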
8622
8623void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
8624 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
8625 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
8626 TargetRegionEntryInfo::getTargetRegionEntryFnName(
8627 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
8628 EntryInfo.Line, NewCount);
8629}
8630
8631TargetRegionEntryInfo
8632OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
8633 StringRef ParentName) {
8634 sys::fs::UniqueID ID;
8635 auto FileIDInfo = CallBack();
8636 if (auto EC = sys::fs::getUniqueID(std::get<0>(FileIDInfo), ID)) {
8637 report_fatal_error(("Unable to get unique ID for file, during "
8638 "getTargetEntryUniqueInfo, error message: " +
8639 EC.message())
8640 .c_str());
8641 }
8642
8643 return TargetRegionEntryInfo(ParentName, ID.getDevice(), ID.getFile(),
8644 std::get<1>(FileIDInfo));
8645}
8646
8647unsigned OpenMPIRBuilder::getFlagMemberOffset() {
8648 unsigned Offset = 0;
8649 for (uint64_t Remain =
8650 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8651 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
8652 !(Remain & 1); Remain = Remain >> 1)
8653 Offset++;
8654 return Offset;
8655}
8656
8657omp::OpenMPOffloadMappingFlags
8658OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
8659 // Rotate by getFlagMemberOffset() bits.
8660 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
8661 << getFlagMemberOffset());
8662}
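// Sketch of the encoding, assuming the usual 16-bit MEMBER_OF field in the
// topmost bits of the 64-bit map type: getFlagMemberOffset() returns 48, so
// getMemberOfFlag(0) yields 1 << 48, i.e. "member of map entry number 1".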
8663
8664void OpenMPIRBuilder::setCorrectMemberOfFlag(
8665 omp::OpenMPOffloadMappingFlags &Flags,
8666 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
8667 // If the entry is PTR_AND_OBJ but has not been marked with the special
8668 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
8669 // marked as MEMBER_OF.
8670 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8671 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
8672 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
8673 (Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
8674 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
8675 return;
8676
8677 // Reset the placeholder value to prepare the flag for the assignment of the
8678 // proper MEMBER_OF value.
8679 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
8680 Flags |= MemberOfFlag;
8681}
8682
8683Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
8684 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
8685 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
8686 bool IsDeclaration, bool IsExternallyVisible,
8687 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8688 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8689 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
8690 std::function<Constant *()> GlobalInitializer,
8691 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
8692 // TODO: convert this to utilise the IRBuilder Config rather than
8693 // a passed down argument.
8694 if (OpenMPSIMD)
8695 return nullptr;
8696
8697 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
8698 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
8699 CaptureClause ==
8700 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
8701 Config.hasRequiresUnifiedSharedMemory())) {
8702 SmallString<64> PtrName;
8703 {
8704 raw_svector_ostream OS(PtrName);
8705 OS << MangledName;
8706 if (!IsExternallyVisible)
8707 OS << format("_%x", EntryInfo.FileID);
8708 OS << "_decl_tgt_ref_ptr";
8709 }
8710
8711 Value *Ptr = M.getNamedValue(PtrName);
8712
8713 if (!Ptr) {
8714 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
8715 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
8716
8717 auto *GV = cast<GlobalVariable>(Ptr);
8718 GV->setLinkage(GlobalValue::WeakAnyLinkage);
8719
8720 if (!Config.isTargetDevice()) {
8721 if (GlobalInitializer)
8722 GV->setInitializer(GlobalInitializer());
8723 else
8724 GV->setInitializer(GlobalValue);
8725 }
8726
8727 registerTargetGlobalVariable(
8728 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8729 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8730 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
8731 }
8732
8733 return cast<Constant>(Ptr);
8734 }
8735
8736 return nullptr;
8737}
8738
8739void OpenMPIRBuilder::registerTargetGlobalVariable(
8740 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
8741 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
8742 bool IsDeclaration, bool IsExternallyVisible,
8743 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
8744 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
8745 std::vector<Triple> TargetTriple,
8746 std::function<Constant *()> GlobalInitializer,
8747 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
8748 Constant *Addr) {
8749 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
8750 (TargetTriple.empty() && !Config.isTargetDevice()))
8751 return;
8752
8753 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
8754 StringRef VarName;
8755 int64_t VarSize;
8756 GlobalValue::LinkageTypes Linkage;
8757
8758 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
8759 CaptureClause ==
8760 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
8761 !Config.hasRequiresUnifiedSharedMemory()) {
8762 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
8763 VarName = MangledName;
8764 GlobalValue *LlvmVal = M.getNamedValue(VarName);
8765
8766 if (!IsDeclaration)
8767 VarSize = divideCeil(
8768 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
8769 else
8770 VarSize = 0;
8771 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
8772
8773 // This is a workaround carried over from Clang which prevents undesired
8774 // optimisation of internal variables.
8775 if (Config.isTargetDevice() &&
8776 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
8777 // Do not create a "ref-variable" if the original is not also available
8778 // on the host.
8779 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
8780 return;
8781
8782 std::string RefName = createPlatformSpecificName({VarName, "ref"});
8783
8784 if (!M.getNamedValue(RefName)) {
8785 Constant *AddrRef =
8786 getOrCreateInternalVariable(Addr->getType(), RefName);
8787 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
8788 GvAddrRef->setConstant(true);
8789 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
8790 GvAddrRef->setInitializer(Addr);
8791 GeneratedRefs.push_back(GvAddrRef);
8792 }
8793 }
8794 } else {
8795 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
8796 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
8797 else
8798 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
8799
8800 if (Config.isTargetDevice()) {
8801 VarName = (Addr) ? Addr->getName() : "";
8802 Addr = nullptr;
8803 } else {
8804 Addr = getAddrOfDeclareTargetVar(
8805 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
8806 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
8807 LlvmPtrTy, GlobalInitializer, VariableLinkage);
8808 VarName = (Addr) ? Addr->getName() : "";
8809 }
8810 VarSize = M.getDataLayout().getPointerSize();
8811 Linkage = GlobalValue::WeakAnyLinkage;
8812 }
8813
8814 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
8815 Flags, Linkage);
8816}
8817
8818/// Loads all the offload entries information from the host IR
8819/// metadata.
8820void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
8821 // If we are in target mode, load the metadata from the host IR. This code has
8822 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
8823
8824 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
8825 if (!MD)
8826 return;
8827
8828 for (MDNode *MN : MD->operands()) {
8829 auto &&GetMDInt = [MN](unsigned Idx) {
8830 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
8831 return cast<ConstantInt>(V->getValue())->getZExtValue();
8832 };
8833
8834 auto &&GetMDString = [MN](unsigned Idx) {
8835 auto *V = cast<MDString>(MN->getOperand(Idx));
8836 return V->getString();
8837 };
8838
8839 switch (GetMDInt(0)) {
8840 default:
8841 llvm_unreachable("Unexpected metadata!");
8842 break;
8843 case OffloadEntriesInfoManager::OffloadEntryInfo::
8844 OffloadingEntryInfoTargetRegion: {
8845 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
8846 /*DeviceID=*/GetMDInt(1),
8847 /*FileID=*/GetMDInt(2),
8848 /*Line=*/GetMDInt(4),
8849 /*Count=*/GetMDInt(5));
8850 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
8851 /*Order=*/GetMDInt(6));
8852 break;
8853 }
8854 case OffloadEntriesInfoManager::OffloadEntryInfo::
8855 OffloadingEntryInfoDeviceGlobalVar:
8856 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
8857 /*MangledName=*/GetMDString(1),
8858 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
8859 /*Flags=*/GetMDInt(2)),
8860 /*Order=*/GetMDInt(3));
8861 break;
8862 }
8863 }
8864}
8865
8866void OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef HostFilePath) {
8867 if (HostFilePath.empty())
8868 return;
8869
8870 auto Buf = MemoryBuffer::getFile(HostFilePath);
8871 if (std::error_code Err = Buf.getError()) {
8872 report_fatal_error(("error opening host file from host file path inside of "
8873 "OpenMPIRBuilder: " +
8874 Err.message())
8875 .c_str());
8876 }
8877
8878 LLVMContext Ctx;
8879 auto M = expectedToErrorOrAndEmitErrors(
8880 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
8881 if (std::error_code Err = M.getError()) {
8883 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
8884 .c_str());
8885 }
8886
8887 loadOffloadInfoMetadata(*M.get());
8888}
8889
8890//===----------------------------------------------------------------------===//
8891// OffloadEntriesInfoManager
8892//===----------------------------------------------------------------------===//
8893
8894bool OffloadEntriesInfoManager::empty() const {
8895 return OffloadEntriesTargetRegion.empty() &&
8896 OffloadEntriesDeviceGlobalVar.empty();
8897}
8898
8899unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
8900 const TargetRegionEntryInfo &EntryInfo) const {
8901 auto It = OffloadEntriesTargetRegionCount.find(
8902 getTargetRegionEntryCountKey(EntryInfo));
8903 if (It == OffloadEntriesTargetRegionCount.end())
8904 return 0;
8905 return It->second;
8906}
8907
8908void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
8909 const TargetRegionEntryInfo &EntryInfo) {
8910 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
8911 EntryInfo.Count + 1;
8912}
8913
8914/// Initialize target region entry.
8915void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
8916 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
8917 OffloadEntriesTargetRegion[EntryInfo] =
8918 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
8919 OMPTargetRegionEntryTargetRegion);
8920 ++OffloadingEntriesNum;
8921}
8922
8923void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
8924 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
8925 OMPTargetRegionEntryKind Flags) {
8926 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
8927
8928 // Update the EntryInfo with the next available count for this location.
8929 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8930
8931 // If we are emitting code for a target, the entry is already initialized,
8932 // only has to be registered.
8933 if (OMPBuilder->Config.isTargetDevice()) {
8934 // This could happen if the device compilation is invoked standalone.
8935 if (!hasTargetRegionEntryInfo(EntryInfo)) {
8936 return;
8937 }
8938 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
8939 Entry.setAddress(Addr);
8940 Entry.setID(ID);
8941 Entry.setFlags(Flags);
8942 } else {
8943 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
8944 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
8945 return;
8946 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
8947 "Target region entry already registered!");
8948 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
8949 OffloadEntriesTargetRegion[EntryInfo] = Entry;
8950 ++OffloadingEntriesNum;
8951 }
8952 incrementTargetRegionEntryInfoCount(EntryInfo);
8953}
8954
8955bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
8956 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
8957
8958 // Update the EntryInfo with the next available count for this location.
8959 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
8960
8961 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
8962 if (It == OffloadEntriesTargetRegion.end()) {
8963 return false;
8964 }
8965 // Fail if this entry is already registered.
8966 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
8967 return false;
8968 return true;
8969}
8970
8971void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
8972 const OffloadTargetRegionEntryInfoActTy &Action) {
8973 // Scan all target region entries and perform the provided action.
8974 for (const auto &It : OffloadEntriesTargetRegion) {
8975 Action(It.first, It.second);
8976 }
8977}
8978
8979void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
8980 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
8981 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
8982 ++OffloadingEntriesNum;
8983}
8984
8985void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
8986 StringRef VarName, Constant *Addr, int64_t VarSize,
8987 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
8988 if (OMPBuilder->Config.isTargetDevice()) {
8989 // This could happen if the device compilation is invoked standalone.
8990 if (!hasDeviceGlobalVarEntryInfo(VarName))
8991 return;
8992 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
8993 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
8994 if (Entry.getVarSize() == 0) {
8995 Entry.setVarSize(VarSize);
8996 Entry.setLinkage(Linkage);
8997 }
8998 return;
8999 }
9000 Entry.setVarSize(VarSize);
9001 Entry.setLinkage(Linkage);
9002 Entry.setAddress(Addr);
9003 } else {
9004 if (hasDeviceGlobalVarEntryInfo(VarName)) {
9005 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
9006 assert(Entry.isValid() && Entry.getFlags() == Flags &&
9007 "Entry not initialized!");
9008 if (Entry.getVarSize() == 0) {
9009 Entry.setVarSize(VarSize);
9010 Entry.setLinkage(Linkage);
9011 }
9012 return;
9013 }
9014 if (Flags == OMPTargetGlobalVarEntryIndirect)
9015 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
9016 Addr, VarSize, Flags, Linkage,
9017 VarName.str());
9018 else
9019 OffloadEntriesDeviceGlobalVar.try_emplace(
9020 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
9021 ++OffloadingEntriesNum;
9022 }
9023}
9024
9025void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
9026 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
9027 // Scan all device global variable entries and perform the provided action.
9028 for (const auto &E : OffloadEntriesDeviceGlobalVar)
9029 Action(E.getKey(), E.getValue());
9030}
9031
9032//===----------------------------------------------------------------------===//
9033// CanonicalLoopInfo
9034//===----------------------------------------------------------------------===//
9035
9036void CanonicalLoopInfo::collectControlBlocks(
9037 SmallVectorImpl<BasicBlock *> &BBs) {
9038 // We only count those BBs as control block for which we do not need to
9039 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
9040 // flow. For consistency, this also means we do not add the Body block, which
9041 // is just the entry to the body code.
9042 BBs.reserve(BBs.size() + 6);
9043 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
9044}
9045
9046BasicBlock *CanonicalLoopInfo::getPreheader() const {
9047 assert(isValid() && "Requires a valid canonical loop");
9048 for (BasicBlock *Pred : predecessors(Header)) {
9049 if (Pred != Latch)
9050 return Pred;
9051 }
9052 llvm_unreachable("Missing preheader");
9053}
9054
9055void CanonicalLoopInfo::setTripCount(Value *TripCount) {
9056 assert(isValid() && "Requires a valid canonical loop");
9057
9058 Instruction *CmpI = &getCond()->front();
9059 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
9060 CmpI->setOperand(1, TripCount);
9061
9062#ifndef NDEBUG
9063 assertOK();
9064#endif
9065}
9066
9067void CanonicalLoopInfo::mapIndVar(
9068 llvm::function_ref<Value *(Instruction *)> Updater) {
9069 assert(isValid() && "Requires a valid canonical loop");
9070
9071 Instruction *OldIV = getIndVar();
9072
9073 // Record all uses excluding those introduced by the updater. Uses by the
9074 // CanonicalLoopInfo itself to keep track of the number of iterations are
9075 // excluded.
9076 SmallVector<Use *> ReplacableUses;
9077 for (Use &U : OldIV->uses()) {
9078 auto *User = dyn_cast<Instruction>(U.getUser());
9079 if (!User)
9080 continue;
9081 if (User->getParent() == getCond())
9082 continue;
9083 if (User->getParent() == getLatch())
9084 continue;
9085 ReplacableUses.push_back(&U);
9086 }
9087
9088 // Run the updater that may introduce new uses
9089 Value *NewIV = Updater(OldIV);
9090
9091 // Replace the old uses with the value returned by the updater.
9092 for (Use *U : ReplacableUses)
9093 U->set(NewIV);
9094
9095#ifndef NDEBUG
9096 assertOK();
9097#endif
9098}
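// A typical updater (hypothetical sketch) rescales the canonical 0..N-1
// induction variable into a user-visible one with a start and step:
// ```
// CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//   Builder.SetInsertPoint(CLI->getBody(),
//                          CLI->getBody()->getFirstInsertionPt());
//   return Builder.CreateAdd(Builder.CreateMul(OldIV, Step), Start);
// });
// ```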
9099
9100void CanonicalLoopInfo::assertOK() const {
9101#ifndef NDEBUG
9102 // No constraints if this object currently does not describe a loop.
9103 if (!isValid())
9104 return;
9105
9106 BasicBlock *Preheader = getPreheader();
9107 BasicBlock *Body = getBody();
9108 BasicBlock *After = getAfter();
9109
9110 // Verify standard control-flow we use for OpenMP loops.
9111 assert(Preheader);
9112 assert(isa<BranchInst>(Preheader->getTerminator()) &&
9113 "Preheader must terminate with unconditional branch");
9114 assert(Preheader->getSingleSuccessor() == Header &&
9115 "Preheader must jump to header");
9116
9117 assert(Header);
9118 assert(isa<BranchInst>(Header->getTerminator()) &&
9119 "Header must terminate with unconditional branch");
9120 assert(Header->getSingleSuccessor() == Cond &&
9121 "Header must jump to exiting block");
9122
9123 assert(Cond);
9124 assert(Cond->getSinglePredecessor() == Header &&
9125 "Exiting block only reachable from header");
9126
9127 assert(isa<BranchInst>(Cond->getTerminator()) &&
9128 "Exiting block must terminate with conditional branch");
9129 assert(size(successors(Cond)) == 2 &&
9130 "Exiting block must have two successors");
9131 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
9132 "Exiting block's first successor jump to the body");
9133 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
9134 "Exiting block's second successor must exit the loop");
9135
9136 assert(Body);
9137 assert(Body->getSinglePredecessor() == Cond &&
9138 "Body only reachable from exiting block");
9139 assert(!isa<PHINode>(Body->front()));
9140
9141 assert(Latch);
9142 assert(isa<BranchInst>(Latch->getTerminator()) &&
9143 "Latch must terminate with unconditional branch");
9144 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
9145 // TODO: To support simple redirecting of the end of body code that has
9146 // multiple exits, introduce another auxiliary basic block like preheader and after.
9147 assert(Latch->getSinglePredecessor() != nullptr);
9148 assert(!isa<PHINode>(Latch->front()));
9149
9150 assert(Exit);
9151 assert(isa<BranchInst>(Exit->getTerminator()) &&
9152 "Exit block must terminate with unconditional branch");
9153 assert(Exit->getSingleSuccessor() == After &&
9154 "Exit block must jump to after block");
9155
9156 assert(After);
9157 assert(After->getSinglePredecessor() == Exit &&
9158 "After block only reachable from exit block");
9159 assert(After->empty() || !isa<PHINode>(After->front()));
9160
9161 Instruction *IndVar = getIndVar();
9162 assert(IndVar && "Canonical induction variable not found?");
9163 assert(isa<IntegerType>(IndVar->getType()) &&
9164 "Induction variable must be an integer");
9165 assert(cast<PHINode>(IndVar)->getParent() == Header &&
9166 "Induction variable must be a PHI in the loop header");
9167 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
9168 assert(
9169 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
9170 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
9171
9172 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
9173 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
9174 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
9175 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
9176 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
9177 ->isOne());
9178
9179 Value *TripCount = getTripCount();
9180 assert(TripCount && "Loop trip count not found?");
9181 assert(IndVar->getType() == TripCount->getType() &&
9182 "Trip count and induction variable must have the same type");
9183
9184 auto *CmpI = cast<CmpInst>(&Cond->front());
9185 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
9186 "Exit condition must be a signed less-than comparison");
9187 assert(CmpI->getOperand(0) == IndVar &&
9188 "Exit condition must compare the induction variable");
9189 assert(CmpI->getOperand(1) == TripCount &&
9190 "Exit condition must compare with the trip count");
9191#endif
9192}
9193
9194void CanonicalLoopInfo::invalidate() {
9195 Header = nullptr;
9196 Cond = nullptr;
9197 Latch = nullptr;
9198 Exit = nullptr;
9199}
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE() pulls the operator overloads used by LLVM_MARK_AS_BITMASK_EN...
Definition: BitmaskEnum.h:83
#define LLVM_MARK_AS_BITMASK_ENUM(LargestValue)
LLVM_MARK_AS_BITMASK_ENUM lets you opt in an individual enum type so you can perform bitwise operatio...
Definition: BitmaskEnum.h:42
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Addr
std::string Name
uint64_t Size
bool End
Definition: ELF_riscv.cpp:480
DenseMap< Block *, BlockRelaxAux > Blocks
Definition: ELF_riscv.cpp:507
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Rewrite Partial Register Uses
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
iv Induction Variable Users
Definition: IVUsers.cpp:48
static LVOptions Options
Definition: LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition: Lint.cpp:512
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
This file contains the declarations for metadata subclasses.
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Definition: OMPConstants.h:75
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Function * createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI)
Create an entry point for a target task with the following.
static void updateNVPTXMetadata(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static MDNode * getNVPTXMDNode(Function &Kernel, StringRef Name)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static void emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, Type *ParallelTaskPtr, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, Function *OutlinedFn, Constant *OutlinedFnID, ArrayRef< int32_t > NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, SmallVector< llvm::OpenMPIRBuilder::DependData > Dependencies={})
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Type *ParallelTaskPtr, Value *TripCount, Function &LoopBodyFn)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable from the function.
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
Class for arbitrary precision integers.
Definition: APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Definition: Instructions.h:61
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
Definition: Instructions.h:122
PointerType * getType() const
Overload to return most specific pointer type.
Definition: Instructions.h:97
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:115
unsigned getAddressSpace() const
Return the address space for the allocation.
Definition: Instructions.h:102
std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
Definition: Instructions.h:126
const Value * getArraySize() const
Get the number of elements allocated.
Definition: Instructions.h:93
A container for analyses that lazily runs them and caches their results.
Definition: PassManager.h:253
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
Definition: PassManager.h:467
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
iterator end() const
Definition: ArrayRef.h:154
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
iterator begin() const
Definition: ArrayRef.h:153
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
Class to represent array types.
Definition: DerivedTypes.h:371
static ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
Definition: Type.cpp:635
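A minimal usage sketch of this factory (a hypothetical helper; the element type and count are illustrative):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
// Build the type [16 x i32]; Ctx is an LLVMContext owned by the caller.
llvm::ArrayType *makeI32x16(llvm::LLVMContext &Ctx) {
  return llvm::ArrayType::get(llvm::Type::getInt32Ty(Ctx), 16);
}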
A function analysis which provides an AssumptionCache.
AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
Definition: Instructions.h:495
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
Definition: Instructions.h:644
an instruction that atomically reads a memory location, combines it with another value,...
Definition: Instructions.h:696
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
@ Add
*p = old + v
Definition: Instructions.h:712
@ FAdd
*p = old + v
Definition: Instructions.h:733
@ Min
*p = old <signed v ? old : v
Definition: Instructions.h:726
@ Or
*p = old | v
Definition: Instructions.h:720
@ Sub
*p = old - v
Definition: Instructions.h:714
@ And
*p = old & v
Definition: Instructions.h:716
@ Xor
*p = old ^ v
Definition: Instructions.h:722
@ FSub
*p = old - v
Definition: Instructions.h:736
@ UIncWrap
Increment one up to a maximum value.
Definition: Instructions.h:748
@ Max
*p = old >signed v ? old : v
Definition: Instructions.h:724
@ UMin
*p = old <unsigned v ? old : v
Definition: Instructions.h:730
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
Definition: Instructions.h:744
@ UMax
*p = old >unsigned v ? old : v
Definition: Instructions.h:728
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
Definition: Instructions.h:740
@ UDecWrap
Decrement one until a minimum value or zero.
Definition: Instructions.h:752
@ Nand
*p = ~(old & v)
Definition: Instructions.h:718
AttrBuilder & addAttribute(Attribute::AttrKind Val)
Add an attribute to the builder.
AttrBuilder & removeAttribute(Attribute::AttrKind Val)
Remove an attribute from the builder.
AttributeSet getFnAttrs() const
The function attributes are returned.
AttributeList addFnAttributes(LLVMContext &C, const AttrBuilder &B) const
Add function attribute to the list.
Definition: Attributes.h:577
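The builder and list APIs above compose as follows; a minimal sketch, not taken from OMPIRBuilder.cpp, and the chosen attributes are illustrative:
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
// Mark Fn as nounwind and willreturn by rebuilding its attribute list.
void addRuntimeCallAttrs(llvm::Function &Fn) {
  llvm::AttrBuilder B(Fn.getContext());
  B.addAttribute(llvm::Attribute::NoUnwind)
      .addAttribute(llvm::Attribute::WillReturn);
  Fn.setAttributes(Fn.getAttributes().addFnAttributes(Fn.getContext(), B));
}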
AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
Definition: Attributes.cpp:866
AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
Definition: Attributes.cpp:851
static Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
Definition: Attributes.cpp:94
StringRef getValueAsString() const
Return the attribute's value as a string.
Definition: Attributes.cpp:392
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic block Old.
Definition: BasicBlock.cpp:662
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI instruction.
Definition: BasicBlock.cpp:416
reverse_iterator rbegin()
Definition: BasicBlock.h:464
bool empty() const
Definition: BasicBlock.h:470
const Instruction * getFirstNonPHI() const
Returns a pointer to the first instruction in this block that is not a PHINode instruction.
Definition: BasicBlock.cpp:367
const Instruction & front() const
Definition: BasicBlock.h:471
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition: BasicBlock.h:212
BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
Definition: BasicBlock.cpp:577
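A sketch of splitBasicBlock (a hypothetical helper; assumes I belongs to BB):
#include "llvm/ADT/Twine.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
// Split BB before I; BB keeps the leading instructions and ends with an
// unconditional branch to the returned tail block.
llvm::BasicBlock *splitBefore(llvm::BasicBlock *BB, llvm::Instruction *I) {
  return BB->splitBasicBlock(I->getIterator(), BB->getName() + ".split");
}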
const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
Definition: BasicBlock.cpp:497
const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
Definition: BasicBlock.cpp:459
InstListType::reverse_iterator reverse_iterator
Definition: BasicBlock.h:179
const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
Definition: BasicBlock.cpp:467
const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
Definition: BasicBlock.cpp:489
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
Definition: BasicBlock.cpp:279
reverse_iterator rend()
Definition: BasicBlock.h:466
const Instruction * getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
Definition: BasicBlock.cpp:386
InstListType::iterator iterator
Instruction iterators...
Definition: BasicBlock.h:177
LLVMContext & getContext() const
Get the context in which this basic block lives.
Definition: BasicBlock.cpp:168
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives in, right before MovePos.
Definition: BasicBlock.h:376
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well formed.
Definition: BasicBlock.h:239
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition: BasicBlock.h:631
const Instruction & back() const
Definition: BasicBlock.h:473
const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
Definition: BasicBlock.cpp:292
void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Definition: BasicBlock.cpp:516
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
Definition: InstrTypes.h:1465
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Definition: InstrTypes.h:1385
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
Definition: InstrTypes.h:1391
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ ICMP_SLT
signed less than
Definition: InstrTypes.h:786
@ ICMP_SLE
signed less or equal
Definition: InstrTypes.h:787
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ ICMP_UGT
unsigned greater than
Definition: InstrTypes.h:780
@ ICMP_SGT
signed greater than
Definition: InstrTypes.h:784
@ ICMP_ULT
unsigned less than
Definition: InstrTypes.h:782
@ ICMP_NE
not equal
Definition: InstrTypes.h:779
@ ICMP_ULE
unsigned less or equal
Definition: InstrTypes.h:783
A cache for the CodeExtractor analysis.
Definition: CodeExtractor.h:45
Utility class for extracting code into a new function.
Definition: CodeExtractor.h:84
void findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs, const ValueSet &Allocas) const
Compute the set of input values and output values for the code.
void findAllocas(const CodeExtractorAnalysisCache &CEAC, ValueSet &SinkCands, ValueSet &HoistCands, BasicBlock *&ExitBlock) const
Find the set of allocas whose life ranges are contained within the outlined region.
Function * extractCodeRegion(const CodeExtractorAnalysisCache &CEAC)
Perform the extraction, returning the new function.
bool isEligible() const
Test whether this code extractor is eligible.
void excludeArgFromAggregate(Value *Arg)
Exclude a value from aggregate argument passing when extracting a code region, passing it instead as a scalar.
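A minimal outlining sketch using the CodeExtractor entry points above (a hypothetical helper; assumes Blocks form a valid single-entry region inside one function):
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
// Outline the region into a fresh function, or return nullptr if the
// region is not extractable.
llvm::Function *outlineRegion(llvm::ArrayRef<llvm::BasicBlock *> Blocks) {
  llvm::CodeExtractor Extractor(Blocks);
  if (!Extractor.isEligible())
    return nullptr;
  llvm::CodeExtractorAnalysisCache CEAC(*Blocks.front()->getParent());
  return Extractor.extractCodeRegion(CEAC);
}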
static Constant * get(ArrayType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1292
static ConstantAsMetadata * get(Constant *C)
Definition: Metadata.h:528
static Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
Definition: Constants.cpp:2950
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching the ArrayRef passed in.
Definition: Constants.h:706
static Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
Definition: Constants.cpp:2227
static Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
Definition: Constants.cpp:2242
static Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
Definition: Constants.cpp:2307
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:850
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition: Constants.h:124
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:857
static ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
Definition: Constants.cpp:1800
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1357
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getAllOnesValue(Type *Ty)
Definition: Constants.cpp:417
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
Debug location.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:63
unsigned getDefaultGlobalsAddressSpace() const
Definition: DataLayout.h:252
Align getABIIntegerTypeAlignment(unsigned BitWidth) const
Returns the minimum ABI-required alignment for an integer type of the specified bitwidth.
Definition: DataLayout.h:491
unsigned getAllocaAddrSpace() const
Definition: DataLayout.h:234
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment padding.
Definition: DataLayout.h:462
unsigned getPointerSize(unsigned AS=0) const
Layout pointer size in bytes, rounded up to a whole number of bytes.
Definition: DataLayout.cpp:748
unsigned getIndexSizeInBits(unsigned AS) const
Size in bits of index used for address calculation in getelementptr.
Definition: DataLayout.h:378
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:622
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition: DataLayout.h:430
Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Definition: DataLayout.cpp:865
A debug info location.
Definition: DebugLoc.h:33
Analysis pass which computes a DominatorTree.
Definition: Dominators.h:279
DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Definition: Dominators.cpp:371
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:162
Lightweight error class with error context and mandatory checking.
Definition: Error.h:160
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Definition: DerivedTypes.h:168
Class to represent function types.
Definition: DerivedTypes.h:103
static FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition: Function.cpp:653
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition: Function.h:172
const BasicBlock & getEntryBlock() const
Definition: Function.h:807
bool empty() const
Definition: Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition: Function.h:214
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition: Function.cpp:465
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition: Function.cpp:384
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition: Function.cpp:769
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition: Function.cpp:781
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition: Function.h:357
const Function & getFunction() const
Definition: Function.h:170
iterator begin()
Definition: Function.h:851
arg_iterator arg_begin()
Definition: Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition: Function.h:360
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition: Function.cpp:380
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the list of attributes for the given arg.
Definition: Function.cpp:681
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition: Function.h:752
size_t arg_size() const
Definition: Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition: Function.h:219
iterator end()
Definition: Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition: Function.h:285
Argument * getArg(unsigned i) const
Definition: Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition: Value.h:589
void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
Definition: Metadata.cpp:1528
LinkageTypes getLinkage() const
Definition: GlobalValue.h:546
void setLinkage(LinkageTypes LT)
Definition: GlobalValue.h:537
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:656
void setDSOLocal(bool Local)
Definition: GlobalValue.h:303
PointerType * getType() const
Global values are always pointers.
Definition: GlobalValue.h:294
@ HiddenVisibility
The GV is hidden.
Definition: GlobalValue.h:68
@ ProtectedVisibility
The GV is protected.
Definition: GlobalValue.h:69
void setVisibility(VisibilityTypes V)
Definition: GlobalValue.h:254
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition: GlobalValue.h:60
@ CommonLinkage
Tentative definitions.
Definition: GlobalValue.h:62
@ InternalLinkage
Rename collisions when linking (static functions).
Definition: GlobalValue.h:59
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:57
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition: GlobalValue.h:56
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition: GlobalValue.h:58
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition: GlobalValue.h:55
Type * getValueType() const
Definition: GlobalValue.h:296
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:254
BasicBlock * getBlock() const
Definition: IRBuilder.h:269
bool isSet() const
Returns true if this insert point is set.
Definition: IRBuilder.h:267
BasicBlock::iterator getPoint() const
Definition: IRBuilder.h:270
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
Value * CreatePtrDiff(Type *ElemTy, Value *LHS, Value *RHS, const Twine &Name="")
Return the i64 difference between two pointer values, dividing out the size of the pointed-to objects...
Definition: IRBuilder.cpp:1107
Value * CreateICmpULT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2277
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1858
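A sketch combining CreateAtomicCmpXchg with the failure-ordering helper listed earlier (a hypothetical helper; Builder must have a valid insert point):
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
// Compare-and-swap *Ptr from Expected to Desired; returns the old value.
llvm::Value *emitCAS(llvm::IRBuilderBase &Builder, llvm::Value *Ptr,
                     llvm::Value *Expected, llvm::Value *Desired) {
  llvm::AtomicOrdering Success = llvm::AtomicOrdering::AcquireRelease;
  llvm::AtomicCmpXchgInst *CAS = Builder.CreateAtomicCmpXchg(
      Ptr, Expected, Desired, llvm::MaybeAlign(), Success,
      llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(Success));
  // cmpxchg yields a {old value, success flag} pair.
  return Builder.CreateExtractValue(CAS, 0);
}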
AllocaInst * CreateAlloca(Type *Ty, unsigned AddrSpace, Value *ArraySize=nullptr, const Twine &Name="")
Definition: IRBuilder.h:1790
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2543
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Value * CreateICmpSGT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2285
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition: IRBuilder.h:1824
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2059
UnreachableInst * CreateUnreachable()
Definition: IRBuilder.h:1280
Value * CreatePointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2190
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2536
CallInst * CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue, unsigned Alignment, Value *OffsetValue=nullptr)
Create an assume intrinsic call that represents an alignment assumption on the provided pointer.
Definition: IRBuilder.cpp:1307
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
BasicBlock::iterator GetInsertPoint() const
Definition: IRBuilder.h:172
Value * CreateStructGEP(Type *Ty, Value *Ptr, unsigned Idx, const Twine &Name="")
Definition: IRBuilder.h:1989
IntegerType * getIndexTy(const DataLayout &DL, unsigned AddrSpace)
Fetch the type of an integer that should be used to index GEP operations within AddressSpace.
Definition: IRBuilder.h:578
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2053
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2142
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
Value * CreateNSWAdd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1353
BasicBlock * GetInsertBlock() const
Definition: IRBuilder.h:171
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:528
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition: IRBuilder.h:1891
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2202
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1395
Value * CreateICmpNE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2265
IntegerType * getInt16Ty()
Fetch the type representing a 16-bit integer.
Definition: IRBuilder.h:518
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1883
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition: IRBuilder.h:1738
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:274
Constant * CreateGlobalStringPtr(StringRef Str, const Twine &Name="", unsigned AddressSpace=0, Module *M=nullptr, bool AddNull=true)
Same as CreateGlobalString, but return a pointer with "i8*" type instead of a pointer to array of i8.
Definition: IRBuilder.h:2012
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition: IRBuilder.h:483
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2386
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2417
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of cases that will be added.
Definition: IRBuilder.h:1160
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2261
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition: IRBuilder.h:142
DebugLoc getCurrentDebugLocation() const
Get location information used by debugging information.
Definition: IRBuilder.cpp:64
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1361
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2147
ConstantInt * getIntN(unsigned N, uint64_t C)
Get a constant N-bit value, zero extended or truncated from a 64-bit value.
Definition: IRBuilder.h:494
BranchInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition: IRBuilder.h:1137
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Definition: IRBuilder.h:1807
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition: IRBuilder.h:2041
LLVMContext & getContext() const
Definition: IRBuilder.h:173
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1492
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition: IRBuilder.h:1107
Value * CreateConstInBoundsGEP2_32(Type *Ty, Value *Ptr, unsigned Idx0, unsigned Idx1, const Twine &Name="")
Definition: IRBuilder.h:1930
Value * CreateConstInBoundsGEP2_64(Type *Ty, Value *Ptr, uint64_t Idx0, uint64_t Idx1, const Twine &Name="")
Definition: IRBuilder.h:1976
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1820
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1344
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2137
Value * CreateIsNotNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg != 0.
Definition: IRBuilder.h:2569
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Definition: IRBuilder.h:1871
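For comparison, a one-line atomic increment through CreateAtomicRMW (a hypothetical helper):
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
// Emit *Ptr += 1 atomically on an i32 counter with monotonic ordering.
llvm::Value *emitAtomicInc(llvm::IRBuilderBase &Builder, llvm::Value *Ptr) {
  return Builder.CreateAtomicRMW(llvm::AtomicRMWInst::Add, Ptr,
                                 Builder.getInt32(1), llvm::MaybeAlign(),
                                 llvm::AtomicOrdering::Monotonic);
}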
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition: IRBuilder.h:2027
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1514
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition: IRBuilder.h:566
BranchInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition: IRBuilder.h:1131
void ClearInsertionPoint()
Clear the insertion point: created instructions will not be inserted into a block.
Definition: IRBuilder.h:166
Value * CreateICmpSLT(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2293
ConstantInt * getInt16(uint16_t C)
Get a constant 16-bit value.
Definition: IRBuilder.h:478
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2273
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2216
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:286
Value * CreateIsNull(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg == 0.
Definition: IRBuilder.h:2564
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
Type * getVoidTy()
Fetch the type representing void.
Definition: IRBuilder.h:561
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition: IRBuilder.h:1843
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args=std::nullopt, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:2432
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition: IRBuilder.h:1473
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1536
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:2371
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition: IRBuilder.h:513
Value * CreateURem(Value *LHS, Value *RHS, const Twine &Name="")
Definition: IRBuilder.h:1421
CallInst * CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, uint64_t Size, bool isVolatile=false, MDNode *TBAATag=nullptr, MDNode *TBAAStructTag=nullptr, MDNode *ScopeTag=nullptr, MDNode *NoAliasTag=nullptr)
Create and insert a memcpy between the specified pointers.
Definition: IRBuilder.h:656
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition: IRBuilder.h:2074
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2152
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition: IRBuilder.h:1378
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location.
Definition: IRBuilder.h:2686
void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
Definition: Instruction.cpp:78
void moveBeforePreserving(Instruction *MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ordering.
unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
Definition: Instruction.h:466
InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
Definition: Instruction.cpp:92
static bool classof(const Value *V)
Methods for support type inquiry through isa, cast, and dyn_cast:
Definition: Instruction.h:938
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
Definition: Instruction.h:381
BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
Definition: Metadata.cpp:1642
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
Definition: Instruction.h:463
void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
static IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition: Type.cpp:266
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Definition: Instructions.h:239
Analysis pass that exposes the LoopInfo for a function.
Definition: LoopInfo.h:571
LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition: LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Definition: MDBuilder.cpp:120
Metadata node.
Definition: Metadata.h:1069
void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
Definition: Metadata.cpp:1077
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1550
const MDOperand & getOperand(unsigned I) const
Definition: Metadata.h:1430
ArrayRef< MDOperand > operands() const
Definition: Metadata.h:1428
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition: Metadata.h:1542
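The getDistinct/replaceOperandWith pair above is how self-referential loop IDs are built; a minimal sketch (the property string is illustrative):
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
// Build a distinct !llvm.loop node whose first operand points at itself.
llvm::MDNode *makeLoopID(llvm::LLVMContext &Ctx) {
  llvm::MDNode *Prop = llvm::MDNode::get(
      Ctx, {llvm::MDString::get(Ctx, "llvm.loop.unroll.enable")});
  llvm::MDNode *LoopID = llvm::MDNode::getDistinct(Ctx, {nullptr, Prop});
  LoopID->replaceOperandWith(0, LoopID); // Patch the self-reference.
  return LoopID;
}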
static MDString * get(LLVMContext &Context, StringRef Str)
Definition: Metadata.cpp:606
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
size_type size() const
Definition: MapVector.h:60
static ErrorOr< std::unique_ptr< MemoryBuffer > > getFile(const Twine &Filename, bool IsText=false, bool RequiresNullTerminator=true, bool IsVolatile=false, std::optional< Align > Alignment=std::nullopt)
Open the specified file as a MemoryBuffer, returning a new MemoryBuffer if successful,...
Root of the metadata hierarchy.
Definition: Metadata.h:62
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
NamedMDNode * getNamedMetadata(StringRef Name) const
Return the first NamedMDNode in the module with the specified name.
Definition: Module.cpp:262
LLVMContext & getContext() const
Get the global data context.
Definition: Module.h:299
Function * getFunction(StringRef Name) const
Look up the specified function in the module symbol table.
Definition: Module.cpp:193
StringRef getName() const
Get a short "name" for the module.
Definition: Module.h:282
const std::string & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition: Module.h:295
iterator_range< global_iterator > globals()
Definition: Module.h:699
const FunctionListType & getFunctionList() const
Get the Module's list of functions (constant).
Definition: Module.h:611
GlobalVariable * getGlobalVariable(StringRef Name) const
Look up the specified global variable in the module symbol table.
Definition: Module.h:444
GlobalValue * getNamedValue(StringRef Name) const
Return the global value in the module with the specified name, of arbitrary type.
Definition: Module.cpp:135
NamedMDNode * getOrInsertNamedMetadata(StringRef Name)
Return the named MDNode in the module with the specified name.
Definition: Module.cpp:269
const GlobalVariable * getNamedGlobal(StringRef Name) const
Return the global variable in the module with the specified name, of arbitrary type.
Definition: Module.h:459
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:291
A tuple of MDNodes.
Definition: Metadata.h:1730
iterator_range< op_iterator > operands()
Definition: Metadata.h:1826
void addOperand(MDNode *M)
Definition: Metadata.cpp:1394
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:243
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:245
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:376
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:378
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:296
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:298
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:287
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
Definition: OMPIRBuilder.h:356
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:362
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:368
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:366
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target 'link'.
Definition: OMPIRBuilder.h:360
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target 'to'.
Definition: OMPIRBuilder.h:358
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:432
bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (if set to true).
Definition: OMPIRBuilder.h:92
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:188
StringRef separator() const
Definition: OMPIRBuilder.h:174
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:164
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:105
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:147
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:141
void setHasRequiresUnifiedAddress(bool Value)
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:184
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:473
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:519
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for an OpenMP 'if' clause using the specified BodyGenCallbackTy. The logic: if (Cond) { ThenGen() } else { ElseGen() }.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for: X = Expr (only scalar data types).
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry from a provided filename and line number.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs (only scalar data types): V = X; X = X BinOp Expr, ...
void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the underlying module. Must be called before any other method, and only once.
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
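A minimal sketch against the signature listed here (assumes an initialized OMPBuilder and a Builder whose insert point marks where the loop belongs; error handling is omitted):
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
// Emit the skeleton of `for (i = 0; i < TripCount; ++i)` and fill the body
// through the callback.
llvm::CanonicalLoopInfo *
emitCountedLoop(llvm::OpenMPIRBuilder &OMPBuilder,
                llvm::IRBuilderBase &Builder, llvm::Value *TripCount) {
  auto BodyGen = [&](llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
                     llvm::Value *IV) {
    Builder.restoreIP(CodeGenIP);
    // ... emit the user body here; IV is the logical iteration number.
  };
  llvm::OpenMPIRBuilder::LocationDescription Loc(Builder);
  return OMPBuilder.createCanonicalLoop(Loc, BodyGen, TripCount);
}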
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
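A sketch emitting an explicit barrier at the current insert point, using the signature listed above (omp::Directive::OMPD_barrier selects the barrier kind; the remaining parameters keep their defaults):
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
// Emit '#pragma omp barrier' and continue inserting after it.
void emitBarrier(llvm::OpenMPIRBuilder &OMPBuilder,
                 llvm::IRBuilderBase &Builder) {
  llvm::OpenMPIRBuilder::LocationDescription Loc(Builder);
  Builder.restoreIP(
      OMPBuilder.createBarrier(Loc, llvm::omp::Directive::OMPD_barrier));
}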
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic read for: V = X (only scalar data types).
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be freed.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
InsertPointTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for #omp task
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' variables from the master copy to their threadprivate copies.
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationInfo objects.
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
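Continuing the canonical-loop sketch above: once a CanonicalLoopInfo exists, it can be lowered to a '#pragma omp for' with the default static schedule (a sketch; AllocaIP should point into the function's entry block):
#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
// Turn CLI into a workshare loop with an implicit trailing barrier.
llvm::OpenMPIRBuilder::InsertPointTy
lowerToWorkshare(llvm::OpenMPIRBuilder &OMPBuilder,
                 llvm::CanonicalLoopInfo *CLI,
                 llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
                 llvm::DebugLoc DL) {
  return OMPBuilder.applyWorkshareLoop(DL, CLI, AllocaIP,
                                       /*NeedsBarrier=*/true);
}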
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
IRBuilder<>::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
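A minimal call sketch; the BodyGenCB, PrivCB, and FiniCB callbacks are assumed to be defined by the caller (as in the builder's unit tests):
  IRBuilder<>::InsertPoint AfterIP = OMPBuilder.createParallel(
      Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
      /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
      /*IsCancellable=*/false);
  Builder.restoreIP(AfterIP); // continue emitting IR after the parallel region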
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns the __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:499
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Callback type used to generate a function with the given name.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates an offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
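A sketch that marks an existing canonical loop for vectorization with safelen(8); Loop and Ctx are assumed to exist:
  OMPBuilder.applySimd(Loop, /*AlignedVars=*/{}, /*IfCond=*/nullptr,
                       omp::OrderKind::OMP_ORDER_concurrent, /*Simdlen=*/nullptr,
                       /*Safelen=*/ConstantInt::get(Type::getInt32Ty(Ctx), 8));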
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
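For example (the flag name and value below are purely illustrative):
  OMPBuilder.createGlobalFlag(/*Value=*/1, "__example_rtl_flag");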
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
Value * getSizeInBytes(Value *BasePtr)
Computes the size of the type in bytes.
FunctionCallee createDispatchDeinitFunction()
Returns the __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for '#omp teams'.
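A hedged sketch passing only an upper bound on the number of teams; BodyGenCB and NumTeams are assumed to be provided by the caller:
  OMPBuilder.createTeams(Loc, BodyGenCB, /*NumTeamsLower=*/nullptr,
                         /*NumTeamsUpper=*/NumTeams, /*ThreadLimit=*/nullptr,
                         /*IfExpr=*/nullptr);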
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, ArrayRef< int32_t > NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={})
Generator for '#omp target'.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas we will attempt to raise on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP, SmallVector< OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
Definition: DerivedTypes.h:646
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
Analysis pass that exposes the ScalarEvolution for a function.
ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition: SetVector.h:57
bool empty() const
Determine if the SetVector is empty or not.
Definition: SetVector.h:93
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:347
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
Definition: SmallPtrSet.h:436
iterator end() const
Definition: SmallPtrSet.h:461
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:368
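The return value makes the usual visited-set idiom concise (Worklist is an illustrative SmallVector):
  SmallPtrSet<BasicBlock *, 8> Visited;
  if (Visited.insert(BB).second)
    Worklist.push_back(BB); // first time BB is seen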
iterator begin() const
Definition: SmallPtrSet.h:456
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:503
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition: SmallSet.h:135
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition: SmallSet.h:179
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition: SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition: SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
Definition: SmallString.h:254
bool empty() const
Definition: SmallVector.h:95
size_t size() const
Definition: SmallVector.h:92
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
reference emplace_back(ArgTypes &&... Args)
Definition: SmallVector.h:951
void reserve(size_type N)
Definition: SmallVector.h:677
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:697
void resize(size_type N)
Definition: SmallVector.h:652
void push_back(const T &Elt)
Definition: SmallVector.h:427
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
An instruction for storing to memory.
Definition: Instructions.h:290
void setAlignment(Align Align)
Definition: Instructions.h:333
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Definition: Instructions.h:360
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: StringMap.h:253
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:685
constexpr bool empty() const
empty - Check if the string is empty.
Definition: StringRef.h:134
constexpr size_t size() const
size - Get the string size.
Definition: StringRef.h:137
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition: StringRef.h:436
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition: StringRef.h:262
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition: StringRef.h:601
Class to represent struct types.
Definition: DerivedTypes.h:216
static StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition: Type.cpp:501
Multiway switch.
void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
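A small sketch building a two-case switch; Cond and the destination blocks are assumed to exist:
  SwitchInst *SI = Builder.CreateSwitch(Cond, DefaultBB, /*NumCases=*/2);
  SI->addCase(Builder.getInt32(0), CaseBB0);
  SI->addCase(Builder.getInt32(1), CaseBB1);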
Analysis pass providing the TargetTransformInfo.
Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(StringRef TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition: Triple.h:953
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition: Triple.h:1011
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition: Triple.h:1021
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
std::string str() const
Return the twine contents as a std::string.
Definition: Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getIntegerBitWidth() const
Type * getStructElementType(unsigned N) const
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:251
static IntegerType * getInt1Ty(LLVMContext &C)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
static Type * getVoidTy(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:184
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:224
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
static UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
Definition: Constants.cpp:1833
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition: UnrollLoop.h:127
bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition: UnrollLoop.h:143
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
void setOperand(unsigned i, Value *Val)
Definition: User.h:174
Value * getOperand(unsigned i) const
Definition: User.h:169
ValueT lookup(const KeyT &Val) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition: ValueMap.h:164
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
void setName(const Twine &Name)
Change the name of the value.
Definition: Value.cpp:377
void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition: Value.cpp:534
iterator_range< user_iterator > users()
Definition: Value.h:421
User * user_back()
Definition: Value.h:407
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition: Value.cpp:542
User * getUniqueUndroppableUser()
Return the single unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition: Value.cpp:179
bool use_empty() const
Definition: Value.h:344
user_iterator user_end()
Definition: Value.h:405
unsigned getNumUses() const
This method computes the number of uses of this Value.
Definition: Value.cpp:255
iterator_range< use_iterator > uses()
Definition: Value.h:376
StringRef getName() const
Return a constant reference to the value's name.
Definition: Value.cpp:309
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition: ilist_node.h:32
self_iterator getIterator()
Definition: ilist_node.h:132
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition: ilist_node.h:353
iterator insertAfter(iterator where, pointer New)
Definition: ilist.h:174
A raw_ostream that writes to an SmallVector or SmallString.
Definition: raw_ostream.h:691
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ Exit
Definition: COFF.h:827
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition: CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
Definition: CallingConv.h:200
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
void emitOffloadingEntry(Module &M, Constant *Addr, StringRef Name, uint64_t Size, int32_t Flags, int32_t Data, StringRef SectionName)
Create an offloading section struct used to register this global at runtime.
Definition: Utility.cpp:64
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
Definition: OMPConstants.h:252
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
NodeAddr< PhiNode * > Phi
Definition: RDFGraph.h:390
std::error_code getUniqueID(const Twine Path, UniqueID &Result)
Definition: Path.cpp:788
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition: STLExtras.h:853
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition: STLExtras.h:1680
Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition: STLExtras.h:2406
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition: Casting.h:649
auto successors(const MachineBasicBlock *BB)
AddressSpace
Definition: NVPTXBaseInfo.h:21
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition: Error.h:198
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition: STLExtras.h:2073
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition: STLExtras.h:656
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
Definition: BitcodeReader.h:66
bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant-expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
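A one-line illustration (the variable name is illustrative); since Log2_32(32) == 5, a power-of-two value converts directly into a shift amount:
  unsigned Shift = llvm::Log2_32(PowerOfTwoAlign);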
auto reverse(ContainerTy &&C)
Definition: STLExtras.h:419
BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, DebugInfoFinder *DIFinder=nullptr)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
Definition: LoopPeel.cpp:872
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1736
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
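A sketch splitting at the builder's current position and branching into the new block (the block name "cont" is illustrative):
  BasicBlock *ContBB = splitBB(Builder.saveIP(), /*CreateBranch=*/true, "cont");
  Builder.SetInsertPoint(ContBB, ContBB->begin());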
CodeGenOptLevel
Code generation optimization level.
Definition: CodeGen.h:54
bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition: Format.h:125
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition: MathExtras.h:403
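A worked one-liner: divideCeil(10, 4) == 3, handy e.g. for counting the chunks that cover a trip count:
  unsigned NumChunks = llvm::divideCeil(TripCount, ChunkSize); // rounded-up division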
bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
DWARFExpression::Operation Op
void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
ConstantFoldInsertValueInstruction - Attempt to constant fold an insertvalue instruction with the spe...
@ Continue
Definition: DWP.h:21
void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
Definition: CodeMetrics.cpp:71
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:615
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:202
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static const Target * lookupTarget(StringRef Triple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57
unsigned GV_Warp_Size
The default value of maximum number of threads in a worker warp.
Definition: OMPGridValues.h:61