LLVM 22.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
18#include "llvm/ADT/StringRef.h"
28#include "llvm/IR/Attributes.h"
29#include "llvm/IR/BasicBlock.h"
30#include "llvm/IR/CFG.h"
31#include "llvm/IR/CallingConv.h"
32#include "llvm/IR/Constant.h"
33#include "llvm/IR/Constants.h"
34#include "llvm/IR/DIBuilder.h"
37#include "llvm/IR/Function.h"
39#include "llvm/IR/IRBuilder.h"
42#include "llvm/IR/LLVMContext.h"
43#include "llvm/IR/MDBuilder.h"
44#include "llvm/IR/Metadata.h"
46#include "llvm/IR/PassManager.h"
48#include "llvm/IR/Value.h"
61
62#include <cstdint>
63#include <optional>
64
65#define DEBUG_TYPE "openmp-ir-builder"
66
67using namespace llvm;
68using namespace omp;
69
70static cl::opt<bool>
71 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
72 cl::desc("Use optimistic attributes describing "
73 "'as-if' properties of runtime calls."),
74 cl::init(false));
75
77 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
78 cl::desc("Factor for the unroll threshold to account for code "
79 "simplifications still taking place"),
80 cl::init(1.5));
81
82#ifndef NDEBUG
83/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
84/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
85/// an InsertPoint stores the instruction before something is inserted. For
86/// instance, if both point to the same instruction, two IRBuilders alternating
87/// creating instruction will cause the instructions to be interleaved.
90 if (!IP1.isSet() || !IP2.isSet())
91 return false;
92 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
93}
94
96 // Valid ordered/unordered and base algorithm combinations.
97 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
98 case OMPScheduleType::UnorderedStaticChunked:
99 case OMPScheduleType::UnorderedStatic:
100 case OMPScheduleType::UnorderedDynamicChunked:
101 case OMPScheduleType::UnorderedGuidedChunked:
102 case OMPScheduleType::UnorderedRuntime:
103 case OMPScheduleType::UnorderedAuto:
104 case OMPScheduleType::UnorderedTrapezoidal:
105 case OMPScheduleType::UnorderedGreedy:
106 case OMPScheduleType::UnorderedBalanced:
107 case OMPScheduleType::UnorderedGuidedIterativeChunked:
108 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
109 case OMPScheduleType::UnorderedSteal:
110 case OMPScheduleType::UnorderedStaticBalancedChunked:
111 case OMPScheduleType::UnorderedGuidedSimd:
112 case OMPScheduleType::UnorderedRuntimeSimd:
113 case OMPScheduleType::OrderedStaticChunked:
114 case OMPScheduleType::OrderedStatic:
115 case OMPScheduleType::OrderedDynamicChunked:
116 case OMPScheduleType::OrderedGuidedChunked:
117 case OMPScheduleType::OrderedRuntime:
118 case OMPScheduleType::OrderedAuto:
119 case OMPScheduleType::OrderdTrapezoidal:
120 case OMPScheduleType::NomergeUnorderedStaticChunked:
121 case OMPScheduleType::NomergeUnorderedStatic:
122 case OMPScheduleType::NomergeUnorderedDynamicChunked:
123 case OMPScheduleType::NomergeUnorderedGuidedChunked:
124 case OMPScheduleType::NomergeUnorderedRuntime:
125 case OMPScheduleType::NomergeUnorderedAuto:
126 case OMPScheduleType::NomergeUnorderedTrapezoidal:
127 case OMPScheduleType::NomergeUnorderedGreedy:
128 case OMPScheduleType::NomergeUnorderedBalanced:
129 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
130 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
131 case OMPScheduleType::NomergeUnorderedSteal:
132 case OMPScheduleType::NomergeOrderedStaticChunked:
133 case OMPScheduleType::NomergeOrderedStatic:
134 case OMPScheduleType::NomergeOrderedDynamicChunked:
135 case OMPScheduleType::NomergeOrderedGuidedChunked:
136 case OMPScheduleType::NomergeOrderedRuntime:
137 case OMPScheduleType::NomergeOrderedAuto:
138 case OMPScheduleType::NomergeOrderedTrapezoidal:
139 break;
140 default:
141 return false;
142 }
143
144 // Must not set both monotonicity modifiers at the same time.
145 OMPScheduleType MonotonicityFlags =
146 SchedType & OMPScheduleType::MonotonicityMask;
147 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
148 return false;
149
150 return true;
151}
152#endif
153
154/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
155/// debug location to the last instruction in the specified basic block if the
156/// insert point points to the end of the block.
159 Builder.restoreIP(IP);
160 llvm::BasicBlock *BB = Builder.GetInsertBlock();
161 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
162 if (!BB->empty() && I == BB->end())
163 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
164}
165
166static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
167 if (T.isAMDGPU()) {
168 StringRef Features =
169 Kernel->getFnAttribute("target-features").getValueAsString();
170 if (Features.count("+wavefrontsize64"))
173 }
174 if (T.isNVPTX())
176 if (T.isSPIRV())
178 llvm_unreachable("No grid value available for this architecture!");
179}
180
181/// Determine which scheduling algorithm to use, determined from schedule clause
182/// arguments.
183static OMPScheduleType
184getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
185 bool HasSimdModifier) {
186 // Currently, the default schedule it static.
187 switch (ClauseKind) {
188 case OMP_SCHEDULE_Default:
189 case OMP_SCHEDULE_Static:
190 return HasChunks ? OMPScheduleType::BaseStaticChunked
191 : OMPScheduleType::BaseStatic;
192 case OMP_SCHEDULE_Dynamic:
193 return OMPScheduleType::BaseDynamicChunked;
194 case OMP_SCHEDULE_Guided:
195 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
196 : OMPScheduleType::BaseGuidedChunked;
197 case OMP_SCHEDULE_Auto:
199 case OMP_SCHEDULE_Runtime:
200 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
201 : OMPScheduleType::BaseRuntime;
202 }
203 llvm_unreachable("unhandled schedule clause argument");
204}
205
206/// Adds ordering modifier flags to schedule type.
207static OMPScheduleType
209 bool HasOrderedClause) {
210 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
211 OMPScheduleType::None &&
212 "Must not have ordering nor monotonicity flags already set");
213
214 OMPScheduleType OrderingModifier = HasOrderedClause
215 ? OMPScheduleType::ModifierOrdered
216 : OMPScheduleType::ModifierUnordered;
217 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
218
219 // Unsupported combinations
220 if (OrderingScheduleType ==
221 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
222 return OMPScheduleType::OrderedGuidedChunked;
223 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
224 OMPScheduleType::ModifierOrdered))
225 return OMPScheduleType::OrderedRuntime;
226
227 return OrderingScheduleType;
228}
229
230/// Adds monotonicity modifier flags to schedule type.
231static OMPScheduleType
233 bool HasSimdModifier, bool HasMonotonic,
234 bool HasNonmonotonic, bool HasOrderedClause) {
235 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
236 OMPScheduleType::None &&
237 "Must not have monotonicity flags already set");
238 assert((!HasMonotonic || !HasNonmonotonic) &&
239 "Monotonic and Nonmonotonic are contradicting each other");
240
241 if (HasMonotonic) {
242 return ScheduleType | OMPScheduleType::ModifierMonotonic;
243 } else if (HasNonmonotonic) {
244 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
245 } else {
246 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
247 // If the static schedule kind is specified or if the ordered clause is
248 // specified, and if the nonmonotonic modifier is not specified, the
249 // effect is as if the monotonic modifier is specified. Otherwise, unless
250 // the monotonic modifier is specified, the effect is as if the
251 // nonmonotonic modifier is specified.
252 OMPScheduleType BaseScheduleType =
253 ScheduleType & ~OMPScheduleType::ModifierMask;
254 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
255 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
256 HasOrderedClause) {
257 // The monotonic is used by default in openmp runtime library, so no need
258 // to set it.
259 return ScheduleType;
260 } else {
261 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
262 }
263 }
264}
265
266/// Determine the schedule type using schedule and ordering clause arguments.
267static OMPScheduleType
268computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
269 bool HasSimdModifier, bool HasMonotonicModifier,
270 bool HasNonmonotonicModifier, bool HasOrderedClause) {
271 OMPScheduleType BaseSchedule =
272 getOpenMPBaseScheduleType(ClauseKind, HasChunks, HasSimdModifier);
273 OMPScheduleType OrderedSchedule =
274 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
276 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
277 HasNonmonotonicModifier, HasOrderedClause);
278
280 return Result;
281}
282
283/// Make \p Source branch to \p Target.
284///
285/// Handles two situations:
286/// * \p Source already has an unconditional branch.
287/// * \p Source is a degenerate block (no terminator because the BB is
288/// the current head of the IR construction).
290 if (Instruction *Term = Source->getTerminator()) {
291 auto *Br = cast<BranchInst>(Term);
292 assert(!Br->isConditional() &&
293 "BB's terminator must be an unconditional branch (or degenerate)");
294 BasicBlock *Succ = Br->getSuccessor(0);
295 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
296 Br->setSuccessor(0, Target);
297 return;
298 }
299
300 auto *NewBr = BranchInst::Create(Target, Source);
301 NewBr->setDebugLoc(DL);
302}
303
304void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
305 bool CreateBranch, DebugLoc DL) {
306 assert(New->getFirstInsertionPt() == New->begin() &&
307 "Target BB must not have PHI nodes");
308
309 // Move instructions to new block.
310 BasicBlock *Old = IP.getBlock();
311 // If the `Old` block is empty then there are no instructions to move. But in
312 // the new debug scheme, it could have trailing debug records which will be
313 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
314 // reasons:
315 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
316 // 2. Even if `New` is not empty, the rationale to move those records to `New`
317 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
318 // assumes that `Old` is optimized out and is going away. This is not the case
319 // here. The `Old` block is still being used e.g. a branch instruction is
320 // added to it later in this function.
321 // So we call `BasicBlock::splice` only when `Old` is not empty.
322 if (!Old->empty())
323 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
324
325 if (CreateBranch) {
326 auto *NewBr = BranchInst::Create(New, Old);
327 NewBr->setDebugLoc(DL);
328 }
329}
330
331void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
332 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
333 BasicBlock *Old = Builder.GetInsertBlock();
334
335 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
336 if (CreateBranch)
337 Builder.SetInsertPoint(Old->getTerminator());
338 else
339 Builder.SetInsertPoint(Old);
340
341 // SetInsertPoint also updates the Builder's debug location, but we want to
342 // keep the one the Builder was configured to use.
343 Builder.SetCurrentDebugLocation(DebugLoc);
344}
345
346BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
347 DebugLoc DL, llvm::Twine Name) {
348 BasicBlock *Old = IP.getBlock();
350 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
351 Old->getParent(), Old->getNextNode());
352 spliceBB(IP, New, CreateBranch, DL);
353 New->replaceSuccessorsPhiUsesWith(Old, New);
354 return New;
355}
356
357BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
358 llvm::Twine Name) {
359 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
360 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
361 if (CreateBranch)
362 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
363 else
364 Builder.SetInsertPoint(Builder.GetInsertBlock());
365 // SetInsertPoint also updates the Builder's debug location, but we want to
366 // keep the one the Builder was configured to use.
367 Builder.SetCurrentDebugLocation(DebugLoc);
368 return New;
369}
370
371BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
372 llvm::Twine Name) {
373 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
374 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
375 if (CreateBranch)
376 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
377 else
378 Builder.SetInsertPoint(Builder.GetInsertBlock());
379 // SetInsertPoint also updates the Builder's debug location, but we want to
380 // keep the one the Builder was configured to use.
381 Builder.SetCurrentDebugLocation(DebugLoc);
382 return New;
383}
384
385BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
386 llvm::Twine Suffix) {
387 BasicBlock *Old = Builder.GetInsertBlock();
388 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
389}
390
391// This function creates a fake integer value and a fake use for the integer
392// value. It returns the fake value created. This is useful in modeling the
393// extra arguments to the outlined functions.
395 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
397 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
398 const Twine &Name = "", bool AsPtr = true) {
399 Builder.restoreIP(OuterAllocaIP);
400 Instruction *FakeVal;
401 AllocaInst *FakeValAddr =
402 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
403 ToBeDeleted.push_back(FakeValAddr);
404
405 if (AsPtr) {
406 FakeVal = FakeValAddr;
407 } else {
408 FakeVal =
409 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
410 ToBeDeleted.push_back(FakeVal);
411 }
412
413 // Generate a fake use of this value
414 Builder.restoreIP(InnerAllocaIP);
415 Instruction *UseFakeVal;
416 if (AsPtr) {
417 UseFakeVal =
418 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
419 } else {
420 UseFakeVal =
421 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
422 }
423 ToBeDeleted.push_back(UseFakeVal);
424 return FakeVal;
425}
426
427//===----------------------------------------------------------------------===//
428// OpenMPIRBuilderConfig
429//===----------------------------------------------------------------------===//
430
431namespace {
433/// Values for bit flags for marking which requires clauses have been used.
434enum OpenMPOffloadingRequiresDirFlags {
435 /// flag undefined.
436 OMP_REQ_UNDEFINED = 0x000,
437 /// no requires directive present.
438 OMP_REQ_NONE = 0x001,
439 /// reverse_offload clause.
440 OMP_REQ_REVERSE_OFFLOAD = 0x002,
441 /// unified_address clause.
442 OMP_REQ_UNIFIED_ADDRESS = 0x004,
443 /// unified_shared_memory clause.
444 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
445 /// dynamic_allocators clause.
446 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
447 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
448};
449
450} // anonymous namespace
451
452OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
453 : RequiresFlags(OMP_REQ_UNDEFINED) {}
454
455OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
456 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
457 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
458 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
459 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
460 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
461 RequiresFlags(OMP_REQ_UNDEFINED) {
462 if (HasRequiresReverseOffload)
463 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
464 if (HasRequiresUnifiedAddress)
465 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
466 if (HasRequiresUnifiedSharedMemory)
467 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
468 if (HasRequiresDynamicAllocators)
469 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
470}
471
472bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
473 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
474}
475
476bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
477 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
478}
479
480bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
481 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
482}
483
484bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
485 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
486}
487
488int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
489 return hasRequiresFlags() ? RequiresFlags
490 : static_cast<int64_t>(OMP_REQ_NONE);
491}
492
493void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
494 if (Value)
495 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
496 else
497 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
498}
499
500void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
501 if (Value)
502 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
503 else
504 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
505}
506
507void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
508 if (Value)
509 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
510 else
511 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
512}
513
514void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
515 if (Value)
516 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
517 else
518 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
519}
520
521//===----------------------------------------------------------------------===//
522// OpenMPIRBuilder
523//===----------------------------------------------------------------------===//
524
525void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
526 IRBuilderBase &Builder,
527 SmallVector<Value *> &ArgsVector) {
528 Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
529 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
530 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
531 constexpr size_t MaxDim = 3;
532 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
533 Value *Flags = Builder.getInt64(KernelArgs.HasNoWait);
534
535 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
536
537 Value *NumTeams3D =
538 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
539 Value *NumThreads3D =
540 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
541 for (unsigned I :
542 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
543 NumTeams3D =
544 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
545 for (unsigned I :
546 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
547 NumThreads3D =
548 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
549
550 ArgsVector = {Version,
551 PointerNum,
552 KernelArgs.RTArgs.BasePointersArray,
553 KernelArgs.RTArgs.PointersArray,
554 KernelArgs.RTArgs.SizesArray,
555 KernelArgs.RTArgs.MapTypesArray,
556 KernelArgs.RTArgs.MapNamesArray,
557 KernelArgs.RTArgs.MappersArray,
558 KernelArgs.NumIterations,
559 Flags,
560 NumTeams3D,
561 NumThreads3D,
562 KernelArgs.DynCGGroupMem};
563}
564
565void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
566 LLVMContext &Ctx = Fn.getContext();
567
568 // Get the function's current attributes.
569 auto Attrs = Fn.getAttributes();
570 auto FnAttrs = Attrs.getFnAttrs();
571 auto RetAttrs = Attrs.getRetAttrs();
573 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
574 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
575
576 // Add AS to FnAS while taking special care with integer extensions.
577 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
578 bool Param = true) -> void {
579 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
580 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
581 if (HasSignExt || HasZeroExt) {
582 assert(AS.getNumAttributes() == 1 &&
583 "Currently not handling extension attr combined with others.");
584 if (Param) {
585 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
586 FnAS = FnAS.addAttribute(Ctx, AK);
587 } else if (auto AK =
588 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
589 FnAS = FnAS.addAttribute(Ctx, AK);
590 } else {
591 FnAS = FnAS.addAttributes(Ctx, AS);
592 }
593 };
594
595#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
596#include "llvm/Frontend/OpenMP/OMPKinds.def"
597
598 // Add attributes to the function declaration.
599 switch (FnID) {
600#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
601 case Enum: \
602 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
603 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
604 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
605 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
606 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
607 break;
608#include "llvm/Frontend/OpenMP/OMPKinds.def"
609 default:
610 // Attributes are optional.
611 break;
612 }
613}
614
616OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
617 FunctionType *FnTy = nullptr;
618 Function *Fn = nullptr;
619
620 // Try to find the declation in the module first.
621 switch (FnID) {
622#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
623 case Enum: \
624 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
625 IsVarArg); \
626 Fn = M.getFunction(Str); \
627 break;
628#include "llvm/Frontend/OpenMP/OMPKinds.def"
629 }
630
631 if (!Fn) {
632 // Create a new declaration if we need one.
633 switch (FnID) {
634#define OMP_RTL(Enum, Str, ...) \
635 case Enum: \
636 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
637 break;
638#include "llvm/Frontend/OpenMP/OMPKinds.def"
639 }
640
641 // Add information if the runtime function takes a callback function
642 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
643 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
644 LLVMContext &Ctx = Fn->getContext();
645 MDBuilder MDB(Ctx);
646 // Annotate the callback behavior of the runtime function:
647 // - The callback callee is argument number 2 (microtask).
648 // - The first two arguments of the callback callee are unknown (-1).
649 // - All variadic arguments to the runtime function are passed to the
650 // callback callee.
651 Fn->addMetadata(
652 LLVMContext::MD_callback,
653 *MDNode::get(Ctx, {MDB.createCallbackEncoding(
654 2, {-1, -1}, /* VarArgsArePassed */ true)}));
655 }
656 }
657
658 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
659 << " with type " << *Fn->getFunctionType() << "\n");
660 addAttributes(FnID, *Fn);
661
662 } else {
663 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
664 << " with type " << *Fn->getFunctionType() << "\n");
665 }
666
667 assert(Fn && "Failed to create OpenMP runtime function");
668
669 return {FnTy, Fn};
670}
671
672Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
673 FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
674 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
675 assert(Fn && "Failed to create OpenMP runtime function pointer");
676 return Fn;
677}
678
679void OpenMPIRBuilder::initialize() { initializeTypes(M); }
680
683 BasicBlock &EntryBlock = Function->getEntryBlock();
684 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
685
686 // Loop over blocks looking for constant allocas, skipping the entry block
687 // as any allocas there are already in the desired location.
688 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
689 Block++) {
690 for (auto Inst = Block->getReverseIterator()->begin();
691 Inst != Block->getReverseIterator()->end();) {
693 Inst++;
695 continue;
696 AllocaInst->moveBeforePreserving(MoveLocInst);
697 } else {
698 Inst++;
699 }
700 }
701 }
702}
703
704void OpenMPIRBuilder::finalize(Function *Fn) {
705 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
707 SmallVector<OutlineInfo, 16> DeferredOutlines;
708 for (OutlineInfo &OI : OutlineInfos) {
709 // Skip functions that have not finalized yet; may happen with nested
710 // function generation.
711 if (Fn && OI.getFunction() != Fn) {
712 DeferredOutlines.push_back(OI);
713 continue;
714 }
715
716 ParallelRegionBlockSet.clear();
717 Blocks.clear();
718 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
719
720 Function *OuterFn = OI.getFunction();
721 CodeExtractorAnalysisCache CEAC(*OuterFn);
722 // If we generate code for the target device, we need to allocate
723 // struct for aggregate params in the device default alloca address space.
724 // OpenMP runtime requires that the params of the extracted functions are
725 // passed as zero address space pointers. This flag ensures that
726 // CodeExtractor generates correct code for extracted functions
727 // which are used by OpenMP runtime.
728 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
729 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
730 /* AggregateArgs */ true,
731 /* BlockFrequencyInfo */ nullptr,
732 /* BranchProbabilityInfo */ nullptr,
733 /* AssumptionCache */ nullptr,
734 /* AllowVarArgs */ true,
735 /* AllowAlloca */ true,
736 /* AllocaBlock*/ OI.OuterAllocaBB,
737 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
738
739 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
740 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
741 << " Exit: " << OI.ExitBB->getName() << "\n");
742 assert(Extractor.isEligible() &&
743 "Expected OpenMP outlining to be possible!");
744
745 for (auto *V : OI.ExcludeArgsFromAggregate)
746 Extractor.excludeArgFromAggregate(V);
747
748 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
749
750 // Forward target-cpu, target-features attributes to the outlined function.
751 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
752 if (TargetCpuAttr.isStringAttribute())
753 OutlinedFn->addFnAttr(TargetCpuAttr);
754
755 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
756 if (TargetFeaturesAttr.isStringAttribute())
757 OutlinedFn->addFnAttr(TargetFeaturesAttr);
758
759 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
760 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
761 assert(OutlinedFn->getReturnType()->isVoidTy() &&
762 "OpenMP outlined functions should not return a value!");
763
764 // For compability with the clang CG we move the outlined function after the
765 // one with the parallel region.
766 OutlinedFn->removeFromParent();
767 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
768
769 // Remove the artificial entry introduced by the extractor right away, we
770 // made our own entry block after all.
771 {
772 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
773 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
774 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
775 // Move instructions from the to-be-deleted ArtificialEntry to the entry
776 // basic block of the parallel region. CodeExtractor generates
777 // instructions to unwrap the aggregate argument and may sink
778 // allocas/bitcasts for values that are solely used in the outlined region
779 // and do not escape.
780 assert(!ArtificialEntry.empty() &&
781 "Expected instructions to add in the outlined region entry");
782 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
783 End = ArtificialEntry.rend();
784 It != End;) {
785 Instruction &I = *It;
786 It++;
787
788 if (I.isTerminator()) {
789 // Absorb any debug value that terminator may have
790 if (OI.EntryBB->getTerminator())
791 OI.EntryBB->getTerminator()->adoptDbgRecords(
792 &ArtificialEntry, I.getIterator(), false);
793 continue;
794 }
795
796 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
797 }
798
799 OI.EntryBB->moveBefore(&ArtificialEntry);
800 ArtificialEntry.eraseFromParent();
801 }
802 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
803 assert(OutlinedFn && OutlinedFn->hasNUses(1));
804
805 // Run a user callback, e.g. to add attributes.
806 if (OI.PostOutlineCB)
807 OI.PostOutlineCB(*OutlinedFn);
808 }
809
810 // Remove work items that have been completed.
811 OutlineInfos = std::move(DeferredOutlines);
812
813 // The createTarget functions embeds user written code into
814 // the target region which may inject allocas which need to
815 // be moved to the entry block of our target or risk malformed
816 // optimisations by later passes, this is only relevant for
817 // the device pass which appears to be a little more delicate
818 // when it comes to optimisations (however, we do not block on
819 // that here, it's up to the inserter to the list to do so).
820 // This notbaly has to occur after the OutlinedInfo candidates
821 // have been extracted so we have an end product that will not
822 // be implicitly adversely affected by any raises unless
823 // intentionally appended to the list.
824 // NOTE: This only does so for ConstantData, it could be extended
825 // to ConstantExpr's with further effort, however, they should
826 // largely be folded when they get here. Extending it to runtime
827 // defined/read+writeable allocation sizes would be non-trivial
828 // (need to factor in movement of any stores to variables the
829 // allocation size depends on, as well as the usual loads,
830 // otherwise it'll yield the wrong result after movement) and
831 // likely be more suitable as an LLVM optimisation pass.
832 for (Function *F : ConstantAllocaRaiseCandidates)
834
835 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
836 [](EmitMetadataErrorKind Kind,
837 const TargetRegionEntryInfo &EntryInfo) -> void {
838 errs() << "Error of kind: " << Kind
839 << " when emitting offload entries and metadata during "
840 "OMPIRBuilder finalization \n";
841 };
842
843 if (!OffloadInfoManager.empty())
844 createOffloadEntriesAndInfoMetadata(ErrorReportFn);
845
846 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
847 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
848 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
849 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
850 }
851
852 IsFinalized = true;
853}
854
855bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
856
857OpenMPIRBuilder::~OpenMPIRBuilder() {
858 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
859}
860
861GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
862 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
863 auto *GV =
864 new GlobalVariable(M, I32Ty,
865 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
866 ConstantInt::get(I32Ty, Value), Name);
867 GV->setVisibility(GlobalValue::HiddenVisibility);
868
869 return GV;
870}
871
872void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
873 if (List.empty())
874 return;
875
876 // Convert List to what ConstantArray needs.
878 UsedArray.resize(List.size());
879 for (unsigned I = 0, E = List.size(); I != E; ++I)
881 cast<Constant>(&*List[I]), Builder.getPtrTy());
882
883 if (UsedArray.empty())
884 return;
885 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
886
887 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
888 ConstantArray::get(ATy, UsedArray), Name);
889
890 GV->setSection("llvm.metadata");
891}
892
894OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
896 auto *Int8Ty = Builder.getInt8Ty();
897 auto *GVMode = new GlobalVariable(
898 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
899 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
900 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
901 return GVMode;
902}
903
904Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
905 uint32_t SrcLocStrSize,
906 IdentFlag LocFlags,
907 unsigned Reserve2Flags) {
908 // Enable "C-mode".
909 LocFlags |= OMP_IDENT_FLAG_KMPC;
910
911 Constant *&Ident =
912 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
913 if (!Ident) {
915 Constant *IdentData[] = {I32Null,
916 ConstantInt::get(Int32, uint32_t(LocFlags)),
917 ConstantInt::get(Int32, Reserve2Flags),
918 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
919
920 size_t SrcLocStrArgIdx = 4;
921 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
923 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
924 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
925 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
926 Constant *Initializer =
927 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
928
929 // Look for existing encoding of the location + flags, not needed but
930 // minimizes the difference to the existing solution while we transition.
931 for (GlobalVariable &GV : M.globals())
932 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
933 if (GV.getInitializer() == Initializer)
934 Ident = &GV;
935
936 if (!Ident) {
937 auto *GV = new GlobalVariable(
938 M, OpenMPIRBuilder::Ident,
939 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
941 M.getDataLayout().getDefaultGlobalsAddressSpace());
942 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
943 GV->setAlignment(Align(8));
944 Ident = GV;
945 }
946 }
947
949}
950
951Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
952 uint32_t &SrcLocStrSize) {
953 SrcLocStrSize = LocStr.size();
954 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
955 if (!SrcLocStr) {
956 Constant *Initializer =
957 ConstantDataArray::getString(M.getContext(), LocStr);
958
959 // Look for existing encoding of the location, not needed but minimizes the
960 // difference to the existing solution while we transition.
961 for (GlobalVariable &GV : M.globals())
962 if (GV.isConstant() && GV.hasInitializer() &&
963 GV.getInitializer() == Initializer)
964 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
965
966 SrcLocStr = Builder.CreateGlobalString(
967 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
968 &M);
969 }
970 return SrcLocStr;
971}
972
973Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
974 StringRef FileName,
975 unsigned Line, unsigned Column,
976 uint32_t &SrcLocStrSize) {
977 SmallString<128> Buffer;
978 Buffer.push_back(';');
979 Buffer.append(FileName);
980 Buffer.push_back(';');
981 Buffer.append(FunctionName);
982 Buffer.push_back(';');
983 Buffer.append(std::to_string(Line));
984 Buffer.push_back(';');
985 Buffer.append(std::to_string(Column));
986 Buffer.push_back(';');
987 Buffer.push_back(';');
988 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
989}
990
991Constant *
992OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
993 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
994 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
995}
996
997Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
998 uint32_t &SrcLocStrSize,
999 Function *F) {
1000 DILocation *DIL = DL.get();
1001 if (!DIL)
1002 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1003 StringRef FileName = M.getName();
1004 if (DIFile *DIF = DIL->getFile())
1005 if (std::optional<StringRef> Source = DIF->getSource())
1006 FileName = *Source;
1007 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1008 if (Function.empty() && F)
1009 Function = F->getName();
1010 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1011 DIL->getColumn(), SrcLocStrSize);
1012}
1013
1014Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
1015 uint32_t &SrcLocStrSize) {
1016 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1017 Loc.IP.getBlock()->getParent());
1018}
1019
1020Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
1021 return Builder.CreateCall(
1022 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1023 "omp_global_thread_num");
1024}
1025
1026OpenMPIRBuilder::InsertPointOrErrorTy
1027OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
1028 bool ForceSimpleCall, bool CheckCancelFlag) {
1029 if (!updateToLocation(Loc))
1030 return Loc.IP;
1031
1032 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1033 // __kmpc_barrier(loc, thread_id);
1034
1035 IdentFlag BarrierLocFlags;
1036 switch (Kind) {
1037 case OMPD_for:
1038 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1039 break;
1040 case OMPD_sections:
1041 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1042 break;
1043 case OMPD_single:
1044 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1045 break;
1046 case OMPD_barrier:
1047 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1048 break;
1049 default:
1050 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1051 break;
1052 }
1053
1054 uint32_t SrcLocStrSize;
1055 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1056 Value *Args[] = {
1057 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1058 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1059
1060 // If we are in a cancellable parallel region, barriers are cancellation
1061 // points.
1062 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1063 bool UseCancelBarrier =
1064 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1065
1066 Value *Result =
1067 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
1068 UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
1069 : OMPRTL___kmpc_barrier),
1070 Args);
1071
1072 if (UseCancelBarrier && CheckCancelFlag)
1073 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1074 return Err;
1075
1076 return Builder.saveIP();
1077}
1078
1079OpenMPIRBuilder::InsertPointOrErrorTy
1080OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
1081 Value *IfCondition,
1082 omp::Directive CanceledDirective) {
1083 if (!updateToLocation(Loc))
1084 return Loc.IP;
1085
1086 // LLVM utilities like blocks with terminators.
1087 auto *UI = Builder.CreateUnreachable();
1088
1089 Instruction *ThenTI = UI, *ElseTI = nullptr;
1090 if (IfCondition)
1091 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1092 Builder.SetInsertPoint(ThenTI);
1093
1094 Value *CancelKind = nullptr;
1095 switch (CanceledDirective) {
1096#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1097 case DirectiveEnum: \
1098 CancelKind = Builder.getInt32(Value); \
1099 break;
1100#include "llvm/Frontend/OpenMP/OMPKinds.def"
1101 default:
1102 llvm_unreachable("Unknown cancel kind!");
1103 }
1104
1105 uint32_t SrcLocStrSize;
1106 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1107 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1108 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1109 Value *Result = Builder.CreateCall(
1110 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1111 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1112 if (CanceledDirective == OMPD_parallel) {
1113 IRBuilder<>::InsertPointGuard IPG(Builder);
1114 Builder.restoreIP(IP);
1115 return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
1116 omp::Directive::OMPD_unknown,
1117 /* ForceSimpleCall */ false,
1118 /* CheckCancelFlag */ false)
1119 .takeError();
1120 }
1121 return Error::success();
1122 };
1123
1124 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1125 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1126 return Err;
1127
1128 // Update the insertion point and remove the terminator we introduced.
1129 Builder.SetInsertPoint(UI->getParent());
1130 UI->eraseFromParent();
1131
1132 return Builder.saveIP();
1133}
1134
1135OpenMPIRBuilder::InsertPointOrErrorTy
1136OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
1137 omp::Directive CanceledDirective) {
1138 if (!updateToLocation(Loc))
1139 return Loc.IP;
1140
1141 // LLVM utilities like blocks with terminators.
1142 auto *UI = Builder.CreateUnreachable();
1143 Builder.SetInsertPoint(UI);
1144
1145 Value *CancelKind = nullptr;
1146 switch (CanceledDirective) {
1147#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1148 case DirectiveEnum: \
1149 CancelKind = Builder.getInt32(Value); \
1150 break;
1151#include "llvm/Frontend/OpenMP/OMPKinds.def"
1152 default:
1153 llvm_unreachable("Unknown cancel kind!");
1154 }
1155
1156 uint32_t SrcLocStrSize;
1157 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1158 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1159 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1160 Value *Result = Builder.CreateCall(
1161 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1162 auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
1163 if (CanceledDirective == OMPD_parallel) {
1164 IRBuilder<>::InsertPointGuard IPG(Builder);
1165 Builder.restoreIP(IP);
1166 return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
1167 omp::Directive::OMPD_unknown,
1168 /* ForceSimpleCall */ false,
1169 /* CheckCancelFlag */ false)
1170 .takeError();
1171 }
1172 return Error::success();
1173 };
1174
1175 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1176 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
1177 return Err;
1178
1179 // Update the insertion point and remove the terminator we introduced.
1180 Builder.SetInsertPoint(UI->getParent());
1181 UI->eraseFromParent();
1182
1183 return Builder.saveIP();
1184}
1185
1186OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
1187 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1188 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1189 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1190 if (!updateToLocation(Loc))
1191 return Loc.IP;
1192
1193 Builder.restoreIP(AllocaIP);
1194 auto *KernelArgsPtr =
1195 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1196 updateToLocation(Loc);
1197
1198 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1199 llvm::Value *Arg =
1200 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1201 Builder.CreateAlignedStore(
1202 KernelArgs[I], Arg,
1203 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1204 }
1205
1206 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1207 NumThreads, HostPtr, KernelArgsPtr};
1208
1209 Return = Builder.CreateCall(
1210 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1211 OffloadingArgs);
1212
1213 return Builder.saveIP();
1214}
1215
1216OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
1217 const LocationDescription &Loc, Value *OutlinedFnID,
1218 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1219 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1220
1221 if (!updateToLocation(Loc))
1222 return Loc.IP;
1223
1224 // On top of the arrays that were filled up, the target offloading call
1225 // takes as arguments the device id as well as the host pointer. The host
1226 // pointer is used by the runtime library to identify the current target
1227 // region, so it only has to be unique and not necessarily point to
1228 // anything. It could be the pointer to the outlined function that
1229 // implements the target region, but we aren't using that so that the
1230 // compiler doesn't need to keep that, and could therefore inline the host
1231 // function if proven worthwhile during optimization.
1232
1233 // From this point on, we need to have an ID of the target region defined.
1234 assert(OutlinedFnID && "Invalid outlined function ID!");
1235 (void)OutlinedFnID;
1236
1237 // Return value of the runtime offloading call.
1238 Value *Return = nullptr;
1239
1240 // Arguments for the target kernel.
1241 SmallVector<Value *> ArgsVector;
1242 getKernelArgsVector(Args, Builder, ArgsVector);
1243
1244 // The target region is an outlined function launched by the runtime
1245 // via calls to __tgt_target_kernel().
1246 //
1247 // Note that on the host and CPU targets, the runtime implementation of
1248 // these calls simply call the outlined function without forking threads.
1249 // The outlined functions themselves have runtime calls to
1250 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1251 // the compiler in emitTeamsCall() and emitParallelCall().
1252 //
1253 // In contrast, on the NVPTX target, the implementation of
1254 // __tgt_target_teams() launches a GPU kernel with the requested number
1255 // of teams and threads so no additional calls to the runtime are required.
1256 // Check the error code and execute the host version if required.
1257 Builder.restoreIP(emitTargetKernel(
1258 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1259 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1260
1261 BasicBlock *OffloadFailedBlock =
1262 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1263 BasicBlock *OffloadContBlock =
1264 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1265 Value *Failed = Builder.CreateIsNotNull(Return);
1266 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1267
1268 auto CurFn = Builder.GetInsertBlock()->getParent();
1269 emitBlock(OffloadFailedBlock, CurFn);
1270 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1271 if (!AfterIP)
1272 return AfterIP.takeError();
1273 Builder.restoreIP(*AfterIP);
1274 emitBranch(OffloadContBlock);
1275 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1276 return Builder.saveIP();
1277}
1278
1279Error OpenMPIRBuilder::emitCancelationCheckImpl(
1280 Value *CancelFlag, omp::Directive CanceledDirective,
1281 FinalizeCallbackTy ExitCB) {
1282 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1283 "Unexpected cancellation!");
1284
1285 // For a cancel barrier we create two new blocks.
1286 BasicBlock *BB = Builder.GetInsertBlock();
1287 BasicBlock *NonCancellationBlock;
1288 if (Builder.GetInsertPoint() == BB->end()) {
1289 // TODO: This branch will not be needed once we moved to the
1290 // OpenMPIRBuilder codegen completely.
1291 NonCancellationBlock = BasicBlock::Create(
1292 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1293 } else {
1294 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1296 Builder.SetInsertPoint(BB);
1297 }
1298 BasicBlock *CancellationBlock = BasicBlock::Create(
1299 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1300
1301 // Jump to them based on the return value.
1302 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1303 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1304 /* TODO weight */ nullptr, nullptr);
1305
1306 // From the cancellation block we finalize all variables and go to the
1307 // post finalization block that is known to the FiniCB callback.
1308 Builder.SetInsertPoint(CancellationBlock);
1309 if (ExitCB)
1310 if (Error Err = ExitCB(Builder.saveIP()))
1311 return Err;
1312 auto &FI = FinalizationStack.back();
1313 if (Error Err = FI.FiniCB(Builder.saveIP()))
1314 return Err;
1315
1316 // The continuation block is where code generation continues.
1317 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1318 return Error::success();
1319}
1320
1321// Callback used to create OpenMP runtime calls to support
1322// omp parallel clause for the device.
1323// We need to use this callback to replace call to the OutlinedFn in OuterFn
1324// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
1326 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1327 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1328 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1329 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1330 // Add some known attributes.
1331 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1332 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1333 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1334 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1335 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1336 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1337
1338 assert(OutlinedFn.arg_size() >= 2 &&
1339 "Expected at least tid and bounded tid as arguments");
1340 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1341
1342 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1343 assert(CI && "Expected call instruction to outlined function");
1344 CI->getParent()->setName("omp_parallel");
1345
1346 Builder.SetInsertPoint(CI);
1347 Type *PtrTy = OMPIRBuilder->VoidPtr;
1348 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1349
1350 // Add alloca for kernel args
1351 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1352 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1353 AllocaInst *ArgsAlloca =
1354 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1355 Value *Args = ArgsAlloca;
1356 // Add address space cast if array for storing arguments is not allocated
1357 // in address space 0
1358 if (ArgsAlloca->getAddressSpace())
1359 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1360 Builder.restoreIP(CurrentIP);
1361
1362 // Store captured vars which are used by kmpc_parallel_51
1363 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1364 Value *V = *(CI->arg_begin() + 2 + Idx);
1365 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1366 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1367 Builder.CreateStore(V, StoreAddress);
1368 }
1369
1370 Value *Cond =
1371 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1372 : Builder.getInt32(1);
1373
1374 // Build kmpc_parallel_51 call
1375 Value *Parallel51CallArgs[] = {
1376 /* identifier*/ Ident,
1377 /* global thread num*/ ThreadID,
1378 /* if expression */ Cond,
1379 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1380 /* Proc bind */ Builder.getInt32(-1),
1381 /* outlined function */ &OutlinedFn,
1382 /* wrapper function */ NullPtrValue,
1383 /* arguments of the outlined funciton*/ Args,
1384 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1385
1386 FunctionCallee RTLFn =
1387 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1388
1389 Builder.CreateCall(RTLFn, Parallel51CallArgs);
1390
1391 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1392 << *Builder.GetInsertBlock()->getParent() << "\n");
1393
1394 // Initialize the local TID stack location with the argument value.
1395 Builder.SetInsertPoint(PrivTID);
1396 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1397 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1398 PrivTIDAddr);
1399
1400 // Remove redundant call to the outlined function.
1401 CI->eraseFromParent();
1402
1403 for (Instruction *I : ToBeDeleted) {
1404 I->eraseFromParent();
1405 }
1406}
1407
1408// Callback used to create OpenMP runtime calls to support
1409// omp parallel clause for the host.
1410// We need to use this callback to replace call to the OutlinedFn in OuterFn
1411// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1412static void
1413hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
1414 Function *OuterFn, Value *Ident, Value *IfCondition,
1415 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1416 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1417 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1418 FunctionCallee RTLFn;
1419 if (IfCondition) {
1420 RTLFn =
1421 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1422 } else {
1423 RTLFn =
1424 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1425 }
1426 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1427 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1428 LLVMContext &Ctx = F->getContext();
1429 MDBuilder MDB(Ctx);
1430 // Annotate the callback behavior of the __kmpc_fork_call:
1431 // - The callback callee is argument number 2 (microtask).
1432 // - The first two arguments of the callback callee are unknown (-1).
1433 // - All variadic arguments to the __kmpc_fork_call are passed to the
1434 // callback callee.
1435 F->addMetadata(LLVMContext::MD_callback,
1437 2, {-1, -1},
1438 /* VarArgsArePassed */ true)}));
1439 }
1440 }
1441 // Add some known attributes.
1442 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1443 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1444 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1445
1446 assert(OutlinedFn.arg_size() >= 2 &&
1447 "Expected at least tid and bounded tid as arguments");
1448 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1449
1450 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1451 CI->getParent()->setName("omp_parallel");
1452 Builder.SetInsertPoint(CI);
1453
1454 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1455 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1456 &OutlinedFn};
1457
1458 SmallVector<Value *, 16> RealArgs;
1459 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1460 if (IfCondition) {
1461 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1462 RealArgs.push_back(Cond);
1463 }
1464 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1465
1466 // __kmpc_fork_call_if always expects a void ptr as the last argument
1467 // If there are no arguments, pass a null pointer.
1468 auto PtrTy = OMPIRBuilder->VoidPtr;
1469 if (IfCondition && NumCapturedVars == 0) {
1470 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1471 RealArgs.push_back(NullPtrValue);
1472 }
1473
1474 Builder.CreateCall(RTLFn, RealArgs);
1475
1476 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1477 << *Builder.GetInsertBlock()->getParent() << "\n");
1478
1479 // Initialize the local TID stack location with the argument value.
1480 Builder.SetInsertPoint(PrivTID);
1481 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1482 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1483 PrivTIDAddr);
1484
1485 // Remove redundant call to the outlined function.
1486 CI->eraseFromParent();
1487
1488 for (Instruction *I : ToBeDeleted) {
1489 I->eraseFromParent();
1490 }
1491}
1492
1493OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1494 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1495 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1496 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1497 omp::ProcBindKind ProcBind, bool IsCancellable) {
1498 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1499
1500 if (!updateToLocation(Loc))
1501 return Loc.IP;
1502
1503 uint32_t SrcLocStrSize;
1504 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1505 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1506 Value *ThreadID = getOrCreateThreadID(Ident);
1507 // If we generate code for the target device, we need to allocate
1508 // struct for aggregate params in the device default alloca address space.
1509 // OpenMP runtime requires that the params of the extracted functions are
1510 // passed as zero address space pointers. This flag ensures that extracted
1511 // function arguments are declared in zero address space
1512 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1513
1514 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1515 // only if we compile for host side.
1516 if (NumThreads && !Config.isTargetDevice()) {
1517 Value *Args[] = {
1518 Ident, ThreadID,
1519 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1520 Builder.CreateCall(
1521 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1522 }
1523
1524 if (ProcBind != OMP_PROC_BIND_default) {
1525 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1526 Value *Args[] = {
1527 Ident, ThreadID,
1528 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1529 Builder.CreateCall(
1530 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1531 }
1532
1533 BasicBlock *InsertBB = Builder.GetInsertBlock();
1534 Function *OuterFn = InsertBB->getParent();
1535
1536 // Save the outer alloca block because the insertion iterator may get
1537 // invalidated and we still need this later.
1538 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1539
1540 // Vector to remember instructions we used only during the modeling but which
1541 // we want to delete at the end.
1543
1544 // Change the location to the outer alloca insertion point to create and
1545 // initialize the allocas we pass into the parallel region.
1546 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1547 Builder.restoreIP(NewOuter);
1548 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1549 AllocaInst *ZeroAddrAlloca =
1550 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1551 Instruction *TIDAddr = TIDAddrAlloca;
1552 Instruction *ZeroAddr = ZeroAddrAlloca;
1553 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1554 // Add additional casts to enforce pointers in zero address space
1555 TIDAddr = new AddrSpaceCastInst(
1556 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1557 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1558 ToBeDeleted.push_back(TIDAddr);
1559 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1560 PointerType ::get(M.getContext(), 0),
1561 "zero.addr.ascast");
1562 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1563 ToBeDeleted.push_back(ZeroAddr);
1564 }
1565
1566 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1567 // associated arguments in the outlined function, so we delete them later.
1568 ToBeDeleted.push_back(TIDAddrAlloca);
1569 ToBeDeleted.push_back(ZeroAddrAlloca);
1570
1571 // Create an artificial insertion point that will also ensure the blocks we
1572 // are about to split are not degenerated.
1573 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1574
1575 BasicBlock *EntryBB = UI->getParent();
1576 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1577 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1578 BasicBlock *PRegPreFiniBB =
1579 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1580 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1581
1582 auto FiniCBWrapper = [&](InsertPointTy IP) {
1583 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1584 // target to the region exit block.
1585 if (IP.getBlock()->end() == IP.getPoint()) {
1586 IRBuilder<>::InsertPointGuard IPG(Builder);
1587 Builder.restoreIP(IP);
1588 Instruction *I = Builder.CreateBr(PRegExitBB);
1589 IP = InsertPointTy(I->getParent(), I->getIterator());
1590 }
1592 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1593 "Unexpected insertion point for finalization call!");
1594 return FiniCB(IP);
1595 };
1596
1597 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1598
1599 // Generate the privatization allocas in the block that will become the entry
1600 // of the outlined function.
1601 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1602 InsertPointTy InnerAllocaIP = Builder.saveIP();
1603
1604 AllocaInst *PrivTIDAddr =
1605 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1606 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1607
1608 // Add some fake uses for OpenMP provided arguments.
1609 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1610 Instruction *ZeroAddrUse =
1611 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1612 ToBeDeleted.push_back(ZeroAddrUse);
1613
1614 // EntryBB
1615 // |
1616 // V
1617 // PRegionEntryBB <- Privatization allocas are placed here.
1618 // |
1619 // V
1620 // PRegionBodyBB <- BodeGen is invoked here.
1621 // |
1622 // V
1623 // PRegPreFiniBB <- The block we will start finalization from.
1624 // |
1625 // V
1626 // PRegionExitBB <- A common exit to simplify block collection.
1627 //
1628
1629 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1630
1631 // Let the caller create the body.
1632 assert(BodyGenCB && "Expected body generation callback!");
1633 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1634 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1635 return Err;
1636
1637 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1638
1639 OutlineInfo OI;
1640 if (Config.isTargetDevice()) {
1641 // Generate OpenMP target specific runtime call
1642 OI.PostOutlineCB = [=, ToBeDeletedVec =
1643 std::move(ToBeDeleted)](Function &OutlinedFn) {
1644 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1645 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1646 ThreadID, ToBeDeletedVec);
1647 };
1648 } else {
1649 // Generate OpenMP host runtime call
1650 OI.PostOutlineCB = [=, ToBeDeletedVec =
1651 std::move(ToBeDeleted)](Function &OutlinedFn) {
1652 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1653 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1654 };
1655 }
1656
1657 OI.OuterAllocaBB = OuterAllocaBlock;
1658 OI.EntryBB = PRegEntryBB;
1659 OI.ExitBB = PRegExitBB;
1660
1661 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1663 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1664
1665 CodeExtractorAnalysisCache CEAC(*OuterFn);
1666 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1667 /* AggregateArgs */ false,
1668 /* BlockFrequencyInfo */ nullptr,
1669 /* BranchProbabilityInfo */ nullptr,
1670 /* AssumptionCache */ nullptr,
1671 /* AllowVarArgs */ true,
1672 /* AllowAlloca */ true,
1673 /* AllocationBlock */ OuterAllocaBlock,
1674 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1675
1676 // Find inputs to, outputs from the code region.
1677 BasicBlock *CommonExit = nullptr;
1678 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1679 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1680
1681 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1682 /*CollectGlobalInputs=*/true);
1683
1684 Inputs.remove_if([&](Value *I) {
1686 return GV->getValueType() == OpenMPIRBuilder::Ident;
1687
1688 return false;
1689 });
1690
1691 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1692
1693 FunctionCallee TIDRTLFn =
1694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1695
1696 auto PrivHelper = [&](Value &V) -> Error {
1697 if (&V == TIDAddr || &V == ZeroAddr) {
1698 OI.ExcludeArgsFromAggregate.push_back(&V);
1699 return Error::success();
1700 }
1701
1703 for (Use &U : V.uses())
1704 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1705 if (ParallelRegionBlockSet.count(UserI->getParent()))
1706 Uses.insert(&U);
1707
1708 // __kmpc_fork_call expects extra arguments as pointers. If the input
1709 // already has a pointer type, everything is fine. Otherwise, store the
1710 // value onto stack and load it back inside the to-be-outlined region. This
1711 // will ensure only the pointer will be passed to the function.
1712 // FIXME: if there are more than 15 trailing arguments, they must be
1713 // additionally packed in a struct.
1714 Value *Inner = &V;
1715 if (!V.getType()->isPointerTy()) {
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1718
1719 Builder.restoreIP(OuterAllocaIP);
1720 Value *Ptr =
1721 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1722
1723 // Store to stack at end of the block that currently branches to the entry
1724 // block of the to-be-outlined region.
1725 Builder.SetInsertPoint(InsertBB,
1726 InsertBB->getTerminator()->getIterator());
1727 Builder.CreateStore(&V, Ptr);
1728
1729 // Load back next to allocations in the to-be-outlined region.
1730 Builder.restoreIP(InnerAllocaIP);
1731 Inner = Builder.CreateLoad(V.getType(), Ptr);
1732 }
1733
1734 Value *ReplacementValue = nullptr;
1735 CallInst *CI = dyn_cast<CallInst>(&V);
1736 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1737 ReplacementValue = PrivTID;
1738 } else {
1739 InsertPointOrErrorTy AfterIP =
1740 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1741 if (!AfterIP)
1742 return AfterIP.takeError();
1743 Builder.restoreIP(*AfterIP);
1744 InnerAllocaIP = {
1745 InnerAllocaIP.getBlock(),
1746 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1747
1748 assert(ReplacementValue &&
1749 "Expected copy/create callback to set replacement value!");
1750 if (ReplacementValue == &V)
1751 return Error::success();
1752 }
1753
1754 for (Use *UPtr : Uses)
1755 UPtr->set(ReplacementValue);
1756
1757 return Error::success();
1758 };
1759
1760 // Reset the inner alloca insertion as it will be used for loading the values
1761 // wrapped into pointers before passing them into the to-be-outlined region.
1762 // Configure it to insert immediately after the fake use of zero address so
1763 // that they are available in the generated body and so that the
1764 // OpenMP-related values (thread ID and zero address pointers) remain leading
1765 // in the argument list.
1766 InnerAllocaIP = IRBuilder<>::InsertPoint(
1767 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1768
1769 // Reset the outer alloca insertion point to the entry of the relevant block
1770 // in case it was invalidated.
1771 OuterAllocaIP = IRBuilder<>::InsertPoint(
1772 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1773
1774 for (Value *Input : Inputs) {
1775 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1776 if (Error Err = PrivHelper(*Input))
1777 return Err;
1778 }
1779 LLVM_DEBUG({
1780 for (Value *Output : Outputs)
1781 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1782 });
1783 assert(Outputs.empty() &&
1784 "OpenMP outlining should not produce live-out values!");
1785
1786 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1787 LLVM_DEBUG({
1788 for (auto *BB : Blocks)
1789 dbgs() << " PBR: " << BB->getName() << "\n";
1790 });
1791
1792 // Adjust the finalization stack, verify the adjustment, and call the
1793 // finalize function a last time to finalize values between the pre-fini
1794 // block and the exit block if we left the parallel "the normal way".
1795 auto FiniInfo = FinalizationStack.pop_back_val();
1796 (void)FiniInfo;
1797 assert(FiniInfo.DK == OMPD_parallel &&
1798 "Unexpected finalization stack state!");
1799
1800 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1801
1802 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1803 if (Error Err = FiniCB(PreFiniIP))
1804 return Err;
1805
1806 // Register the outlined info.
1807 addOutlineInfo(std::move(OI));
1808
1809 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1810 UI->eraseFromParent();
1811
1812 return AfterIP;
1813}
1814
1815void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1816 // Build call void __kmpc_flush(ident_t *loc)
1817 uint32_t SrcLocStrSize;
1818 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1819 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1820
1821 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
1822}
1823
1824void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1825 if (!updateToLocation(Loc))
1826 return;
1827 emitFlush(Loc);
1828}
1829
1830void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1831 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1832 // global_tid);
1833 uint32_t SrcLocStrSize;
1834 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1835 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1836 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1837
1838 // Ignore return result until untied tasks are supported.
1839 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
1840 Args);
1841}
1842
1843void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1844 if (!updateToLocation(Loc))
1845 return;
1846 emitTaskwaitImpl(Loc);
1847}
1848
1849void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1850 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1851 uint32_t SrcLocStrSize;
1852 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1853 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1855 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1856
1857 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
1858 Args);
1859}
1860
1861void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1862 if (!updateToLocation(Loc))
1863 return;
1864 emitTaskyieldImpl(Loc);
1865}
1866
1867// Processes the dependencies in Dependencies and does the following
1868// - Allocates space on the stack of an array of DependInfo objects
1869// - Populates each DependInfo object with relevant information of
1870// the corresponding dependence.
1871// - All code is inserted in the entry block of the current function.
1873 OpenMPIRBuilder &OMPBuilder,
1875 // Early return if we have no dependencies to process
1876 if (Dependencies.empty())
1877 return nullptr;
1878
1879 // Given a vector of DependData objects, in this function we create an
1880 // array on the stack that holds kmp_dep_info objects corresponding
1881 // to each dependency. This is then passed to the OpenMP runtime.
1882 // For example, if there are 'n' dependencies then the following psedo
1883 // code is generated. Assume the first dependence is on a variable 'a'
1884 //
1885 // \code{c}
1886 // DepArray = alloc(n x sizeof(kmp_depend_info);
1887 // idx = 0;
1888 // DepArray[idx].base_addr = ptrtoint(&a);
1889 // DepArray[idx].len = 8;
1890 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1891 // ++idx;
1892 // DepArray[idx].base_addr = ...;
1893 // \endcode
1894
1895 IRBuilderBase &Builder = OMPBuilder.Builder;
1896 Type *DependInfo = OMPBuilder.DependInfo;
1897 Module &M = OMPBuilder.M;
1898
1899 Value *DepArray = nullptr;
1900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1901 Builder.SetInsertPoint(
1902 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1903
1904 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1905 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1906
1907 Builder.restoreIP(OldIP);
1908
1909 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1910 Value *Base =
1911 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1912 // Store the pointer to the variable
1913 Value *Addr = Builder.CreateStructGEP(
1914 DependInfo, Base,
1915 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1916 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1917 Builder.CreateStore(DepValPtr, Addr);
1918 // Store the size of the variable
1919 Value *Size = Builder.CreateStructGEP(
1920 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1921 Builder.CreateStore(
1922 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1923 Size);
1924 // Store the dependency kind
1925 Value *Flags = Builder.CreateStructGEP(
1926 DependInfo, Base,
1927 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1928 Builder.CreateStore(
1929 ConstantInt::get(Builder.getInt8Ty(),
1930 static_cast<unsigned int>(Dep.DepKind)),
1931 Flags);
1932 }
1933 return DepArray;
1934}
1935
1936OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1937 const LocationDescription &Loc, InsertPointTy AllocaIP,
1938 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1939 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1940 Value *Priority) {
1941
1942 if (!updateToLocation(Loc))
1943 return InsertPointTy();
1944
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 // The current basic block is split into four basic blocks. After outlining,
1949 // they will be mapped as follows:
1950 // ```
1951 // def current_fn() {
1952 // current_basic_block:
1953 // br label %task.exit
1954 // task.exit:
1955 // ; instructions after task
1956 // }
1957 // def outlined_fn() {
1958 // task.alloca:
1959 // br label %task.body
1960 // task.body:
1961 // ret void
1962 // }
1963 // ```
1964 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1965 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1966 BasicBlock *TaskAllocaBB =
1967 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1968
1969 InsertPointTy TaskAllocaIP =
1970 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1971 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1972 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1973 return Err;
1974
1975 OutlineInfo OI;
1976 OI.EntryBB = TaskAllocaBB;
1977 OI.OuterAllocaBB = AllocaIP.getBlock();
1978 OI.ExitBB = TaskExitBB;
1979
1980 // Add the thread ID argument.
1982 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
1983 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
1984
1985 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
1986 Mergeable, Priority, EventHandle, TaskAllocaBB,
1987 ToBeDeleted](Function &OutlinedFn) mutable {
1988 // Replace the Stale CI by appropriate RTL function call.
1989 assert(OutlinedFn.hasOneUse() &&
1990 "there must be a single user for the outlined function");
1991 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
1992
1993 // HasShareds is true if any variables are captured in the outlined region,
1994 // false otherwise.
1995 bool HasShareds = StaleCI->arg_size() > 1;
1996 Builder.SetInsertPoint(StaleCI);
1997
1998 // Gather the arguments for emitting the runtime call for
1999 // @__kmpc_omp_task_alloc
2000 Function *TaskAllocFn =
2001 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2002
2003 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2004 // call.
2005 Value *ThreadID = getOrCreateThreadID(Ident);
2006
2007 // Argument - `flags`
2008 // Task is tied iff (Flags & 1) == 1.
2009 // Task is untied iff (Flags & 1) == 0.
2010 // Task is final iff (Flags & 2) == 2.
2011 // Task is not final iff (Flags & 2) == 0.
2012 // Task is mergeable iff (Flags & 4) == 4.
2013 // Task is not mergeable iff (Flags & 4) == 0.
2014 // Task is priority iff (Flags & 32) == 32.
2015 // Task is not priority iff (Flags & 32) == 0.
2016 // TODO: Handle the other flags.
2017 Value *Flags = Builder.getInt32(Tied);
2018 if (Final) {
2019 Value *FinalFlag =
2020 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2021 Flags = Builder.CreateOr(FinalFlag, Flags);
2022 }
2023
2024 if (Mergeable)
2025 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2026 if (Priority)
2027 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2028
2029 // Argument - `sizeof_kmp_task_t` (TaskSize)
2030 // Tasksize refers to the size in bytes of kmp_task_t data structure
2031 // including private vars accessed in task.
2032 // TODO: add kmp_task_t_with_privates (privates)
2033 Value *TaskSize = Builder.getInt64(
2034 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2035
2036 // Argument - `sizeof_shareds` (SharedsSize)
2037 // SharedsSize refers to the shareds array size in the kmp_task_t data
2038 // structure.
2039 Value *SharedsSize = Builder.getInt64(0);
2040 if (HasShareds) {
2041 AllocaInst *ArgStructAlloca =
2043 assert(ArgStructAlloca &&
2044 "Unable to find the alloca instruction corresponding to arguments "
2045 "for extracted function");
2046 StructType *ArgStructType =
2047 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2048 assert(ArgStructType && "Unable to find struct type corresponding to "
2049 "arguments for extracted function");
2050 SharedsSize =
2051 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2052 }
2053 // Emit the @__kmpc_omp_task_alloc runtime call
2054 // The runtime call returns a pointer to an area where the task captured
2055 // variables must be copied before the task is run (TaskData)
2056 CallInst *TaskData = Builder.CreateCall(
2057 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2058 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2059 /*task_func=*/&OutlinedFn});
2060
2061 // Emit detach clause initialization.
2062 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2063 // task_descriptor);
2064 if (EventHandle) {
2065 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2066 OMPRTL___kmpc_task_allow_completion_event);
2067 llvm::Value *EventVal =
2068 Builder.CreateCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2069 llvm::Value *EventHandleAddr =
2070 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2071 Builder.getPtrTy(0));
2072 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2073 Builder.CreateStore(EventVal, EventHandleAddr);
2074 }
2075 // Copy the arguments for outlined function
2076 if (HasShareds) {
2077 Value *Shareds = StaleCI->getArgOperand(1);
2078 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2079 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2080 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2081 SharedsSize);
2082 }
2083
2084 if (Priority) {
2085 //
2086 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2087 // we populate the priority information into the "kmp_task_t" here
2088 //
2089 // The struct "kmp_task_t" definition is available in kmp.h
2090 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2091 // data2 is used for priority
2092 //
2093 Type *Int32Ty = Builder.getInt32Ty();
2094 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2095 // kmp_task_t* => { ptr }
2096 Type *TaskPtr = StructType::get(VoidPtr);
2097 Value *TaskGEP =
2098 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2099 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2100 Type *TaskStructType = StructType::get(
2101 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2102 Value *PriorityData = Builder.CreateInBoundsGEP(
2103 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2104 // kmp_cmplrdata_t => { ptr, ptr }
2105 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2106 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2107 PriorityData, {Zero, Zero});
2108 Builder.CreateStore(Priority, CmplrData);
2109 }
2110
2111 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2112
2113 // In the presence of the `if` clause, the following IR is generated:
2114 // ...
2115 // %data = call @__kmpc_omp_task_alloc(...)
2116 // br i1 %if_condition, label %then, label %else
2117 // then:
2118 // call @__kmpc_omp_task(...)
2119 // br label %exit
2120 // else:
2121 // ;; Wait for resolution of dependencies, if any, before
2122 // ;; beginning the task
2123 // call @__kmpc_omp_wait_deps(...)
2124 // call @__kmpc_omp_task_begin_if0(...)
2125 // call @outlined_fn(...)
2126 // call @__kmpc_omp_task_complete_if0(...)
2127 // br label %exit
2128 // exit:
2129 // ...
2130 if (IfCondition) {
2131 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2132 // terminator.
2133 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2134 Instruction *IfTerminator =
2135 Builder.GetInsertPoint()->getParent()->getTerminator();
2136 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2137 Builder.SetInsertPoint(IfTerminator);
2138 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2139 &ElseTI);
2140 Builder.SetInsertPoint(ElseTI);
2141
2142 if (Dependencies.size()) {
2143 Function *TaskWaitFn =
2144 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2145 Builder.CreateCall(
2146 TaskWaitFn,
2147 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2148 ConstantInt::get(Builder.getInt32Ty(), 0),
2150 }
2151 Function *TaskBeginFn =
2152 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2153 Function *TaskCompleteFn =
2154 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2155 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2156 CallInst *CI = nullptr;
2157 if (HasShareds)
2158 CI = Builder.CreateCall(&OutlinedFn, {ThreadID, TaskData});
2159 else
2160 CI = Builder.CreateCall(&OutlinedFn, {ThreadID});
2161 CI->setDebugLoc(StaleCI->getDebugLoc());
2162 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2163 Builder.SetInsertPoint(ThenTI);
2164 }
2165
2166 if (Dependencies.size()) {
2167 Function *TaskFn =
2168 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2169 Builder.CreateCall(
2170 TaskFn,
2171 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2172 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2174
2175 } else {
2176 // Emit the @__kmpc_omp_task runtime call to spawn the task
2177 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2178 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
2179 }
2180
2181 StaleCI->eraseFromParent();
2182
2183 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2184 if (HasShareds) {
2185 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2186 OutlinedFn.getArg(1)->replaceUsesWithIf(
2187 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2188 }
2189
2190 for (Instruction *I : llvm::reverse(ToBeDeleted))
2191 I->eraseFromParent();
2192 };
2193
2194 addOutlineInfo(std::move(OI));
2195 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2196
2197 return Builder.saveIP();
2198}
2199
2200OpenMPIRBuilder::InsertPointOrErrorTy
2201OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2202 InsertPointTy AllocaIP,
2203 BodyGenCallbackTy BodyGenCB) {
2204 if (!updateToLocation(Loc))
2205 return InsertPointTy();
2206
2207 uint32_t SrcLocStrSize;
2208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2209 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2210 Value *ThreadID = getOrCreateThreadID(Ident);
2211
2212 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2213 Function *TaskgroupFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2215 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2216
2217 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2218 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2219 return Err;
2220
2221 Builder.SetInsertPoint(TaskgroupExitBB);
2222 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2223 Function *EndTaskgroupFn =
2224 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2225 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2226
2227 return Builder.saveIP();
2228}
2229
2230OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2231 const LocationDescription &Loc, InsertPointTy AllocaIP,
2232 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2233 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2234 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2235
2236 if (!updateToLocation(Loc))
2237 return Loc.IP;
2238
2239 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2240 // this has not been created yet at some times when this callback runs.
2241 SmallVector<BranchInst *> CancellationBranches;
2242 auto FiniCBWrapper = [&](InsertPointTy IP) {
2243 if (IP.getBlock()->end() != IP.getPoint())
2244 return FiniCB(IP);
2245 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2246 // will fail because that function requires the Finalization Basic Block to
2247 // have a terminator, which is already removed by EmitOMPRegionBody.
2248 // IP is currently at cancelation block.
2249 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2250 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2251 CancellationBranches.push_back(DummyBranch);
2252 return FiniCB(IP);
2253 };
2254
2255 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2256
2257 // Each section is emitted as a switch case
2258 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2259 // -> OMP.createSection() which generates the IR for each section
2260 // Iterate through all sections and emit a switch construct:
2261 // switch (IV) {
2262 // case 0:
2263 // <SectionStmt[0]>;
2264 // break;
2265 // ...
2266 // case <NumSection> - 1:
2267 // <SectionStmt[<NumSection> - 1]>;
2268 // break;
2269 // }
2270 // ...
2271 // section_loop.after:
2272 // <FiniCB>;
2273 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2274 Builder.restoreIP(CodeGenIP);
2276 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2277 Function *CurFn = Continue->getParent();
2278 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2279
2280 unsigned CaseNumber = 0;
2281 for (auto SectionCB : SectionCBs) {
2283 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2284 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2285 Builder.SetInsertPoint(CaseBB);
2286 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2287 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2288 CaseEndBr->getIterator()}))
2289 return Err;
2290 CaseNumber++;
2291 }
2292 // remove the existing terminator from body BB since there can be no
2293 // terminators after switch/case
2294 return Error::success();
2295 };
2296 // Loop body ends here
2297 // LowerBound, UpperBound, and STride for createCanonicalLoop
2298 Type *I32Ty = Type::getInt32Ty(M.getContext());
2299 Value *LB = ConstantInt::get(I32Ty, 0);
2300 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2301 Value *ST = ConstantInt::get(I32Ty, 1);
2302 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2303 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2304 if (!LoopInfo)
2305 return LoopInfo.takeError();
2306
2307 InsertPointOrErrorTy WsloopIP =
2308 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2309 WorksharingLoopType::ForStaticLoop, !IsNowait);
2310 if (!WsloopIP)
2311 return WsloopIP.takeError();
2312 InsertPointTy AfterIP = *WsloopIP;
2313
2314 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2315 assert(LoopFini && "Bad structure of static workshare loop finalization");
2316
2317 // Apply the finalization callback in LoopAfterBB
2318 auto FiniInfo = FinalizationStack.pop_back_val();
2319 assert(FiniInfo.DK == OMPD_sections &&
2320 "Unexpected finalization stack state!");
2321 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2322 Builder.restoreIP(AfterIP);
2323 BasicBlock *FiniBB =
2324 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2325 if (Error Err = CB(Builder.saveIP()))
2326 return Err;
2327 AfterIP = {FiniBB, FiniBB->begin()};
2328 }
2329
2330 // Now we can fix the dummy branch to point to the right place
2331 for (BranchInst *DummyBranch : CancellationBranches) {
2332 assert(DummyBranch->getNumSuccessors() == 1);
2333 DummyBranch->setSuccessor(0, LoopFini);
2334 }
2335
2336 return AfterIP;
2337}
2338
2339OpenMPIRBuilder::InsertPointOrErrorTy
2340OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2341 BodyGenCallbackTy BodyGenCB,
2342 FinalizeCallbackTy FiniCB) {
2343 if (!updateToLocation(Loc))
2344 return Loc.IP;
2345
2346 auto FiniCBWrapper = [&](InsertPointTy IP) {
2347 if (IP.getBlock()->end() != IP.getPoint())
2348 return FiniCB(IP);
2349 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2350 // will fail because that function requires the Finalization Basic Block to
2351 // have a terminator, which is already removed by EmitOMPRegionBody.
2352 // IP is currently at cancelation block.
2353 // We need to backtrack to the condition block to fetch
2354 // the exit block and create a branch from cancelation
2355 // to exit block.
2356 IRBuilder<>::InsertPointGuard IPG(Builder);
2357 Builder.restoreIP(IP);
2358 auto *CaseBB = Loc.IP.getBlock();
2359 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2360 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2361 Instruction *I = Builder.CreateBr(ExitBB);
2362 IP = InsertPointTy(I->getParent(), I->getIterator());
2363 return FiniCB(IP);
2364 };
2365
2366 Directive OMPD = Directive::OMPD_sections;
2367 // Since we are using Finalization Callback here, HasFinalize
2368 // and IsCancellable have to be true
2369 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2370 /*Conditional*/ false, /*hasFinalize*/ true,
2371 /*IsCancellable*/ true);
2372}
2373
2374static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2376 IT++;
2377 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2378}
2379
2380Value *OpenMPIRBuilder::getGPUThreadID() {
2381 return Builder.CreateCall(
2382 getOrCreateRuntimeFunction(M,
2383 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2384 {});
2385}
2386
2387Value *OpenMPIRBuilder::getGPUWarpSize() {
2388 return Builder.CreateCall(
2389 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2390}
2391
2392Value *OpenMPIRBuilder::getNVPTXWarpID() {
2393 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2394 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2395}
2396
2397Value *OpenMPIRBuilder::getNVPTXLaneID() {
2398 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2399 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2400 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2401 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2402 "nvptx_lane_id");
2403}
2404
2405Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2406 Type *ToType) {
2407 Type *FromType = From->getType();
2408 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2409 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2410 assert(FromSize > 0 && "From size must be greater than zero");
2411 assert(ToSize > 0 && "To size must be greater than zero");
2412 if (FromType == ToType)
2413 return From;
2414 if (FromSize == ToSize)
2415 return Builder.CreateBitCast(From, ToType);
2416 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2417 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2418 InsertPointTy SaveIP = Builder.saveIP();
2419 Builder.restoreIP(AllocaIP);
2420 Value *CastItem = Builder.CreateAlloca(ToType);
2421 Builder.restoreIP(SaveIP);
2422
2423 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2424 CastItem, Builder.getPtrTy(0));
2425 Builder.CreateStore(From, ValCastItem);
2426 return Builder.CreateLoad(ToType, CastItem);
2427}
2428
2429Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2430 Value *Element,
2431 Type *ElementType,
2432 Value *Offset) {
2433 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2434 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2435
2436 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2437 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2438 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2439 Value *WarpSize =
2440 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2441 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2442 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2443 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2444 Value *WarpSizeCast =
2445 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2446 Value *ShuffleCall =
2447 Builder.CreateCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2448 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2449}
2450
/// Shuffle a (possibly large) element from a remote lane into \p DstAddr.
/// The element at \p SrcAddr is broken into 8/4/2/1-byte integer chunks; each
/// chunk is moved across lanes with createRuntimeShuffleFunction and stored to
/// the destination.
/// NOTE(review): \p ReductionArrayTy is not referenced in this implementation
/// — presumably kept for interface symmetry with related helpers; confirm.
void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
                                      Value *DstAddr, Type *ElemType,
                                      Value *Offset, Type *ReductionArrayTy) {
  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Value *ElemPtr = DstAddr;
  Value *Ptr = SrcAddr;
  // Try chunk widths 8, 4, 2, 1 bytes; each pass consumes as many full chunks
  // of that width as remain in the element.
  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
    if (Size < IntSize)
      continue;
    Type *IntType = Builder.getIntNTy(IntSize * 8);
    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
    Value *SrcAddrGEP =
        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
    ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");

    Function *CurFunc = Builder.GetInsertBlock()->getParent();
    // More than one chunk of this width: emit a loop
    // (pre_cond -> then -> pre_cond ... -> exit) that shuffles chunk by chunk
    // until fewer than IntSize bytes remain before the element's end.
    if ((Size / IntSize) > 1) {
      Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
          SrcAddrGEP, Builder.getPtrTy());
      BasicBlock *PreCondBB =
          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
      BasicBlock *CurrentBB = Builder.GetInsertBlock();
      emitBlock(PreCondBB, CurFunc);
      // PHIs carry the advancing source/destination cursors across iterations.
      PHINode *PhiSrc =
          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr, CurrentBB);
      PHINode *PhiDest =
          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr, CurrentBB);
      Ptr = PhiSrc;
      ElemPtr = PhiDest;
      // Loop while at least IntSize bytes remain: PtrEnd - Ptr > IntSize - 1.
      Value *PtrDiff = Builder.CreatePtrDiff(
          Builder.getInt8Ty(), PtrEnd,
          Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
      Builder.CreateCondBr(
          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
          ExitBB);
      emitBlock(ThenBB, CurFunc);
      // Shuffle one chunk from the remote lane and store it at the cursor.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP,
          Builder.CreateAlignedLoad(
              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
          IntType, Offset);
      Builder.CreateAlignedStore(Res, ElemPtr,
                                 M.getDataLayout().getPrefTypeAlign(ElemType));
      // Advance both cursors by one chunk and continue the loop.
      Value *LocalPtr =
          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      Value *LocalElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
      PhiSrc->addIncoming(LocalPtr, ThenBB);
      PhiDest->addIncoming(LocalElemPtr, ThenBB);
      emitBranch(PreCondBB);
      emitBlock(ExitBB, CurFunc);
    } else {
      // Exactly one chunk of this width: shuffle and store it directly,
      // truncating if the element is a narrower integer than the shuffle
      // result.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
                                         Res->getType()->getScalarSizeInBits())
        Res = Builder.CreateTrunc(Res, ElemType);
      Builder.CreateStore(Res, ElemPtr);
      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      ElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
    }
    // Bytes still to be moved after this pass.
    Size = Size % IntSize;
  }
}
2534
/// Copy, element by element, the reduce list at \p SrcBase into \p DestBase.
///
/// For CopyAction::ThreadCopy the destination slot is read out of the dest
/// Reduce list and the value is copied directly (scalar store, component-wise
/// stores for complex, memcpy for aggregates). For
/// CopyAction::RemoteLaneToThread a thread-private temporary is allocated at
/// \p AllocaIP, the value is shuffled in from the lane at
/// CopyOptions.RemoteLaneOffset via shuffleAndStore, and the corresponding
/// dest list pointer is redirected to the new temporary.
void OpenMPIRBuilder::emitReductionListCopy(
    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
    CopyOptionsTy CopyOptions) {
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  // Only meaningful for RemoteLaneToThread; ThreadCopy ignores it.
  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *SrcElementAddr = nullptr;
    Value *DestElementAddr = nullptr;
    Value *DestElementPtrAddr = nullptr;
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;

    // Step 1.1: Get the address for the src element in the Reduce list.
    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, SrcBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);

    // Step 1.2: Create a temporary to store the element in the destination
    // Reduce list.
    DestElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, DestBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    switch (Action) {
    case CopyAction::RemoteLaneToThread: {
      // Allocate the temporary in the function's alloca region so it is
      // visible to the reduce function as well; cast it back to the generic
      // address space before use.
      InsertPointTy CurIP = Builder.saveIP();
      Builder.restoreIP(AllocaIP);
      AllocaInst *DestAlloca = Builder.CreateAlloca(RI.ElementType, nullptr,
                                                    ".omp.reduction.element");
      DestAlloca->setAlignment(
          M.getDataLayout().getPrefTypeAlign(RI.ElementType));
      DestElementAddr = DestAlloca;
      DestElementAddr =
          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
                                      DestElementAddr->getName() + ".ascast");
      Builder.restoreIP(CurIP);
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case CopyAction::ThreadCopy: {
      // Reuse the element storage already referenced by the dest list.
      DestElementAddr =
          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
      break;
    }
    }

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(AllocaIP, SrcElementAddr, DestElementAddr, RI.ElementType,
                      RemoteLaneOffset, ReductionArrayTy);
    } else {
      switch (RI.EvaluationKind) {
      case EvalKind::Scalar: {
        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
        // Store the source element value to the dest element address.
        Builder.CreateStore(Elem, DestElementAddr);
        break;
      }
      case EvalKind::Complex: {
        // Copy the real and imaginary struct members individually.
        Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
        Value *SrcReal = Builder.CreateLoad(
            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
        Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
        Value *SrcImg = Builder.CreateLoad(
            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

        Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 0, ".realp");
        Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
        Builder.CreateStore(SrcReal, DestRealPtr);
        Builder.CreateStore(SrcImg, DestImgPtr);
        break;
      }
      case EvalKind::Aggregate: {
        // Aggregates are copied bytewise using their store size.
        Value *SizeVal = Builder.getInt64(
            M.getDataLayout().getTypeStoreSize(RI.ElementType));
        Builder.CreateMemCpy(
            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SizeVal, false);
        break;
      }
      };
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element.  The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
          DestElementAddr, Builder.getPtrTy(),
          DestElementAddr->getName() + ".ascast");
      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
    }
  }
}
2647
2648Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2649 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2650 AttributeList FuncAttrs) {
2651 InsertPointTy SavedIP = Builder.saveIP();
2652 LLVMContext &Ctx = M.getContext();
2654 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2655 /* IsVarArg */ false);
2656 Function *WcFunc =
2658 "_omp_reduction_inter_warp_copy_func", &M);
2659 WcFunc->setAttributes(FuncAttrs);
2660 WcFunc->addParamAttr(0, Attribute::NoUndef);
2661 WcFunc->addParamAttr(1, Attribute::NoUndef);
2662 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2663 Builder.SetInsertPoint(EntryBB);
2664
2665 // ReduceList: thread local Reduce list.
2666 // At the stage of the computation when this function is called, partially
2667 // aggregated values reside in the first lane of every active warp.
2668 Argument *ReduceListArg = WcFunc->getArg(0);
2669 // NumWarps: number of warps active in the parallel region. This could
2670 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2671 Argument *NumWarpsArg = WcFunc->getArg(1);
2672
2673 // This array is used as a medium to transfer, one reduce element at a time,
2674 // the data from the first lane of every warp to lanes in the first warp
2675 // in order to perform the final step of a reduction in a parallel region
2676 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2677 // for reduced latency, as well as to have a distinct copy for concurrently
2678 // executing target regions. The array is declared with common linkage so
2679 // as to be shared across compilation units.
2680 StringRef TransferMediumName =
2681 "__openmp_nvptx_data_transfer_temporary_storage";
2682 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2683 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2684 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2685 if (!TransferMedium) {
2686 TransferMedium = new GlobalVariable(
2687 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2688 UndefValue::get(ArrayTy), TransferMediumName,
2689 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2690 /*AddressSpace=*/3);
2691 }
2692
2693 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2694 Value *GPUThreadID = getGPUThreadID();
2695 // nvptx_lane_id = nvptx_id % warpsize
2696 Value *LaneID = getNVPTXLaneID();
2697 // nvptx_warp_id = nvptx_id / warpsize
2698 Value *WarpID = getNVPTXWarpID();
2699
2700 InsertPointTy AllocaIP =
2701 InsertPointTy(Builder.GetInsertBlock(),
2702 Builder.GetInsertBlock()->getFirstInsertionPt());
2703 Type *Arg0Type = ReduceListArg->getType();
2704 Type *Arg1Type = NumWarpsArg->getType();
2705 Builder.restoreIP(AllocaIP);
2706 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2707 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2708 AllocaInst *NumWarpsAlloca =
2709 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2710 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2711 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2712 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2713 NumWarpsAlloca, Builder.getPtrTy(0),
2714 NumWarpsAlloca->getName() + ".ascast");
2715 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2716 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2717 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2718 InsertPointTy CodeGenIP =
2719 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2720 Builder.restoreIP(CodeGenIP);
2721
2722 Value *ReduceList =
2723 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2724
2725 for (auto En : enumerate(ReductionInfos)) {
2726 //
2727 // Warp master copies reduce element to transfer medium in __shared__
2728 // memory.
2729 //
2730 const ReductionInfo &RI = En.value();
2731 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(RI.ElementType);
2732 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2733 Type *CType = Builder.getIntNTy(TySize * 8);
2734
2735 unsigned NumIters = RealTySize / TySize;
2736 if (NumIters == 0)
2737 continue;
2738 Value *Cnt = nullptr;
2739 Value *CntAddr = nullptr;
2740 BasicBlock *PrecondBB = nullptr;
2741 BasicBlock *ExitBB = nullptr;
2742 if (NumIters > 1) {
2743 CodeGenIP = Builder.saveIP();
2744 Builder.restoreIP(AllocaIP);
2745 CntAddr =
2746 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2747
2748 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2749 CntAddr->getName() + ".ascast");
2750 Builder.restoreIP(CodeGenIP);
2751 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2752 CntAddr,
2753 /*Volatile=*/false);
2754 PrecondBB = BasicBlock::Create(Ctx, "precond");
2755 ExitBB = BasicBlock::Create(Ctx, "exit");
2756 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2757 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2758 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2759 /*Volatile=*/false);
2760 Value *Cmp = Builder.CreateICmpULT(
2761 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2762 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2763 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2764 }
2765
2766 // kmpc_barrier.
2767 InsertPointOrErrorTy BarrierIP1 =
2768 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2769 omp::Directive::OMPD_unknown,
2770 /* ForceSimpleCall */ false,
2771 /* CheckCancelFlag */ true);
2772 if (!BarrierIP1)
2773 return BarrierIP1.takeError();
2774 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2775 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2776 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2777
2778 // if (lane_id == 0)
2779 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2780 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2781 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2782
2783 // Reduce element = LocalReduceList[i]
2784 auto *RedListArrayTy =
2785 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2786 Type *IndexTy = Builder.getIndexTy(
2787 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2788 Value *ElemPtrPtr =
2789 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2790 {ConstantInt::get(IndexTy, 0),
2791 ConstantInt::get(IndexTy, En.index())});
2792 // elemptr = ((CopyType*)(elemptrptr)) + I
2793 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2794 if (NumIters > 1)
2795 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2796
2797 // Get pointer to location in transfer medium.
2798 // MediumPtr = &medium[warp_id]
2799 Value *MediumPtr = Builder.CreateInBoundsGEP(
2800 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2801 // elem = *elemptr
2802 //*MediumPtr = elem
2803 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2804 // Store the source element value to the dest element address.
2805 Builder.CreateStore(Elem, MediumPtr,
2806 /*IsVolatile*/ true);
2807 Builder.CreateBr(MergeBB);
2808
2809 // else
2810 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2811 Builder.CreateBr(MergeBB);
2812
2813 // endif
2814 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2815 InsertPointOrErrorTy BarrierIP2 =
2816 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2817 omp::Directive::OMPD_unknown,
2818 /* ForceSimpleCall */ false,
2819 /* CheckCancelFlag */ true);
2820 if (!BarrierIP2)
2821 return BarrierIP2.takeError();
2822
2823 // Warp 0 copies reduce element from transfer medium
2824 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2825 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2826 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2827
2828 Value *NumWarpsVal =
2829 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2830 // Up to 32 threads in warp 0 are active.
2831 Value *IsActiveThread =
2832 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2833 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2834
2835 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2836
2837 // SecMediumPtr = &medium[tid]
2838 // SrcMediumVal = *SrcMediumPtr
2839 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2840 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2841 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2842 Value *TargetElemPtrPtr =
2843 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2844 {ConstantInt::get(IndexTy, 0),
2845 ConstantInt::get(IndexTy, En.index())});
2846 Value *TargetElemPtrVal =
2847 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2848 Value *TargetElemPtr = TargetElemPtrVal;
2849 if (NumIters > 1)
2850 TargetElemPtr =
2851 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2852
2853 // *TargetElemPtr = SrcMediumVal;
2854 Value *SrcMediumValue =
2855 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2856 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2857 Builder.CreateBr(W0MergeBB);
2858
2859 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2860 Builder.CreateBr(W0MergeBB);
2861
2862 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2863
2864 if (NumIters > 1) {
2865 Cnt = Builder.CreateNSWAdd(
2866 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2867 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2868
2869 auto *CurFn = Builder.GetInsertBlock()->getParent();
2870 emitBranch(PrecondBB);
2871 emitBlock(ExitBB, CurFn);
2872 }
2873 RealTySize %= TySize;
2874 }
2875 }
2876
2877 Builder.CreateRetVoid();
2878 Builder.restoreIP(SavedIP);
2879
2880 return WcFunc;
2881}
2882
2883Function *OpenMPIRBuilder::emitShuffleAndReduceFunction(
2884 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2885 AttributeList FuncAttrs) {
2886 LLVMContext &Ctx = M.getContext();
2887 FunctionType *FuncTy =
2888 FunctionType::get(Builder.getVoidTy(),
2889 {Builder.getPtrTy(), Builder.getInt16Ty(),
2890 Builder.getInt16Ty(), Builder.getInt16Ty()},
2891 /* IsVarArg */ false);
2892 Function *SarFunc =
2894 "_omp_reduction_shuffle_and_reduce_func", &M);
2895 SarFunc->setAttributes(FuncAttrs);
2896 SarFunc->addParamAttr(0, Attribute::NoUndef);
2897 SarFunc->addParamAttr(1, Attribute::NoUndef);
2898 SarFunc->addParamAttr(2, Attribute::NoUndef);
2899 SarFunc->addParamAttr(3, Attribute::NoUndef);
2900 SarFunc->addParamAttr(1, Attribute::SExt);
2901 SarFunc->addParamAttr(2, Attribute::SExt);
2902 SarFunc->addParamAttr(3, Attribute::SExt);
2903 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
2904 Builder.SetInsertPoint(EntryBB);
2905
2906 // Thread local Reduce list used to host the values of data to be reduced.
2907 Argument *ReduceListArg = SarFunc->getArg(0);
2908 // Current lane id; could be logical.
2909 Argument *LaneIDArg = SarFunc->getArg(1);
2910 // Offset of the remote source lane relative to the current lane.
2911 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
2912 // Algorithm version. This is expected to be known at compile time.
2913 Argument *AlgoVerArg = SarFunc->getArg(3);
2914
2915 Type *ReduceListArgType = ReduceListArg->getType();
2916 Type *LaneIDArgType = LaneIDArg->getType();
2917 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
2918 Value *ReduceListAlloca = Builder.CreateAlloca(
2919 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
2920 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2921 LaneIDArg->getName() + ".addr");
2922 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
2923 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
2924 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
2925 AlgoVerArg->getName() + ".addr");
2926 ArrayType *RedListArrayTy =
2927 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2928
2929 // Create a local thread-private variable to host the Reduce list
2930 // from a remote lane.
2931 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
2932 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
2933
2934 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2935 ReduceListAlloca, ReduceListArgType,
2936 ReduceListAlloca->getName() + ".ascast");
2937 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2938 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
2939 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2940 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
2941 RemoteLaneOffsetAlloca->getName() + ".ascast");
2942 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2943 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
2944 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2945 RemoteReductionListAlloca, Builder.getPtrTy(),
2946 RemoteReductionListAlloca->getName() + ".ascast");
2947
2948 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2949 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
2950 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
2951 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
2952
2953 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
2954 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
2955 Value *RemoteLaneOffset =
2956 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
2957 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
2958
2959 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
2960
2961 // This loop iterates through the list of reduce elements and copies,
2962 // element by element, from a remote lane in the warp to RemoteReduceList,
2963 // hosted on the thread's stack.
2964 emitReductionListCopy(
2965 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
2966 ReduceList, RemoteListAddrCast, {RemoteLaneOffset, nullptr, nullptr});
2967
2968 // The actions to be performed on the Remote Reduce list is dependent
2969 // on the algorithm version.
2970 //
2971 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2972 // LaneId % 2 == 0 && Offset > 0):
2973 // do the reduction value aggregation
2974 //
2975 // The thread local variable Reduce list is mutated in place to host the
2976 // reduced data, which is the aggregated value produced from local and
2977 // remote lanes.
2978 //
2979 // Note that AlgoVer is expected to be a constant integer known at compile
2980 // time.
2981 // When AlgoVer==0, the first conjunction evaluates to true, making
2982 // the entire predicate true during compile time.
2983 // When AlgoVer==1, the second conjunction has only the second part to be
2984 // evaluated during runtime. Other conjunctions evaluates to false
2985 // during compile time.
2986 // When AlgoVer==2, the third conjunction has only the second part to be
2987 // evaluated during runtime. Other conjunctions evaluates to false
2988 // during compile time.
2989 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
2990 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
2991 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
2992 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
2993 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
2994 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
2995 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
2996 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
2997 Value *RemoteOffsetComp =
2998 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
2999 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3000 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3001 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3002
3003 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3004 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3005 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3006
3007 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3008 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3009 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3010 ReduceList, Builder.getPtrTy());
3011 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3012 RemoteListAddrCast, Builder.getPtrTy());
3013 Builder.CreateCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3014 ->addFnAttr(Attribute::NoUnwind);
3015 Builder.CreateBr(MergeBB);
3016
3017 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3018 Builder.CreateBr(MergeBB);
3019
3020 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3021
3022 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3023 // Reduce list.
3024 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3025 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3026 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3027
3028 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3029 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3030 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3031 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3032
3033 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3034 emitReductionListCopy(AllocaIP, CopyAction::ThreadCopy, RedListArrayTy,
3035 ReductionInfos, RemoteListAddrCast, ReduceList);
3036 Builder.CreateBr(CpyMergeBB);
3037
3038 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3039 Builder.CreateBr(CpyMergeBB);
3040
3041 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3042
3043 Builder.CreateRetVoid();
3044
3045 return SarFunc;
3046}
3047
3048Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3049 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3050 AttributeList FuncAttrs) {
3051 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3052 LLVMContext &Ctx = M.getContext();
3054 Builder.getVoidTy(),
3055 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3056 /* IsVarArg */ false);
3057 Function *LtGCFunc =
3059 "_omp_reduction_list_to_global_copy_func", &M);
3060 LtGCFunc->setAttributes(FuncAttrs);
3061 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3062 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3063 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3064
3065 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3066 Builder.SetInsertPoint(EntryBlock);
3067
3068 // Buffer: global reduction buffer.
3069 Argument *BufferArg = LtGCFunc->getArg(0);
3070 // Idx: index of the buffer.
3071 Argument *IdxArg = LtGCFunc->getArg(1);
3072 // ReduceList: thread local Reduce list.
3073 Argument *ReduceListArg = LtGCFunc->getArg(2);
3074
3075 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3076 BufferArg->getName() + ".addr");
3077 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3078 IdxArg->getName() + ".addr");
3079 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3080 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3082 BufferArgAlloca, Builder.getPtrTy(),
3083 BufferArgAlloca->getName() + ".ascast");
3084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3087 ReduceListArgAlloca, Builder.getPtrTy(),
3088 ReduceListArgAlloca->getName() + ".ascast");
3089
3090 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3091 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3092 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3093
3094 Value *LocalReduceList =
3095 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3096 Value *BufferArgVal =
3097 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3099 Type *IndexTy = Builder.getIndexTy(
3100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3101 for (auto En : enumerate(ReductionInfos)) {
3102 const ReductionInfo &RI = En.value();
3103 auto *RedListArrayTy =
3104 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3105 // Reduce element = LocalReduceList[i]
3106 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3107 RedListArrayTy, LocalReduceList,
3108 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3109 // elemptr = ((CopyType*)(elemptrptr)) + I
3110 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3111
3112 // Global = Buffer.VD[Idx];
3113 Value *BufferVD =
3114 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3115 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3116 ReductionsBufferTy, BufferVD, 0, En.index());
3117
3118 switch (RI.EvaluationKind) {
3119 case EvalKind::Scalar: {
3120 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3121 Builder.CreateStore(TargetElement, GlobVal);
3122 break;
3123 }
3124 case EvalKind::Complex: {
3125 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3126 RI.ElementType, ElemPtr, 0, 0, ".realp");
3127 Value *SrcReal = Builder.CreateLoad(
3128 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3129 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3130 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3131 Value *SrcImg = Builder.CreateLoad(
3132 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3133
3134 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3135 RI.ElementType, GlobVal, 0, 0, ".realp");
3136 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3137 RI.ElementType, GlobVal, 0, 1, ".imagp");
3138 Builder.CreateStore(SrcReal, DestRealPtr);
3139 Builder.CreateStore(SrcImg, DestImgPtr);
3140 break;
3141 }
3142 case EvalKind::Aggregate: {
3143 Value *SizeVal =
3144 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3145 Builder.CreateMemCpy(
3146 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3147 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3148 break;
3149 }
3150 }
3151 }
3152
3153 Builder.CreateRetVoid();
3154 Builder.restoreIP(OldIP);
3155 return LtGCFunc;
3156}
3157
3158Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3159 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3160 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3161 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3162 LLVMContext &Ctx = M.getContext();
3164 Builder.getVoidTy(),
3165 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3166 /* IsVarArg */ false);
3167 Function *LtGRFunc =
3169 "_omp_reduction_list_to_global_reduce_func", &M);
3170 LtGRFunc->setAttributes(FuncAttrs);
3171 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3172 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3173 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3174
3175 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3176 Builder.SetInsertPoint(EntryBlock);
3177
3178 // Buffer: global reduction buffer.
3179 Argument *BufferArg = LtGRFunc->getArg(0);
3180 // Idx: index of the buffer.
3181 Argument *IdxArg = LtGRFunc->getArg(1);
3182 // ReduceList: thread local Reduce list.
3183 Argument *ReduceListArg = LtGRFunc->getArg(2);
3184
3185 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3186 BufferArg->getName() + ".addr");
3187 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3188 IdxArg->getName() + ".addr");
3189 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3190 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3191 auto *RedListArrayTy =
3192 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3193
3194 // 1. Build a list of reduction variables.
3195 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3196 Value *LocalReduceList =
3197 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3198
3199 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 BufferArgAlloca, Builder.getPtrTy(),
3201 BufferArgAlloca->getName() + ".ascast");
3202 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3203 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3204 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 ReduceListArgAlloca, Builder.getPtrTy(),
3206 ReduceListArgAlloca->getName() + ".ascast");
3207 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 LocalReduceList, Builder.getPtrTy(),
3209 LocalReduceList->getName() + ".ascast");
3210
3211 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3212 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3213 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3214
3215 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3216 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3217 Type *IndexTy = Builder.getIndexTy(
3218 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3219 for (auto En : enumerate(ReductionInfos)) {
3220 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3221 RedListArrayTy, LocalReduceListAddrCast,
3222 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3223 Value *BufferVD =
3224 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3225 // Global = Buffer.VD[Idx];
3226 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3227 ReductionsBufferTy, BufferVD, 0, En.index());
3228 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3229 }
3230
3231 // Call reduce_function(GlobalReduceList, ReduceList)
3232 Value *ReduceList =
3233 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3234 Builder.CreateCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3235 ->addFnAttr(Attribute::NoUnwind);
3236 Builder.CreateRetVoid();
3237 Builder.restoreIP(OldIP);
3238 return LtGRFunc;
3239}
3240
3241Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3242 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3243 AttributeList FuncAttrs) {
3244 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3245 LLVMContext &Ctx = M.getContext();
3247 Builder.getVoidTy(),
3248 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3249 /* IsVarArg */ false);
3250 Function *LtGCFunc =
3252 "_omp_reduction_global_to_list_copy_func", &M);
3253 LtGCFunc->setAttributes(FuncAttrs);
3254 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3255 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3256 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3257
3258 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3259 Builder.SetInsertPoint(EntryBlock);
3260
3261 // Buffer: global reduction buffer.
3262 Argument *BufferArg = LtGCFunc->getArg(0);
3263 // Idx: index of the buffer.
3264 Argument *IdxArg = LtGCFunc->getArg(1);
3265 // ReduceList: thread local Reduce list.
3266 Argument *ReduceListArg = LtGCFunc->getArg(2);
3267
3268 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3269 BufferArg->getName() + ".addr");
3270 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3271 IdxArg->getName() + ".addr");
3272 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3273 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3274 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3275 BufferArgAlloca, Builder.getPtrTy(),
3276 BufferArgAlloca->getName() + ".ascast");
3277 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3278 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3279 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3280 ReduceListArgAlloca, Builder.getPtrTy(),
3281 ReduceListArgAlloca->getName() + ".ascast");
3282 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3283 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3284 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3285
3286 Value *LocalReduceList =
3287 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3288 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3289 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3290 Type *IndexTy = Builder.getIndexTy(
3291 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3292 for (auto En : enumerate(ReductionInfos)) {
3293 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3294 auto *RedListArrayTy =
3295 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3296 // Reduce element = LocalReduceList[i]
3297 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3298 RedListArrayTy, LocalReduceList,
3299 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3300 // elemptr = ((CopyType*)(elemptrptr)) + I
3301 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3302 // Global = Buffer.VD[Idx];
3303 Value *BufferVD =
3304 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3305 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3306 ReductionsBufferTy, BufferVD, 0, En.index());
3307
3308 switch (RI.EvaluationKind) {
3309 case EvalKind::Scalar: {
3310 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3311 Builder.CreateStore(TargetElement, ElemPtr);
3312 break;
3313 }
3314 case EvalKind::Complex: {
3315 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3316 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3317 Value *SrcReal = Builder.CreateLoad(
3318 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3319 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3320 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3321 Value *SrcImg = Builder.CreateLoad(
3322 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3323
3324 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3325 RI.ElementType, ElemPtr, 0, 0, ".realp");
3326 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3327 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3328 Builder.CreateStore(SrcReal, DestRealPtr);
3329 Builder.CreateStore(SrcImg, DestImgPtr);
3330 break;
3331 }
3332 case EvalKind::Aggregate: {
3333 Value *SizeVal =
3334 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3335 Builder.CreateMemCpy(
3336 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3337 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3338 SizeVal, false);
3339 break;
3340 }
3341 }
3342 }
3343
3344 Builder.CreateRetVoid();
3345 Builder.restoreIP(OldIP);
3346 return LtGCFunc;
3347}
3348
3349Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3350 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3351 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3352 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3353 LLVMContext &Ctx = M.getContext();
3354 auto *FuncTy = FunctionType::get(
3355 Builder.getVoidTy(),
3356 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3357 /* IsVarArg */ false);
3358 Function *LtGRFunc =
3360 "_omp_reduction_global_to_list_reduce_func", &M);
3361 LtGRFunc->setAttributes(FuncAttrs);
3362 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3363 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3364 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3365
3366 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3367 Builder.SetInsertPoint(EntryBlock);
3368
3369 // Buffer: global reduction buffer.
3370 Argument *BufferArg = LtGRFunc->getArg(0);
3371 // Idx: index of the buffer.
3372 Argument *IdxArg = LtGRFunc->getArg(1);
3373 // ReduceList: thread local Reduce list.
3374 Argument *ReduceListArg = LtGRFunc->getArg(2);
3375
3376 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3377 BufferArg->getName() + ".addr");
3378 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3379 IdxArg->getName() + ".addr");
3380 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3381 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3382 ArrayType *RedListArrayTy =
3383 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3384
3385 // 1. Build a list of reduction variables.
3386 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3387 Value *LocalReduceList =
3388 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3389
3390 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 BufferArgAlloca, Builder.getPtrTy(),
3392 BufferArgAlloca->getName() + ".ascast");
3393 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3394 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3395 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3396 ReduceListArgAlloca, Builder.getPtrTy(),
3397 ReduceListArgAlloca->getName() + ".ascast");
3398 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3399 LocalReduceList, Builder.getPtrTy(),
3400 LocalReduceList->getName() + ".ascast");
3401
3402 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3403 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3404 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3405
3406 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3407 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3408 Type *IndexTy = Builder.getIndexTy(
3409 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3410 for (auto En : enumerate(ReductionInfos)) {
3411 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3412 RedListArrayTy, ReductionList,
3413 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3414 // Global = Buffer.VD[Idx];
3415 Value *BufferVD =
3416 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3417 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3418 ReductionsBufferTy, BufferVD, 0, En.index());
3419 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3420 }
3421
3422 // Call reduce_function(ReduceList, GlobalReduceList)
3423 Value *ReduceList =
3424 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3425 Builder.CreateCall(ReduceFn, {ReduceList, ReductionList})
3426 ->addFnAttr(Attribute::NoUnwind);
3427 Builder.CreateRetVoid();
3428 Builder.restoreIP(OldIP);
3429 return LtGRFunc;
3430}
3431
3432std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3433 std::string Suffix =
3434 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3435 return (Name + Suffix).str();
3436}
3437
3438Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3439 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3440 ReductionGenCBKind ReductionGenCBKind, AttributeList FuncAttrs) {
3441 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3442 {Builder.getPtrTy(), Builder.getPtrTy()},
3443 /* IsVarArg */ false);
3444 std::string Name = getReductionFuncName(ReducerName);
3445 Function *ReductionFunc =
3447 ReductionFunc->setAttributes(FuncAttrs);
3448 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3449 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3450 BasicBlock *EntryBB =
3451 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3452 Builder.SetInsertPoint(EntryBB);
3453
3454 // Need to alloca memory here and deal with the pointers before getting
3455 // LHS/RHS pointers out
3456 Value *LHSArrayPtr = nullptr;
3457 Value *RHSArrayPtr = nullptr;
3458 Argument *Arg0 = ReductionFunc->getArg(0);
3459 Argument *Arg1 = ReductionFunc->getArg(1);
3460 Type *Arg0Type = Arg0->getType();
3461 Type *Arg1Type = Arg1->getType();
3462
3463 Value *LHSAlloca =
3464 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3465 Value *RHSAlloca =
3466 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3467 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3469 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3470 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3471 Builder.CreateStore(Arg0, LHSAddrCast);
3472 Builder.CreateStore(Arg1, RHSAddrCast);
3473 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3474 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3475
3476 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3477 Type *IndexTy = Builder.getIndexTy(
3478 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3479 SmallVector<Value *> LHSPtrs, RHSPtrs;
3480 for (auto En : enumerate(ReductionInfos)) {
3481 const ReductionInfo &RI = En.value();
3482 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3483 RedArrayTy, RHSArrayPtr,
3484 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3485 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3486 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3487 RHSI8Ptr, RI.PrivateVariable->getType(),
3488 RHSI8Ptr->getName() + ".ascast");
3489
3490 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3491 RedArrayTy, LHSArrayPtr,
3492 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3493 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3494 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3495 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3496
3497 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3498 LHSPtrs.emplace_back(LHSPtr);
3499 RHSPtrs.emplace_back(RHSPtr);
3500 } else {
3501 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3502 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3503 Value *Reduced;
3504 InsertPointOrErrorTy AfterIP =
3505 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3506 if (!AfterIP)
3507 return AfterIP.takeError();
3508 if (!Builder.GetInsertBlock())
3509 return ReductionFunc;
3510
3511 Builder.restoreIP(*AfterIP);
3512 Builder.CreateStore(Reduced, LHSPtr);
3513 }
3514 }
3515
3516 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3517 for (auto En : enumerate(ReductionInfos)) {
3518 unsigned Index = En.index();
3519 const ReductionInfo &RI = En.value();
3520 Value *LHSFixupPtr, *RHSFixupPtr;
3521 Builder.restoreIP(RI.ReductionGenClang(
3522 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3523
3524 // Fix the CallBack code genereated to use the correct Values for the LHS
3525 // and RHS
3526 LHSFixupPtr->replaceUsesWithIf(
3527 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3528 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3529 ReductionFunc;
3530 });
3531 RHSFixupPtr->replaceUsesWithIf(
3532 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3533 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3534 ReductionFunc;
3535 });
3536 }
3537
3538 Builder.CreateRetVoid();
3539 return ReductionFunc;
3540}
3541
3542static void
3544 bool IsGPU) {
3545 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3546 (void)RI;
3547 assert(RI.Variable && "expected non-null variable");
3548 assert(RI.PrivateVariable && "expected non-null private variable");
3549 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3550 "expected non-null reduction generator callback");
3551 if (!IsGPU) {
3552 assert(
3553 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3554 "expected variables and their private equivalents to have the same "
3555 "type");
3556 }
3557 assert(RI.Variable->getType()->isPointerTy() &&
3558 "expected variables to be pointers");
3559 }
3560}
3561
// GPU lowering of OpenMP reductions: builds the outlined reduce function and
// the data-movement helpers required by the device runtime, then calls
// __kmpc_nvptx_parallel_reduce_nowait_v2 (parallel) or
// __kmpc_nvptx_teams_reduce_nowait_v2 (teams) and emits the finalization
// code that runs when the call reports this thread holds the reduced values.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
    bool IsNoWait, bool IsTeamsReduction, ReductionGenCBKind ReductionGenCBKind,
    std::optional<omp::GV> GridValue, unsigned ReductionBufNum,
    Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();
  Builder.restoreIP(CodeGenIP);
  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
  LLVMContext &Ctx = M.getContext();

  // Source location for the ident struct
  if (!SrcLocInfo) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }

  // Nothing to reduce.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  // For non-Clang callbacks, split off a continuation block that the
  // finalization code branches to when it is done.
  BasicBlock *ContinuationBlock = nullptr;
  if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
    // Copied code from createReductions
    BasicBlock *InsertBlock = Loc.IP.getBlock();
    ContinuationBlock =
        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
    InsertBlock->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
  }

  // The emitted helper functions inherit the current function's attributes,
  // minus OptimizeNone so they can be optimized.
  Function *CurFunc = Builder.GetInsertBlock()->getParent();
  AttributeList FuncAttrs;
  AttrBuilder AttrBldr(Ctx);
  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
    AttrBldr.addAttribute(Attr);
  AttrBldr.removeAttribute(Attribute::OptimizeNone);
  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);

  // createReductionFunction moves the insertion point; save and restore it.
  CodeGenIP = Builder.saveIP();
  Expected<Function *> ReductionResult =
      createReductionFunction(Builder.GetInsertBlock()->getParent()->getName(),
                              ReductionInfos, ReductionGenCBKind, FuncAttrs);
  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;
  Builder.restoreIP(CodeGenIP);

  // Set the grid value in the config needed for lowering later on
  if (GridValue.has_value())
    Config.setGridValue(GridValue.value());
  else
    Config.setGridValue(getGridValue(T, ReductionFunc));

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  Value *Res;

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = ReductionInfos.size();
  Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
  Type *FuncPtrTy =
      Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  // The list itself is allocated in the function's alloca block.
  CodeGenIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
  Builder.restoreIP(CodeGenIP);
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  // Store the (address-space-cast) private variable pointers into the list.
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *CastElem =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
    Builder.CreateStore(CastElem, ElemPtr);
  }
  // 2. Emit the intra-warp shuffle/reduce and inter-warp copy helpers.
  CodeGenIP = Builder.saveIP();
  Function *SarFunc =
      emitShuffleAndReduceFunction(ReductionInfos, ReductionFunc, FuncAttrs);
  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs);
  if (!CopyResult)
    return CopyResult.takeError();
  Function *WcFunc = *CopyResult;
  Builder.restoreIP(CodeGenIP);

  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  // The runtime receives max-element-size * num-reductions as the data size;
  // also collect the element types for the teams reduction buffer struct.
  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    ReductionTypeArgs.emplace_back(En.value().ElementType);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    // Parallel reduction: only the shuffle/inter-warp helpers are needed.
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(SarFunc, FuncPtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    Res = Builder.CreateCall(Pv2Ptr, Args);
  } else {
    // Teams reduction additionally stages data through a fixed global buffer
    // and needs the four list<->global copy/reduce helpers.
    CodeGenIP = Builder.saveIP();
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
    Function *LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Function *GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Builder.restoreIP(CodeGenIP);

    Value *KernelTeamsReductionPtr = Builder.CreateCall(
        RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,
                      Builder.getInt32(ReductionBufNum),
                      ReductionDataSize,
                      RL,
                      SarFunc,
                      WcFunc,
                      LtGCFunc,
                      LtGRFunc,
                      GtLCFunc,
                      GtLRFunc};

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    Res = Builder.CreateCall(TeamsReduceFn, Args3);
  }

  // 5. Build if (res == 1)
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  emitBlock(ThenBB, CurFunc);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *LHS = RI.Variable;
    Value *RHS =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

    if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      // Fix the CallBack code genereated to use the correct Values for the LHS
      // and RHS
      LHSPtr->replaceUsesWithIf(LHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
    } else {
      // Combine the team-local result into the original variable.
      Value *LHSValue = Builder.CreateLoad(RI.ElementType, LHS, "final.lhs");
      Value *RHSValue = Builder.CreateLoad(RI.ElementType, RHS, "final.rhs");
      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), RHSValue, LHSValue, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);
      Builder.CreateStore(Reduced, LHS, false);
    }
  }
  emitBlock(ExitBB, CurFunc);
  if (ContinuationBlock) {
    Builder.CreateBr(ContinuationBlock);
    Builder.SetInsertPoint(ContinuationBlock);
  }
  // Keep the emitted helpers alive in the final image (llvm.used).
  Config.setEmitLLVMUsed();

  return Builder.saveIP();
}
3771
3773 Type *VoidTy = Type::getVoidTy(M.getContext());
3774 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3775 auto *FuncTy =
3776 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3778 ".omp.reduction.func", &M);
3779}
3780
3782 Function *ReductionFunc,
3784 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3785 Module *Module = ReductionFunc->getParent();
3786 BasicBlock *ReductionFuncBlock =
3787 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3788 Builder.SetInsertPoint(ReductionFuncBlock);
3789 Value *LHSArrayPtr = nullptr;
3790 Value *RHSArrayPtr = nullptr;
3791 if (IsGPU) {
3792 // Need to alloca memory here and deal with the pointers before getting
3793 // LHS/RHS pointers out
3794 //
3795 Argument *Arg0 = ReductionFunc->getArg(0);
3796 Argument *Arg1 = ReductionFunc->getArg(1);
3797 Type *Arg0Type = Arg0->getType();
3798 Type *Arg1Type = Arg1->getType();
3799
3800 Value *LHSAlloca =
3801 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3802 Value *RHSAlloca =
3803 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3804 Value *LHSAddrCast =
3805 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3806 Value *RHSAddrCast =
3807 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3808 Builder.CreateStore(Arg0, LHSAddrCast);
3809 Builder.CreateStore(Arg1, RHSAddrCast);
3810 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3811 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3812 } else {
3813 LHSArrayPtr = ReductionFunc->getArg(0);
3814 RHSArrayPtr = ReductionFunc->getArg(1);
3815 }
3816
3817 unsigned NumReductions = ReductionInfos.size();
3818 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3819
3820 for (auto En : enumerate(ReductionInfos)) {
3821 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3822 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3823 RedArrayTy, LHSArrayPtr, 0, En.index());
3824 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3825 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3826 LHSI8Ptr, RI.Variable->getType());
3827 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3828 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3829 RedArrayTy, RHSArrayPtr, 0, En.index());
3830 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3831 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3832 RHSI8Ptr, RI.PrivateVariable->getType());
3833 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3834 Value *Reduced;
3835 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3836 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3837 if (!AfterIP)
3838 return AfterIP.takeError();
3839
3840 Builder.restoreIP(*AfterIP);
3841 // TODO: Consider flagging an error.
3842 if (!Builder.GetInsertBlock())
3843 return Error::success();
3844
3845 // store is inside of the reduction region when using by-ref
3846 if (!IsByRef[En.index()])
3847 Builder.CreateStore(Reduced, LHSPtr);
3848 }
3849 Builder.CreateRetVoid();
3850 return Error::success();
3851}
3852
3853OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3854 const LocationDescription &Loc, InsertPointTy AllocaIP,
3855 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3856 bool IsNoWait, bool IsTeamsReduction) {
3857 assert(ReductionInfos.size() == IsByRef.size());
3858 if (Config.isGPU())
3859 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3860 IsNoWait, IsTeamsReduction);
3861
3862 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
3863
3864 if (!updateToLocation(Loc))
3865 return InsertPointTy();
3866
3867 if (ReductionInfos.size() == 0)
3868 return Builder.saveIP();
3869
3870 BasicBlock *InsertBlock = Loc.IP.getBlock();
3871 BasicBlock *ContinuationBlock =
3872 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3873 InsertBlock->getTerminator()->eraseFromParent();
3874
3875 // Create and populate array of type-erased pointers to private reduction
3876 // values.
3877 unsigned NumReductions = ReductionInfos.size();
3878 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3879 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
3880 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
3881
3882 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3883
3884 for (auto En : enumerate(ReductionInfos)) {
3885 unsigned Index = En.index();
3886 const ReductionInfo &RI = En.value();
3887 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
3888 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
3889 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
3890 }
3891
3892 // Emit a call to the runtime function that orchestrates the reduction.
3893 // Declare the reduction function in the process.
3894 Type *IndexTy = Builder.getIndexTy(
3895 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3896 Function *Func = Builder.GetInsertBlock()->getParent();
3897 Module *Module = Func->getParent();
3898 uint32_t SrcLocStrSize;
3899 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3900 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
3901 return RI.AtomicReductionGen;
3902 });
3903 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
3904 CanGenerateAtomic
3905 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
3906 : IdentFlag(0));
3907 Value *ThreadId = getOrCreateThreadID(Ident);
3908 Constant *NumVariables = Builder.getInt32(NumReductions);
3909 const DataLayout &DL = Module->getDataLayout();
3910 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
3911 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
3912 Function *ReductionFunc = getFreshReductionFunc(*Module);
3913 Value *Lock = getOMPCriticalRegionLock(".reduction");
3914 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
3915 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
3916 : RuntimeFunction::OMPRTL___kmpc_reduce);
3917 CallInst *ReduceCall =
3918 Builder.CreateCall(ReduceFunc,
3919 {Ident, ThreadId, NumVariables, RedArraySize, RedArray,
3920 ReductionFunc, Lock},
3921 "reduce");
3922
3923 // Create final reduction entry blocks for the atomic and non-atomic case.
3924 // Emit IR that dispatches control flow to one of the blocks based on the
3925 // reduction supporting the atomic mode.
3926 BasicBlock *NonAtomicRedBlock =
3927 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
3928 BasicBlock *AtomicRedBlock =
3929 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
3931 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
3932 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
3933 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
3934
3935 // Populate the non-atomic reduction using the elementwise reduction function.
3936 // This loads the elements from the global and private variables and reduces
3937 // them before storing back the result to the global variable.
3938 Builder.SetInsertPoint(NonAtomicRedBlock);
3939 for (auto En : enumerate(ReductionInfos)) {
3940 const ReductionInfo &RI = En.value();
3941 Type *ValueType = RI.ElementType;
3942 // We have one less load for by-ref case because that load is now inside of
3943 // the reduction region
3944 Value *RedValue = RI.Variable;
3945 if (!IsByRef[En.index()]) {
3946 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3947 "red.value." + Twine(En.index()));
3948 }
3949 Value *PrivateRedValue =
3950 Builder.CreateLoad(ValueType, RI.PrivateVariable,
3951 "red.private.value." + Twine(En.index()));
3952 Value *Reduced;
3953 InsertPointOrErrorTy AfterIP =
3954 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
3955 if (!AfterIP)
3956 return AfterIP.takeError();
3957 Builder.restoreIP(*AfterIP);
3958
3959 if (!Builder.GetInsertBlock())
3960 return InsertPointTy();
3961 // for by-ref case, the load is inside of the reduction region
3962 if (!IsByRef[En.index()])
3963 Builder.CreateStore(Reduced, RI.Variable);
3964 }
3965 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
3966 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
3967 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
3968 Builder.CreateCall(EndReduceFunc, {Ident, ThreadId, Lock});
3969 Builder.CreateBr(ContinuationBlock);
3970
3971 // Populate the atomic reduction using the atomic elementwise reduction
3972 // function. There are no loads/stores here because they will be happening
3973 // inside the atomic elementwise reduction.
3974 Builder.SetInsertPoint(AtomicRedBlock);
3975 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
3976 for (const ReductionInfo &RI : ReductionInfos) {
3977 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
3978 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
3979 if (!AfterIP)
3980 return AfterIP.takeError();
3981 Builder.restoreIP(*AfterIP);
3982 if (!Builder.GetInsertBlock())
3983 return InsertPointTy();
3984 }
3985 Builder.CreateBr(ContinuationBlock);
3986 } else {
3987 Builder.CreateUnreachable();
3988 }
3989
3990 // Populate the outlined reduction function using the elementwise reduction
3991 // function. Partial values are extracted from the type-erased array of
3992 // pointers to private variables.
3993 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
3994 IsByRef, /*isGPU=*/false);
3995 if (Err)
3996 return Err;
3997
3998 if (!Builder.GetInsertBlock())
3999 return InsertPointTy();
4000
4001 Builder.SetInsertPoint(ContinuationBlock);
4002 return Builder.saveIP();
4003}
4004
4005OpenMPIRBuilder::InsertPointOrErrorTy
4006OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4007 BodyGenCallbackTy BodyGenCB,
4008 FinalizeCallbackTy FiniCB) {
4009 if (!updateToLocation(Loc))
4010 return Loc.IP;
4011
4012 Directive OMPD = Directive::OMPD_master;
4013 uint32_t SrcLocStrSize;
4014 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4015 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4016 Value *ThreadId = getOrCreateThreadID(Ident);
4017 Value *Args[] = {Ident, ThreadId};
4018
4019 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4020 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4021
4022 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4023 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
4024
4025 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4026 /*Conditional*/ true, /*hasFinalize*/ true);
4027}
4028
4029OpenMPIRBuilder::InsertPointOrErrorTy
4030OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4031 BodyGenCallbackTy BodyGenCB,
4032 FinalizeCallbackTy FiniCB, Value *Filter) {
4033 if (!updateToLocation(Loc))
4034 return Loc.IP;
4035
4036 Directive OMPD = Directive::OMPD_masked;
4037 uint32_t SrcLocStrSize;
4038 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4039 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4040 Value *ThreadId = getOrCreateThreadID(Ident);
4041 Value *Args[] = {Ident, ThreadId, Filter};
4042 Value *ArgsEnd[] = {Ident, ThreadId};
4043
4044 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4045 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
4046
4047 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4048 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, ArgsEnd);
4049
4050 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4051 /*Conditional*/ true, /*hasFinalize*/ true);
4052}
4053
4055 llvm::FunctionCallee Callee,
4057 const llvm::Twine &Name) {
4058 llvm::CallInst *Call = Builder.CreateCall(
4059 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4060 Call->setDoesNotThrow();
4061 return Call;
4062}
4063
4064// Expects input basic block is dominated by BeforeScanBB.
4065// Once Scan directive is encountered, the code after scan directive should be
4066// dominated by AfterScanBB. Scan directive splits the code sequence to
4067// scan and input phase. Based on whether inclusive or exclusive
4068// clause is used in the scan directive and whether input loop or scan loop
4069// is lowered, it adds jumps to input and scan phase. First Scan loop is the
4070// input loop and second is the scan loop. The code generated handles only
4071// inclusive scans now.
4072OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4073 const LocationDescription &Loc, InsertPointTy AllocaIP,
4074 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4075 bool IsInclusive, ScanInfo *ScanRedInfo) {
4076 if (ScanRedInfo->OMPFirstScanLoop) {
4077 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4078 ScanVarsType, ScanRedInfo);
4079 if (Err)
4080 return Err;
4081 }
4082 if (!updateToLocation(Loc))
4083 return Loc.IP;
4084
4085 llvm::Value *IV = ScanRedInfo->IV;
4086
4087 if (ScanRedInfo->OMPFirstScanLoop) {
4088 // Emit buffer[i] = red; at the end of the input phase.
4089 for (size_t i = 0; i < ScanVars.size(); i++) {
4090 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4091 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4092 Type *DestTy = ScanVarsType[i];
4093 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4094 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4095
4096 Builder.CreateStore(Src, Val);
4097 }
4098 }
4099 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4100 emitBlock(ScanRedInfo->OMPScanDispatch,
4101 Builder.GetInsertBlock()->getParent());
4102
4103 if (!ScanRedInfo->OMPFirstScanLoop) {
4104 IV = ScanRedInfo->IV;
4105 // Emit red = buffer[i]; at the entrance to the scan phase.
4106 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4107 for (size_t i = 0; i < ScanVars.size(); i++) {
4108 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4109 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4110 Type *DestTy = ScanVarsType[i];
4111 Value *SrcPtr =
4112 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4113 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4114 Builder.CreateStore(Src, ScanVars[i]);
4115 }
4116 }
4117
4118 // TODO: Update it to CreateBr and remove dead blocks
4119 llvm::Value *CmpI = Builder.getInt1(true);
4120 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4121 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4122 ScanRedInfo->OMPAfterScanBlock);
4123 } else {
4124 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4125 ScanRedInfo->OMPBeforeScanBlock);
4126 }
4127 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4128 Builder.GetInsertBlock()->getParent());
4129 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4130 return Builder.saveIP();
4131}
4132
4133Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4134 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4135 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4136
4137 Builder.restoreIP(AllocaIP);
4138 // Create the shared pointer at alloca IP.
4139 for (size_t i = 0; i < ScanVars.size(); i++) {
4140 llvm::Value *BuffPtr =
4141 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4142 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4143 }
4144
4145 // Allocate temporary buffer by master thread
4146 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4147 InsertPointTy CodeGenIP) -> Error {
4148 Builder.restoreIP(CodeGenIP);
4149 Value *AllocSpan =
4150 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4151 for (size_t i = 0; i < ScanVars.size(); i++) {
4152 Type *IntPtrTy = Builder.getInt32Ty();
4153 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4154 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4155 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4156 AllocSpan, nullptr, "arr");
4157 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4158 }
4159 return Error::success();
4160 };
4161 // TODO: Perform finalization actions for variables. This has to be
4162 // called for variables which have destructors/finalizers.
4163 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4164
4165 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4166 llvm::Value *FilterVal = Builder.getInt32(0);
4167 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4168 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4169
4170 if (!AfterIP)
4171 return AfterIP.takeError();
4172 Builder.restoreIP(*AfterIP);
4173 BasicBlock *InputBB = Builder.GetInsertBlock();
4174 if (InputBB->getTerminator())
4175 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4176 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4177 if (!AfterIP)
4178 return AfterIP.takeError();
4179 Builder.restoreIP(*AfterIP);
4180
4181 return Error::success();
4182}
4183
4184Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4185 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4186 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4187 InsertPointTy CodeGenIP) -> Error {
4188 Builder.restoreIP(CodeGenIP);
4189 for (ReductionInfo RedInfo : ReductionInfos) {
4190 Value *PrivateVar = RedInfo.PrivateVariable;
4191 Value *OrigVar = RedInfo.Variable;
4192 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4193 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4194
4195 Type *SrcTy = RedInfo.ElementType;
4196 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4197 "arrayOffset");
4198 Value *Src = Builder.CreateLoad(SrcTy, Val);
4199
4200 Builder.CreateStore(Src, OrigVar);
4201 Builder.CreateFree(Buff);
4202 }
4203 return Error::success();
4204 };
4205 // TODO: Perform finalization actions for variables. This has to be
4206 // called for variables which have destructors/finalizers.
4207 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4208
4209 if (ScanRedInfo->OMPScanFinish->getTerminator())
4210 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4211 else
4212 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4213
4214 llvm::Value *FilterVal = Builder.getInt32(0);
4215 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4216 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4217
4218 if (!AfterIP)
4219 return AfterIP.takeError();
4220 Builder.restoreIP(*AfterIP);
4221 BasicBlock *InputBB = Builder.GetInsertBlock();
4222 if (InputBB->getTerminator())
4223 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4224 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4225 if (!AfterIP)
4226 return AfterIP.takeError();
4227 Builder.restoreIP(*AfterIP);
4228 return Error::success();
4229}
4230
4231OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4232 const LocationDescription &Loc,
4234 ScanInfo *ScanRedInfo) {
4235
4236 if (!updateToLocation(Loc))
4237 return Loc.IP;
4238 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4239 InsertPointTy CodeGenIP) -> Error {
4240 Builder.restoreIP(CodeGenIP);
4241 Function *CurFn = Builder.GetInsertBlock()->getParent();
4242 // for (int k = 0; k <= ceil(log2(n)); ++k)
4243 llvm::BasicBlock *LoopBB =
4244 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4245 llvm::BasicBlock *ExitBB =
4246 splitBB(Builder, false, "omp.outer.log.scan.exit");
4248 Builder.GetInsertBlock()->getModule(),
4249 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4250 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4251 llvm::Value *Arg =
4252 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4253 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4255 Builder.GetInsertBlock()->getModule(),
4256 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4257 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4258 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4259 llvm::Value *NMin1 = Builder.CreateNUWSub(
4260 ScanRedInfo->Span,
4261 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4262 Builder.SetInsertPoint(InputBB);
4263 Builder.CreateBr(LoopBB);
4264 emitBlock(LoopBB, CurFn);
4265 Builder.SetInsertPoint(LoopBB);
4266
4267 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4268 // size pow2k = 1;
4269 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4270 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4271 InputBB);
4272 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4273 InputBB);
4274 // for (size i = n - 1; i >= 2 ^ k; --i)
4275 // tmp[i] op= tmp[i-pow2k];
4276 llvm::BasicBlock *InnerLoopBB =
4277 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4278 llvm::BasicBlock *InnerExitBB =
4279 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4280 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4281 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4282 emitBlock(InnerLoopBB, CurFn);
4283 Builder.SetInsertPoint(InnerLoopBB);
4284 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4285 IVal->addIncoming(NMin1, LoopBB);
4286 for (ReductionInfo RedInfo : ReductionInfos) {
4287 Value *ReductionVal = RedInfo.PrivateVariable;
4288 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4289 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4290 Type *DestTy = RedInfo.ElementType;
4291 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4292 Value *LHSPtr =
4293 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4294 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4295 Value *RHSPtr =
4296 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4297 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4298 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4300 InsertPointOrErrorTy AfterIP =
4301 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4302 if (!AfterIP)
4303 return AfterIP.takeError();
4304 Builder.CreateStore(Result, LHSPtr);
4305 }
4306 llvm::Value *NextIVal = Builder.CreateNUWSub(
4307 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4308 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4309 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4310 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4311 emitBlock(InnerExitBB, CurFn);
4312 llvm::Value *Next = Builder.CreateNUWAdd(
4313 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4314 Counter->addIncoming(Next, Builder.GetInsertBlock());
4315 // pow2k <<= 1;
4316 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4317 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4318 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4319 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4320 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4321 return Error::success();
4322 };
4323
4324 // TODO: Perform finalization actions for variables. This has to be
4325 // called for variables which have destructors/finalizers.
4326 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4327
4328 llvm::Value *FilterVal = Builder.getInt32(0);
4329 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4330 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4331
4332 if (!AfterIP)
4333 return AfterIP.takeError();
4334 Builder.restoreIP(*AfterIP);
4335 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4336
4337 if (!AfterIP)
4338 return AfterIP.takeError();
4339 Builder.restoreIP(*AfterIP);
4340 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4341 if (Err)
4342 return Err;
4343
4344 return AfterIP;
4345}
4346
4347Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4348 llvm::function_ref<Error()> InputLoopGen,
4349 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4350 ScanInfo *ScanRedInfo) {
4351
4352 {
4353 // Emit loop with input phase:
4354 // for (i: 0..<num_iters>) {
4355 // <input phase>;
4356 // buffer[i] = red;
4357 // }
4358 ScanRedInfo->OMPFirstScanLoop = true;
4359 Error Err = InputLoopGen();
4360 if (Err)
4361 return Err;
4362 }
4363 {
4364 // Emit loop with scan phase:
4365 // for (i: 0..<num_iters>) {
4366 // red = buffer[i];
4367 // <scan phase>;
4368 // }
4369 ScanRedInfo->OMPFirstScanLoop = false;
4370 Error Err = ScanLoopGen(Builder.saveIP());
4371 if (Err)
4372 return Err;
4373 }
4374 return Error::success();
4375}
4376
4377void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4378 Function *Fun = Builder.GetInsertBlock()->getParent();
4379 ScanRedInfo->OMPScanDispatch =
4380 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4381 ScanRedInfo->OMPAfterScanBlock =
4382 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4383 ScanRedInfo->OMPBeforeScanBlock =
4384 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4385 ScanRedInfo->OMPScanLoopExit =
4386 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4387}
4388CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4389 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4390 BasicBlock *PostInsertBefore, const Twine &Name) {
4391 Module *M = F->getParent();
4392 LLVMContext &Ctx = M->getContext();
4393 Type *IndVarTy = TripCount->getType();
4394
4395 // Create the basic block structure.
4396 BasicBlock *Preheader =
4397 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4398 BasicBlock *Header =
4399 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4400 BasicBlock *Cond =
4401 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4402 BasicBlock *Body =
4403 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4404 BasicBlock *Latch =
4405 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4406 BasicBlock *Exit =
4407 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4408 BasicBlock *After =
4409 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4410
4411 // Use specified DebugLoc for new instructions.
4412 Builder.SetCurrentDebugLocation(DL);
4413
4414 Builder.SetInsertPoint(Preheader);
4415 Builder.CreateBr(Header);
4416
4417 Builder.SetInsertPoint(Header);
4418 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4419 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4420 Builder.CreateBr(Cond);
4421
4422 Builder.SetInsertPoint(Cond);
4423 Value *Cmp =
4424 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4425 Builder.CreateCondBr(Cmp, Body, Exit);
4426
4427 Builder.SetInsertPoint(Body);
4428 Builder.CreateBr(Latch);
4429
4430 Builder.SetInsertPoint(Latch);
4431 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4432 "omp_" + Name + ".next", /*HasNUW=*/true);
4433 Builder.CreateBr(Header);
4434 IndVarPHI->addIncoming(Next, Latch);
4435
4436 Builder.SetInsertPoint(Exit);
4437 Builder.CreateBr(After);
4438
4439 // Remember and return the canonical control flow.
4440 LoopInfos.emplace_front();
4441 CanonicalLoopInfo *CL = &LoopInfos.front();
4442
4443 CL->Header = Header;
4444 CL->Cond = Cond;
4445 CL->Latch = Latch;
4446 CL->Exit = Exit;
4447
4448#ifndef NDEBUG
4449 CL->assertOK();
4450#endif
4451 return CL;
4452}
4453
4455OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4456 LoopBodyGenCallbackTy BodyGenCB,
4457 Value *TripCount, const Twine &Name) {
4458 BasicBlock *BB = Loc.IP.getBlock();
4459 BasicBlock *NextBB = BB->getNextNode();
4460
4461 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4462 NextBB, NextBB, Name);
4463 BasicBlock *After = CL->getAfter();
4464
4465 // If location is not set, don't connect the loop.
4466 if (updateToLocation(Loc)) {
4467 // Split the loop at the insertion point: Branch to the preheader and move
4468 // every following instruction to after the loop (the After BB). Also, the
4469 // new successor is the loop's after block.
4470 spliceBB(Builder, After, /*CreateBranch=*/false);
4471 Builder.CreateBr(CL->getPreheader());
4472 }
4473
4474 // Emit the body content. We do it after connecting the loop to the CFG to
4475 // avoid that the callback encounters degenerate BBs.
4476 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4477 return Err;
4478
4479#ifndef NDEBUG
4480 CL->assertOK();
4481#endif
4482 return CL;
4483}
4484
4485Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4486 ScanInfos.emplace_front();
4487 ScanInfo *Result = &ScanInfos.front();
4488 return Result;
4489}
4490
4492OpenMPIRBuilder::createCanonicalScanLoops(
4493 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4494 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4495 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4496 LocationDescription ComputeLoc =
4497 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4498 updateToLocation(ComputeLoc);
4499
4501
4502 Value *TripCount = calculateCanonicalLoopTripCount(
4503 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4504 ScanRedInfo->Span = TripCount;
4505 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4506 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4507
4508 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4509 Builder.restoreIP(CodeGenIP);
4510 ScanRedInfo->IV = IV;
4511 createScanBBs(ScanRedInfo);
4512 BasicBlock *InputBlock = Builder.GetInsertBlock();
4513 Instruction *Terminator = InputBlock->getTerminator();
4514 assert(Terminator->getNumSuccessors() == 1);
4515 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4516 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4517 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4518 Builder.GetInsertBlock()->getParent());
4519 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4520 emitBlock(ScanRedInfo->OMPScanLoopExit,
4521 Builder.GetInsertBlock()->getParent());
4522 Builder.CreateBr(ContinueBlock);
4523 Builder.SetInsertPoint(
4524 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4525 return BodyGenCB(Builder.saveIP(), IV);
4526 };
4527
4528 const auto &&InputLoopGen = [&]() -> Error {
4529 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4530 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4531 ComputeIP, Name, true, ScanRedInfo);
4532 if (!LoopInfo)
4533 return LoopInfo.takeError();
4534 Result.push_back(*LoopInfo);
4535 Builder.restoreIP((*LoopInfo)->getAfterIP());
4536 return Error::success();
4537 };
4538 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4540 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4541 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4542 if (!LoopInfo)
4543 return LoopInfo.takeError();
4544 Result.push_back(*LoopInfo);
4545 Builder.restoreIP((*LoopInfo)->getAfterIP());
4546 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4547 return Error::success();
4548 };
4549 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4550 if (Err)
4551 return Err;
4552 return Result;
4553}
4554
4555Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4556 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4557 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4558
4559 // Consider the following difficulties (assuming 8-bit signed integers):
4560 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4561 // DO I = 1, 100, 50
4562 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
4563 // DO I = 100, 0, -128
4564
4565 // Start, Stop and Step must be of the same integer type.
4566 auto *IndVarTy = cast<IntegerType>(Start->getType());
4567 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4568 assert(IndVarTy == Step->getType() && "Step type mismatch");
4569
4570 updateToLocation(Loc);
4571
4572 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4573 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4574
4575 // Like Step, but always positive.
4576 Value *Incr = Step;
4577
4578 // Distance between Start and Stop; always positive.
4579 Value *Span;
4580
4581 // Condition whether there are no iterations are executed at all, e.g. because
4582 // UB < LB.
4583 Value *ZeroCmp;
4584
4585 if (IsSigned) {
4586 // Ensure that increment is positive. If not, negate and invert LB and UB.
4587 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4588 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4589 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4590 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4591 Span = Builder.CreateSub(UB, LB, "", false, true);
4592 ZeroCmp = Builder.CreateICmp(
4593 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4594 } else {
4595 Span = Builder.CreateSub(Stop, Start, "", true);
4596 ZeroCmp = Builder.CreateICmp(
4597 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4598 }
4599
4600 Value *CountIfLooping;
4601 if (InclusiveStop) {
4602 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4603 } else {
4604 // Avoid incrementing past stop since it could overflow.
4605 Value *CountIfTwo = Builder.CreateAdd(
4606 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4607 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4608 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4609 }
4610
4611 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4612 "omp_" + Name + ".tripcount");
4613}
4614
4615Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4616 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4617 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4618 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4619 ScanInfo *ScanRedInfo) {
4620 LocationDescription ComputeLoc =
4621 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4622
4623 Value *TripCount = calculateCanonicalLoopTripCount(
4624 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4625
4626 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4627 Builder.restoreIP(CodeGenIP);
4628 Value *Span = Builder.CreateMul(IV, Step);
4629 Value *IndVar = Builder.CreateAdd(Span, Start);
4630 if (InScan)
4631 ScanRedInfo->IV = IndVar;
4632 return BodyGenCB(Builder.saveIP(), IndVar);
4633 };
4634 LocationDescription LoopLoc =
4635 ComputeIP.isSet()
4636 ? Loc
4637 : LocationDescription(Builder.saveIP(),
4638 Builder.getCurrentDebugLocation());
4639 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4640}
4641
4642// Returns an LLVM function to call for initializing loop bounds using OpenMP
4643// static scheduling for composite `distribute parallel for` depending on
4644// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4645// integers as unsigned similarly to CanonicalLoopInfo.
4646static FunctionCallee
4648 OpenMPIRBuilder &OMPBuilder) {
4649 unsigned Bitwidth = Ty->getIntegerBitWidth();
4650 if (Bitwidth == 32)
4651 return OMPBuilder.getOrCreateRuntimeFunction(
4652 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4653 if (Bitwidth == 64)
4654 return OMPBuilder.getOrCreateRuntimeFunction(
4655 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4656 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4657}
4658
4659// Returns an LLVM function to call for initializing loop bounds using OpenMP
4660// static scheduling depending on `type`. Only i32 and i64 are supported by the
4661// runtime. Always interpret integers as unsigned similarly to
4662// CanonicalLoopInfo.
4664 OpenMPIRBuilder &OMPBuilder) {
4665 unsigned Bitwidth = Ty->getIntegerBitWidth();
4666 if (Bitwidth == 32)
4667 return OMPBuilder.getOrCreateRuntimeFunction(
4668 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4669 if (Bitwidth == 64)
4670 return OMPBuilder.getOrCreateRuntimeFunction(
4671 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4672 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4673}
4674
4675OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4676 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4677 WorksharingLoopType LoopType, bool NeedsBarrier) {
4678 assert(CLI->isValid() && "Requires a valid canonical loop");
4679 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4680 "Require dedicated allocate IP");
4681
4682 // Set up the source location value for OpenMP runtime.
4683 Builder.restoreIP(CLI->getPreheaderIP());
4684 Builder.SetCurrentDebugLocation(DL);
4685
4686 uint32_t SrcLocStrSize;
4687 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4688 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4689
4690 // Declare useful OpenMP runtime functions.
4691 Value *IV = CLI->getIndVar();
4692 Type *IVTy = IV->getType();
4693 FunctionCallee StaticInit =
4694 LoopType == WorksharingLoopType::DistributeForStaticLoop
4695 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4696 : getKmpcForStaticInitForType(IVTy, M, *this);
4697 FunctionCallee StaticFini =
4698 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4699
4700 // Allocate space for computed loop bounds as expected by the "init" function.
4701 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4702
4703 Type *I32Type = Type::getInt32Ty(M.getContext());
4704 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4705 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4706 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4707 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4708 CLI->setLastIter(PLastIter);
4709
4710 // At the end of the preheader, prepare for calling the "init" function by
4711 // storing the current loop bounds into the allocated space. A canonical loop
4712 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4713 // and produces an inclusive upper bound.
4714 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4715 Constant *Zero = ConstantInt::get(IVTy, 0);
4716 Constant *One = ConstantInt::get(IVTy, 1);
4717 Builder.CreateStore(Zero, PLowerBound);
4718 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4719 Builder.CreateStore(UpperBound, PUpperBound);
4720 Builder.CreateStore(One, PStride);
4721
4722 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4723
4724 OMPScheduleType SchedType =
4725 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4726 ? OMPScheduleType::OrderedDistribute
4728 Constant *SchedulingType =
4729 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4730
4731 // Call the "init" function and update the trip count of the loop with the
4732 // value it produced.
4734 {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound, PUpperBound});
4735 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4736 Value *PDistUpperBound =
4737 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4738 Args.push_back(PDistUpperBound);
4739 }
4740 Args.append({PStride, One, Zero});
4741 Builder.CreateCall(StaticInit, Args);
4742 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4743 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4744 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4745 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4746 CLI->setTripCount(TripCount);
4747
4748 // Update all uses of the induction variable except the one in the condition
4749 // block that compares it with the actual upper bound, and the increment in
4750 // the latch block.
4751
4752 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4753 Builder.SetInsertPoint(CLI->getBody(),
4754 CLI->getBody()->getFirstInsertionPt());
4755 Builder.SetCurrentDebugLocation(DL);
4756 return Builder.CreateAdd(OldIV, LowerBound);
4757 });
4758
4759 // In the "exit" block, call the "fini" function.
4760 Builder.SetInsertPoint(CLI->getExit(),
4761 CLI->getExit()->getTerminator()->getIterator());
4762 Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});
4763
4764 // Add the barrier if requested.
4765 if (NeedsBarrier) {
4766 InsertPointOrErrorTy BarrierIP =
4767 createBarrier(LocationDescription(Builder.saveIP(), DL),
4768 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4769 /* CheckCancelFlag */ false);
4770 if (!BarrierIP)
4771 return BarrierIP.takeError();
4772 }
4773
4774 InsertPointTy AfterIP = CLI->getAfterIP();
4775 CLI->invalidate();
4776
4777 return AfterIP;
4778}
4779
/// Lower the canonical loop \p CLI into a statically-chunked worksharing loop
/// (i.e. schedule(static, ChunkSize)). An outer "dispatch" loop is created
/// that enumerates the chunks assigned to the current thread (as computed by
/// __kmpc_for_static_init with the UnorderedStaticChunked schedule), and the
/// original loop is rewired to become the inner per-chunk loop. All chunk
/// arithmetic is done in an internal i32/i64 type wide enough for the
/// original induction variable (at most 64 bits).
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(DebugLoc DL,
                                                 CanonicalLoopInfo *CLI,
                                                 InsertPointTy AllocaIP,
                                                 bool NeedsBarrier,
                                                 Value *ChunkSize) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(ChunkSize && "Chunk size is required");

  LLVMContext &Ctx = CLI->getFunction()->getContext();
  Value *IV = CLI->getIndVar();
  Value *OrigTripCount = CLI->getTripCount();
  Type *IVTy = IV->getType();
  assert(IVTy->getIntegerBitWidth() <= 64 &&
         "Max supported tripcount bitwidth is 64 bits");
  // The runtime only provides 32-bit and 64-bit entry points; promote any
  // narrower induction variable to the nearest supported width.
  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
                                                        : Type::getInt64Ty(Ctx);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
  Constant *One = ConstantInt::get(InternalIVTy, 1);

  // Declare useful OpenMP runtime functions.
  FunctionCallee StaticInit =
      getKmpcForStaticInitForType(InternalIVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Builder.SetCurrentDebugLocation(DL);
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
  Value *PUpperBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
  CLI->setLastIter(PLastIter);

  // Set up the source location value for the OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // TODO: Detect overflow in ubsan or max-out with current tripcount.
  Value *CastedChunkSize =
      Builder.CreateZExtOrTrunc(ChunkSize, InternalIVTy, "chunksize");
  Value *CastedTripCount =
      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

  Constant *SchedulingType = ConstantInt::get(
      I32Type, static_cast<int>(OMPScheduleType::UnorderedStaticChunked));
  // "init" expects and produces an inclusive upper bound, hence tripcount - 1.
  Builder.CreateStore(Zero, PLowerBound);
  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
  Builder.CreateStore(OrigUpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadNum = getOrCreateThreadID(SrcLoc);
  Builder.CreateCall(StaticInit,
                     {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
                      /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
                      /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
                      /*pstride=*/PStride, /*incr=*/One,
                      /*chunk=*/CastedChunkSize});

  // Load values written by the "init" function.
  Value *FirstChunkStart =
      Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
  Value *FirstChunkStop =
      Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
  Value *ChunkRange =
      Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
  Value *NextChunkStride =
      Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");

  // Create outer "dispatch" loop for enumerating the chunks.
  BasicBlock *DispatchEnter = splitBB(Builder, true);
  Value *DispatchCounter;

  // It is safe to assume this didn't return an error because the callback
  // passed into createCanonicalLoop is the only possible error source, and it
  // always returns success.
  CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
      {Builder.saveIP(), DL},
      [&](InsertPointTy BodyIP, Value *Counter) {
        DispatchCounter = Counter;
        return Error::success();
      },
      FirstChunkStart, CastedTripCount, NextChunkStride,
      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
      "dispatch"));

  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
  // not have to preserve the canonical invariant.
  BasicBlock *DispatchBody = DispatchCLI->getBody();
  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
  BasicBlock *DispatchExit = DispatchCLI->getExit();
  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
  DispatchCLI->invalidate();

  // Rewire the original loop to become the chunk loop inside the dispatch loop.
  redirectTo(DispatchAfter, CLI->getAfter(), DL);
  redirectTo(CLI->getExit(), DispatchLatch, DL);
  redirectTo(DispatchBody, DispatchEnter, DL);

  // Prepare the prolog of the chunk loop.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // Compute the number of iterations of the chunk loop. The last chunk may be
  // shorter than ChunkRange, hence the select against the remaining count.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
  Value *IsLastChunk =
      Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
  Value *CountUntilOrigTripCount =
      Builder.CreateSub(CastedTripCount, DispatchCounter);
  Value *ChunkTripCount = Builder.CreateSelect(
      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
  Value *BackcastedChunkTC =
      Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
  CLI->setTripCount(BackcastedChunkTC);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  Value *BackcastedDispatchCounter =
      Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
  CLI->mapIndVar([&](Instruction *) -> Value * {
    Builder.restoreIP(CLI->getBodyIP());
    return Builder.CreateAdd(IV, BackcastedDispatchCounter);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier) {
    InsertPointOrErrorTy AfterIP =
        createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
                      /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
    if (!AfterIP)
      return AfterIP.takeError();
  }

#ifndef NDEBUG
  // Even though we currently do not support applying additional methods to it,
  // the chunk loop should remain a canonical loop.
  CLI->assertOK();
#endif

  return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
}
4937
4938// Returns an LLVM function to call for executing an OpenMP static worksharing
4939// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
4940// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
4941static FunctionCallee
4942getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
4943 WorksharingLoopType LoopType) {
4944 unsigned Bitwidth = Ty->getIntegerBitWidth();
4945 Module &M = OMPBuilder->M;
4946 switch (LoopType) {
4947 case WorksharingLoopType::ForStaticLoop:
4948 if (Bitwidth == 32)
4949 return OMPBuilder->getOrCreateRuntimeFunction(
4950 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
4951 if (Bitwidth == 64)
4952 return OMPBuilder->getOrCreateRuntimeFunction(
4953 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
4954 break;
4955 case WorksharingLoopType::DistributeStaticLoop:
4956 if (Bitwidth == 32)
4957 return OMPBuilder->getOrCreateRuntimeFunction(
4958 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
4959 if (Bitwidth == 64)
4960 return OMPBuilder->getOrCreateRuntimeFunction(
4961 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
4962 break;
4963 case WorksharingLoopType::DistributeForStaticLoop:
4964 if (Bitwidth == 32)
4965 return OMPBuilder->getOrCreateRuntimeFunction(
4966 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
4967 if (Bitwidth == 64)
4968 return OMPBuilder->getOrCreateRuntimeFunction(
4969 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
4970 break;
4971 }
4972 if (Bitwidth != 32 && Bitwidth != 64) {
4973 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
4974 }
4975 llvm_unreachable("Unknown type of OpenMP worksharing loop");
4976}
4977
4978// Inserts a call to proper OpenMP Device RTL function which handles
4979// loop worksharing.
4980static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
4981 WorksharingLoopType LoopType,
4982 BasicBlock *InsertBlock, Value *Ident,
4983 Value *LoopBodyArg, Value *TripCount,
4984 Function &LoopBodyFn, bool NoLoop) {
4985 Type *TripCountTy = TripCount->getType();
4986 Module &M = OMPBuilder->M;
4987 IRBuilder<> &Builder = OMPBuilder->Builder;
4988 FunctionCallee RTLFn =
4989 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
4990 SmallVector<Value *, 8> RealArgs;
4991 RealArgs.push_back(Ident);
4992 RealArgs.push_back(&LoopBodyFn);
4993 RealArgs.push_back(LoopBodyArg);
4994 RealArgs.push_back(TripCount);
4995 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
4996 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
4997 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
4998 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
4999 Builder.CreateCall(RTLFn, RealArgs);
5000 return;
5001 }
5002 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5003 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5004 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5005 Value *NumThreads = Builder.CreateCall(RTLNumThreads, {});
5006
5007 RealArgs.push_back(
5008 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5009 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5010 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5011 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5012 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5013 } else {
5014 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5015 }
5016
5017 Builder.CreateCall(RTLFn, RealArgs);
5018}
5019
5021 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5022 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5023 WorksharingLoopType LoopType, bool NoLoop) {
5024 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5025 BasicBlock *Preheader = CLI->getPreheader();
5026 Value *TripCount = CLI->getTripCount();
5027
5028 // After loop body outling, the loop body contains only set up
5029 // of loop body argument structure and the call to the outlined
5030 // loop body function. Firstly, we need to move setup of loop body args
5031 // into loop preheader.
5032 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5033 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5034
5035 // The next step is to remove the whole loop. We do not it need anymore.
5036 // That's why make an unconditional branch from loop preheader to loop
5037 // exit block
5038 Builder.restoreIP({Preheader, Preheader->end()});
5039 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5040 Preheader->getTerminator()->eraseFromParent();
5041 Builder.CreateBr(CLI->getExit());
5042
5043 // Delete dead loop blocks
5044 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5045 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5046 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5047 CleanUpInfo.EntryBB = CLI->getHeader();
5048 CleanUpInfo.ExitBB = CLI->getExit();
5049 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5050 DeleteDeadBlocks(BlocksToBeRemoved);
5051
5052 // Find the instruction which corresponds to loop body argument structure
5053 // and remove the call to loop body function instruction.
5054 Value *LoopBodyArg;
5055 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5056 assert(OutlinedFnUser &&
5057 "Expected unique undroppable user of outlined function");
5058 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5059 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5060 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5061 "Expected outlined function call to be located in loop preheader");
5062 // Check in case no argument structure has been passed.
5063 if (OutlinedFnCallInstruction->arg_size() > 1)
5064 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5065 else
5066 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5067 OutlinedFnCallInstruction->eraseFromParent();
5068
5069 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5070 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5071
5072 for (auto &ToBeDeletedItem : ToBeDeleted)
5073 ToBeDeletedItem->eraseFromParent();
5074 CLI->invalidate();
5075}
5076
5077OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5078 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5079 WorksharingLoopType LoopType, bool NoLoop) {
5080 uint32_t SrcLocStrSize;
5081 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5082 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5083
5084 OutlineInfo OI;
5085 OI.OuterAllocaBB = CLI->getPreheader();
5086 Function *OuterFn = CLI->getPreheader()->getParent();
5087
5088 // Instructions which need to be deleted at the end of code generation
5090
5091 OI.OuterAllocaBB = AllocaIP.getBlock();
5092
5093 // Mark the body loop as region which needs to be extracted
5094 OI.EntryBB = CLI->getBody();
5095 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5096 "omp.prelatch", true);
5097
5098 // Prepare loop body for extraction
5099 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5100
5101 // Insert new loop counter variable which will be used only in loop
5102 // body.
5103 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5104 Instruction *NewLoopCntLoad =
5105 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5106 // New loop counter instructions are redundant in the loop preheader when
5107 // code generation for workshare loop is finshed. That's why mark them as
5108 // ready for deletion.
5109 ToBeDeleted.push_back(NewLoopCntLoad);
5110 ToBeDeleted.push_back(NewLoopCnt);
5111
5112 // Analyse loop body region. Find all input variables which are used inside
5113 // loop body region.
5114 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5116 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5117
5118 CodeExtractorAnalysisCache CEAC(*OuterFn);
5119 CodeExtractor Extractor(Blocks,
5120 /* DominatorTree */ nullptr,
5121 /* AggregateArgs */ true,
5122 /* BlockFrequencyInfo */ nullptr,
5123 /* BranchProbabilityInfo */ nullptr,
5124 /* AssumptionCache */ nullptr,
5125 /* AllowVarArgs */ true,
5126 /* AllowAlloca */ true,
5127 /* AllocationBlock */ CLI->getPreheader(),
5128 /* Suffix */ ".omp_wsloop",
5129 /* AggrArgsIn0AddrSpace */ true);
5130
5131 BasicBlock *CommonExit = nullptr;
5132 SetVector<Value *> SinkingCands, HoistingCands;
5133
5134 // Find allocas outside the loop body region which are used inside loop
5135 // body
5136 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5137
5138 // We need to model loop body region as the function f(cnt, loop_arg).
5139 // That's why we replace loop induction variable by the new counter
5140 // which will be one of loop body function argument
5141 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5142 CLI->getIndVar()->user_end());
5143 for (auto Use : Users) {
5144 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5145 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5146 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5147 }
5148 }
5149 }
5150 // Make sure that loop counter variable is not merged into loop body
5151 // function argument structure and it is passed as separate variable
5152 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5153
5154 // PostOutline CB is invoked when loop body function is outlined and
5155 // loop body is replaced by call to outlined function. We need to add
5156 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5157 // function will handle loop control logic.
5158 //
5159 OI.PostOutlineCB = [=, ToBeDeletedVec =
5160 std::move(ToBeDeleted)](Function &OutlinedFn) {
5161 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5162 LoopType, NoLoop);
5163 };
5164 addOutlineInfo(std::move(OI));
5165 return CLI->getAfterIP();
5166}
5167
5168OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5169 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5170 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5171 bool HasSimdModifier, bool HasMonotonicModifier,
5172 bool HasNonmonotonicModifier, bool HasOrderedClause,
5173 WorksharingLoopType LoopType, bool NoLoop) {
5174 if (Config.isTargetDevice())
5175 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5176 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5177 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5178 HasNonmonotonicModifier, HasOrderedClause);
5179
5180 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5181 OMPScheduleType::ModifierOrdered;
5182 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5183 case OMPScheduleType::BaseStatic:
5184 assert(!ChunkSize && "No chunk size with static-chunked schedule");
5185 if (IsOrdered)
5186 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5187 NeedsBarrier, ChunkSize);
5188 // FIXME: Monotonicity ignored?
5189 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier);
5190
5191 case OMPScheduleType::BaseStaticChunked:
5192 if (IsOrdered)
5193 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5194 NeedsBarrier, ChunkSize);
5195 // FIXME: Monotonicity ignored?
5196 return applyStaticChunkedWorkshareLoop(DL, CLI, AllocaIP, NeedsBarrier,
5197 ChunkSize);
5198
5199 case OMPScheduleType::BaseRuntime:
5200 case OMPScheduleType::BaseAuto:
5201 case OMPScheduleType::BaseGreedy:
5202 case OMPScheduleType::BaseBalanced:
5203 case OMPScheduleType::BaseSteal:
5204 case OMPScheduleType::BaseGuidedSimd:
5205 case OMPScheduleType::BaseRuntimeSimd:
5206 assert(!ChunkSize &&
5207 "schedule type does not support user-defined chunk sizes");
5208 [[fallthrough]];
5209 case OMPScheduleType::BaseDynamicChunked:
5210 case OMPScheduleType::BaseGuidedChunked:
5211 case OMPScheduleType::BaseGuidedIterativeChunked:
5212 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5213 case OMPScheduleType::BaseStaticBalancedChunked:
5214 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5215 NeedsBarrier, ChunkSize);
5216
5217 default:
5218 llvm_unreachable("Unknown/unimplemented schedule kind");
5219 }
5220}
5221
5222/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5223/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5224/// the runtime. Always interpret integers as unsigned similarly to
5225/// CanonicalLoopInfo.
5226static FunctionCallee
5227getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5228 unsigned Bitwidth = Ty->getIntegerBitWidth();
5229 if (Bitwidth == 32)
5230 return OMPBuilder.getOrCreateRuntimeFunction(
5231 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5232 if (Bitwidth == 64)
5233 return OMPBuilder.getOrCreateRuntimeFunction(
5234 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5235 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5236}
5237
5238/// Returns an LLVM function to call for updating the next loop using OpenMP
5239/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5240/// the runtime. Always interpret integers as unsigned similarly to
5241/// CanonicalLoopInfo.
5242static FunctionCallee
5243getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5244 unsigned Bitwidth = Ty->getIntegerBitWidth();
5245 if (Bitwidth == 32)
5246 return OMPBuilder.getOrCreateRuntimeFunction(
5247 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5248 if (Bitwidth == 64)
5249 return OMPBuilder.getOrCreateRuntimeFunction(
5250 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5251 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5252}
5253
5254/// Returns an LLVM function to call for finalizing the dynamic loop using
5255/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5256/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5257static FunctionCallee
5258getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5259 unsigned Bitwidth = Ty->getIntegerBitWidth();
5260 if (Bitwidth == 32)
5261 return OMPBuilder.getOrCreateRuntimeFunction(
5262 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5263 if (Bitwidth == 64)
5264 return OMPBuilder.getOrCreateRuntimeFunction(
5265 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5266 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5267}
5268
5269OpenMPIRBuilder::InsertPointOrErrorTy
5270OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5271 InsertPointTy AllocaIP,
5272 OMPScheduleType SchedType,
5273 bool NeedsBarrier, Value *Chunk) {
5274 assert(CLI->isValid() && "Requires a valid canonical loop");
5275 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5276 "Require dedicated allocate IP");
5278 "Require valid schedule type");
5279
5280 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5281 OMPScheduleType::ModifierOrdered;
5282
5283 // Set up the source location value for OpenMP runtime.
5284 Builder.SetCurrentDebugLocation(DL);
5285
5286 uint32_t SrcLocStrSize;
5287 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5288 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5289
5290 // Declare useful OpenMP runtime functions.
5291 Value *IV = CLI->getIndVar();
5292 Type *IVTy = IV->getType();
5293 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5294 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5295
5296 // Allocate space for computed loop bounds as expected by the "init" function.
5297 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5298 Type *I32Type = Type::getInt32Ty(M.getContext());
5299 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5300 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5301 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5302 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5303 CLI->setLastIter(PLastIter);
5304
5305 // At the end of the preheader, prepare for calling the "init" function by
5306 // storing the current loop bounds into the allocated space. A canonical loop
5307 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5308 // and produces an inclusive upper bound.
5309 BasicBlock *PreHeader = CLI->getPreheader();
5310 Builder.SetInsertPoint(PreHeader->getTerminator());
5311 Constant *One = ConstantInt::get(IVTy, 1);
5312 Builder.CreateStore(One, PLowerBound);
5313 Value *UpperBound = CLI->getTripCount();
5314 Builder.CreateStore(UpperBound, PUpperBound);
5315 Builder.CreateStore(One, PStride);
5316
5317 BasicBlock *Header = CLI->getHeader();
5318 BasicBlock *Exit = CLI->getExit();
5319 BasicBlock *Cond = CLI->getCond();
5320 BasicBlock *Latch = CLI->getLatch();
5321 InsertPointTy AfterIP = CLI->getAfterIP();
5322
5323 // The CLI will be "broken" in the code below, as the loop is no longer
5324 // a valid canonical loop.
5325
5326 if (!Chunk)
5327 Chunk = One;
5328
5329 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5330
5331 Constant *SchedulingType =
5332 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5333
5334 // Call the "init" function.
5335 Builder.CreateCall(DynamicInit,
5336 {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
5337 UpperBound, /* step */ One, Chunk});
5338
5339 // An outer loop around the existing one.
5340 BasicBlock *OuterCond = BasicBlock::Create(
5341 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5342 PreHeader->getParent());
5343 // This needs to be 32-bit always, so can't use the IVTy Zero above.
5344 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5345 Value *Res =
5346 Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
5347 PLowerBound, PUpperBound, PStride});
5348 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5349 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5350 Value *LowerBound =
5351 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5352 Builder.CreateCondBr(MoreWork, Header, Exit);
5353
5354 // Change PHI-node in loop header to use outer cond rather than preheader,
5355 // and set IV to the LowerBound.
5356 Instruction *Phi = &Header->front();
5357 auto *PI = cast<PHINode>(Phi);
5358 PI->setIncomingBlock(0, OuterCond);
5359 PI->setIncomingValue(0, LowerBound);
5360
5361 // Then set the pre-header to jump to the OuterCond
5362 Instruction *Term = PreHeader->getTerminator();
5363 auto *Br = cast<BranchInst>(Term);
5364 Br->setSuccessor(0, OuterCond);
5365
5366 // Modify the inner condition:
5367 // * Use the UpperBound returned from the DynamicNext call.
5368 // * jump to the loop outer loop when done with one of the inner loops.
5369 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5370 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5371 Instruction *Comp = &*Builder.GetInsertPoint();
5372 auto *CI = cast<CmpInst>(Comp);
5373 CI->setOperand(1, UpperBound);
5374 // Redirect the inner exit to branch to outer condition.
5375 Instruction *Branch = &Cond->back();
5376 auto *BI = cast<BranchInst>(Branch);
5377 assert(BI->getSuccessor(1) == Exit);
5378 BI->setSuccessor(1, OuterCond);
5379
5380 // Call the "fini" function if "ordered" is present in wsloop directive.
5381 if (Ordered) {
5382 Builder.SetInsertPoint(&Latch->back());
5383 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5384 Builder.CreateCall(DynamicFini, {SrcLoc, ThreadNum});
5385 }
5386
5387 // Add the barrier if requested.
5388 if (NeedsBarrier) {
5389 Builder.SetInsertPoint(&Exit->back());
5390 InsertPointOrErrorTy BarrierIP =
5391 createBarrier(LocationDescription(Builder.saveIP(), DL),
5392 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5393 /* CheckCancelFlag */ false);
5394 if (!BarrierIP)
5395 return BarrierIP.takeError();
5396 }
5397
5398 CLI->invalidate();
5399 return AfterIP;
5400}
5401
5402/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5403/// after this \p OldTarget will be orphaned.
5405 BasicBlock *NewTarget, DebugLoc DL) {
5406 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5407 redirectTo(Pred, NewTarget, DL);
5408}
5409
5410/// Determine which blocks in \p BBs are reachable from outside and remove the
5411/// ones that are not reachable from the function.
5414 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5415 for (Use &U : BB->uses()) {
5416 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5417 if (!UseInst)
5418 continue;
5419 if (BBsToErase.count(UseInst->getParent()))
5420 continue;
5421 return true;
5422 }
5423 return false;
5424 };
5425
5426 while (BBsToErase.remove_if(HasRemainingUses)) {
5427 // Try again if anything was removed.
5428 }
5429
5430 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5431 DeleteDeadBlocks(BBVec);
5432}
5433
/// Fuse the given loop nest into a single canonical loop whose trip count is
/// the (no-unsigned-wrap) product of the input trip counts. The original
/// induction variables are re-derived from the collapsed one via a div/mod
/// scheme (innermost loop uses the least significant "digits"), and the old
/// loop control blocks are deleted. All input CLIs are invalidated; the
/// returned CLI describes the collapsed loop.
CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop's trip count.
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount =
        Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop use the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  // Peel off each inner loop's indvar as (Leftover % TripCount), then shift
  // Leftover down by dividing out that trip count.
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following in direction of
  // the control flow, from the leading in-between code, the loop nest body, the
  // trailing in-between code, and rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
  // the ContinueBlock is set, continue with that block. If ContinuePred, use
  // its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // that the original loop. More sophisticated schemes could keep track of what
  // the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}
5561
5562std::vector<CanonicalLoopInfo *>
5563OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5564 ArrayRef<Value *> TileSizes) {
5565 assert(TileSizes.size() == Loops.size() &&
5566 "Must pass as many tile sizes as there are loops");
5567 int NumLoops = Loops.size();
5568 assert(NumLoops >= 1 && "At least one loop to tile required");
5569
5570 CanonicalLoopInfo *OutermostLoop = Loops.front();
5571 CanonicalLoopInfo *InnermostLoop = Loops.back();
5572 Function *F = OutermostLoop->getBody()->getParent();
5573 BasicBlock *InnerEnter = InnermostLoop->getBody();
5574 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5575
5576 // Loop control blocks that may become orphaned later.
5577 SmallVector<BasicBlock *, 12> OldControlBBs;
5578 OldControlBBs.reserve(6 * Loops.size());
5579 for (CanonicalLoopInfo *Loop : Loops)
5580 Loop->collectControlBlocks(OldControlBBs);
5581
5582 // Collect original trip counts and induction variable to be accessible by
5583 // index. Also, the structure of the original loops is not preserved during
5584 // the construction of the tiled loops, so do it before we scavenge the BBs of
5585 // any original CanonicalLoopInfo.
5586 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5587 for (CanonicalLoopInfo *L : Loops) {
5588 assert(L->isValid() && "All input loops must be valid canonical loops");
5589 OrigTripCounts.push_back(L->getTripCount());
5590 OrigIndVars.push_back(L->getIndVar());
5591 }
5592
5593 // Collect the code between loop headers. These may contain SSA definitions
5594 // that are used in the loop nest body. To be usable with in the innermost
5595 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5596 // these instructions may be executed more often than before the tiling.
5597 // TODO: It would be sufficient to only sink them into body of the
5598 // corresponding tile loop.
5600 for (int i = 0; i < NumLoops - 1; ++i) {
5601 CanonicalLoopInfo *Surrounding = Loops[i];
5602 CanonicalLoopInfo *Nested = Loops[i + 1];
5603
5604 BasicBlock *EnterBB = Surrounding->getBody();
5605 BasicBlock *ExitBB = Nested->getHeader();
5606 InbetweenCode.emplace_back(EnterBB, ExitBB);
5607 }
5608
5609 // Compute the trip counts of the floor loops.
5610 Builder.SetCurrentDebugLocation(DL);
5611 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5612 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5613 for (int i = 0; i < NumLoops; ++i) {
5614 Value *TileSize = TileSizes[i];
5615 Value *OrigTripCount = OrigTripCounts[i];
5616 Type *IVType = OrigTripCount->getType();
5617
5618 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5619 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5620
5621 // 0 if tripcount divides the tilesize, 1 otherwise.
5622 // 1 means we need an additional iteration for a partial tile.
5623 //
5624 // Unfortunately we cannot just use the roundup-formula
5625 // (tripcount + tilesize - 1)/tilesize
5626 // because the summation might overflow. We do not want introduce undefined
5627 // behavior when the untiled loop nest did not.
5628 Value *FloorTripOverflow =
5629 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5630
5631 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5632 Value *FloorTripCount =
5633 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5634 "omp_floor" + Twine(i) + ".tripcount", true);
5635
5636 // Remember some values for later use.
5637 FloorCompleteCount.push_back(FloorCompleteTripCount);
5638 FloorCount.push_back(FloorTripCount);
5639 FloorRems.push_back(FloorTripRem);
5640 }
5641
5642 // Generate the new loop nest, from the outermost to the innermost.
5643 std::vector<CanonicalLoopInfo *> Result;
5644 Result.reserve(NumLoops * 2);
5645
5646 // The basic block of the surrounding loop that enters the nest generated
5647 // loop.
5648 BasicBlock *Enter = OutermostLoop->getPreheader();
5649
5650 // The basic block of the surrounding loop where the inner code should
5651 // continue.
5652 BasicBlock *Continue = OutermostLoop->getAfter();
5653
5654 // Where the next loop basic block should be inserted.
5655 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
5656
5657 auto EmbeddNewLoop =
5658 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
5659 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
5660 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
5661 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
5662 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
5663 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
5664
5665 // Setup the position where the next embedded loop connects to this loop.
5666 Enter = EmbeddedLoop->getBody();
5667 Continue = EmbeddedLoop->getLatch();
5668 OutroInsertBefore = EmbeddedLoop->getLatch();
5669 return EmbeddedLoop;
5670 };
5671
5672 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
5673 const Twine &NameBase) {
5674 for (auto P : enumerate(TripCounts)) {
5675 CanonicalLoopInfo *EmbeddedLoop =
5676 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
5677 Result.push_back(EmbeddedLoop);
5678 }
5679 };
5680
5681 EmbeddNewLoops(FloorCount, "floor");
5682
5683 // Within the innermost floor loop, emit the code that computes the tile
5684 // sizes.
5685 Builder.SetInsertPoint(Enter->getTerminator());
5686 SmallVector<Value *, 4> TileCounts;
5687 for (int i = 0; i < NumLoops; ++i) {
5688 CanonicalLoopInfo *FloorLoop = Result[i];
5689 Value *TileSize = TileSizes[i];
5690
5691 Value *FloorIsEpilogue =
5692 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
5693 Value *TileTripCount =
5694 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
5695
5696 TileCounts.push_back(TileTripCount);
5697 }
5698
5699 // Create the tile loops.
5700 EmbeddNewLoops(TileCounts, "tile");
5701
5702 // Insert the inbetween code into the body.
5703 BasicBlock *BodyEnter = Enter;
5704 BasicBlock *BodyEntered = nullptr;
5705 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
5706 BasicBlock *EnterBB = P.first;
5707 BasicBlock *ExitBB = P.second;
5708
5709 if (BodyEnter)
5710 redirectTo(BodyEnter, EnterBB, DL);
5711 else
5712 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
5713
5714 BodyEnter = nullptr;
5715 BodyEntered = ExitBB;
5716 }
5717
5718 // Append the original loop nest body into the generated loop nest body.
5719 if (BodyEnter)
5720 redirectTo(BodyEnter, InnerEnter, DL);
5721 else
5722 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
5724
5725 // Replace the original induction variable with an induction variable computed
5726 // from the tile and floor induction variables.
5727 Builder.restoreIP(Result.back()->getBodyIP());
5728 for (int i = 0; i < NumLoops; ++i) {
5729 CanonicalLoopInfo *FloorLoop = Result[i];
5730 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
5731 Value *OrigIndVar = OrigIndVars[i];
5732 Value *Size = TileSizes[i];
5733
5734 Value *Scale =
5735 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
5736 Value *Shift =
5737 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
5738 OrigIndVar->replaceAllUsesWith(Shift);
5739 }
5740
5741 // Remove unused parts of the original loops.
5742 removeUnusedBlocksFromParent(OldControlBBs);
5743
5744 for (CanonicalLoopInfo *L : Loops)
5745 L->invalidate();
5746
5747#ifndef NDEBUG
5748 for (CanonicalLoopInfo *GenL : Result)
5749 GenL->assertOK();
5750#endif
5751 return Result;
5752}
5753
5754/// Attach metadata \p Properties to the basic block described by \p BB. If the
5755/// basic block already has metadata, the basic block properties are appended.
5757 ArrayRef<Metadata *> Properties) {
5758 // Nothing to do if no property to attach.
5759 if (Properties.empty())
5760 return;
5761
5762 LLVMContext &Ctx = BB->getContext();
5763 SmallVector<Metadata *> NewProperties;
5764 NewProperties.push_back(nullptr);
5765
5766 // If the basic block already has metadata, prepend it to the new metadata.
5767 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5768 if (Existing)
5769 append_range(NewProperties, drop_begin(Existing->operands(), 1));
5770
5771 append_range(NewProperties, Properties);
5772 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
5773 BasicBlockID->replaceOperandWith(0, BasicBlockID);
5774
5775 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
5776}
5777
5778/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
5779/// loop already has metadata, the loop properties are appended.
5780static void addLoopMetadata(CanonicalLoopInfo *Loop,
5781 ArrayRef<Metadata *> Properties) {
5782 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
5783
5784 // Attach metadata to the loop's latch
5785 BasicBlock *Latch = Loop->getLatch();
5786 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
5787 addBasicBlockMetadata(Latch, Properties);
5788}
5789
5790/// Attach llvm.access.group metadata to the memref instructions of \p Block
5791static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
5792 LoopInfo &LI) {
5793 for (Instruction &I : *Block) {
5794 if (I.mayReadOrWriteMemory()) {
5795 // TODO: This instruction may already have access group from
5796 // other pragmas e.g. #pragma clang loop vectorize. Append
5797 // so that the existing metadata is not overwritten.
5798 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
5799 }
5800 }
5801}
5802
5803void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
5804 LLVMContext &Ctx = Builder.getContext();
5806 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5807 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
5808}
5809
5810void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
5811 LLVMContext &Ctx = Builder.getContext();
5813 Loop, {
5814 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
5815 });
5816}
5817
5818void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
5819 Value *IfCond, ValueToValueMapTy &VMap,
5820 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
5821 const Twine &NamePrefix) {
5822 Function *F = CanonicalLoop->getFunction();
5823
5824 // We can't do
5825 // if (cond) {
5826 // simd_loop;
5827 // } else {
5828 // non_simd_loop;
5829 // }
5830 // because then the CanonicalLoopInfo would only point to one of the loops:
5831 // leading to other constructs operating on the same loop to malfunction.
5832 // Instead generate
5833 // while (...) {
5834 // if (cond) {
5835 // simd_body;
5836 // } else {
5837 // not_simd_body;
5838 // }
5839 // }
5840 // At least for simple loops, LLVM seems able to hoist the if out of the loop
5841 // body at -O3
5842
5843 // Define where if branch should be inserted
5844 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
5845
5846 // Create additional blocks for the if statement
5847 BasicBlock *Cond = SplitBeforeIt->getParent();
5848 llvm::LLVMContext &C = Cond->getContext();
5850 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
5852 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
5853
5854 // Create if condition branch.
5855 Builder.SetInsertPoint(SplitBeforeIt);
5856 Instruction *BrInstr =
5857 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
5858 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
5859 // Then block contains branch to omp loop body which needs to be vectorized
5860 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
5861 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
5862
5863 Builder.SetInsertPoint(ElseBlock);
5864
5865 // Clone loop for the else branch
5867
5868 SmallVector<BasicBlock *, 8> ExistingBlocks;
5869 ExistingBlocks.reserve(L->getNumBlocks() + 1);
5870 ExistingBlocks.push_back(ThenBlock);
5871 ExistingBlocks.append(L->block_begin(), L->block_end());
5872 // Cond is the block that has the if clause condition
5873 // LoopCond is omp_loop.cond
5874 // LoopHeader is omp_loop.header
5875 BasicBlock *LoopCond = Cond->getUniquePredecessor();
5876 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
5877 assert(LoopCond && LoopHeader && "Invalid loop structure");
5878 for (BasicBlock *Block : ExistingBlocks) {
5879 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
5880 Block == LoopHeader || Block == LoopCond || Block == Cond) {
5881 continue;
5882 }
5883 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
5884
5885 // fix name not to be omp.if.then
5886 if (Block == ThenBlock)
5887 NewBB->setName(NamePrefix + ".if.else");
5888
5889 NewBB->moveBefore(CanonicalLoop->getExit());
5890 VMap[Block] = NewBB;
5891 NewBlocks.push_back(NewBB);
5892 }
5893 remapInstructionsInBlocks(NewBlocks, VMap);
5894 Builder.CreateBr(NewBlocks.front());
5895
5896 // The loop latch must have only one predecessor. Currently it is branched to
5897 // from both the 'then' and 'else' branches.
5898 L->getLoopLatch()->splitBasicBlock(
5899 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
5900
5901 // Ensure that the then block is added to the loop so we add the attributes in
5902 // the next step
5903 L->addBasicBlockToLoop(ThenBlock, LI);
5904}
5905
5906unsigned
5907OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
5908 const StringMap<bool> &Features) {
5909 if (TargetTriple.isX86()) {
5910 if (Features.lookup("avx512f"))
5911 return 512;
5912 else if (Features.lookup("avx"))
5913 return 256;
5914 return 128;
5915 }
5916 if (TargetTriple.isPPC())
5917 return 128;
5918 if (TargetTriple.isWasm())
5919 return 128;
5920 return 0;
5921}
5922
5923void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
5924 MapVector<Value *, Value *> AlignedVars,
5925 Value *IfCond, OrderKind Order,
5926 ConstantInt *Simdlen, ConstantInt *Safelen) {
5927 LLVMContext &Ctx = Builder.getContext();
5928
5929 Function *F = CanonicalLoop->getFunction();
5930
5931 // TODO: We should not rely on pass manager. Currently we use pass manager
5932 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
5933 // object. We should have a method which returns all blocks between
5934 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
5936 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5937 FAM.registerPass([]() { return LoopAnalysis(); });
5938 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5939
5940 LoopAnalysis LIA;
5941 LoopInfo &&LI = LIA.run(*F, FAM);
5942
5943 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
5944 if (AlignedVars.size()) {
5945 InsertPointTy IP = Builder.saveIP();
5946 for (auto &AlignedItem : AlignedVars) {
5947 Value *AlignedPtr = AlignedItem.first;
5948 Value *Alignment = AlignedItem.second;
5949 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
5950 Builder.SetInsertPoint(loadInst->getNextNode());
5951 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
5952 Alignment);
5953 }
5954 Builder.restoreIP(IP);
5955 }
5956
5957 if (IfCond) {
5958 ValueToValueMapTy VMap;
5959 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
5960 }
5961
5963
5964 // Get the basic blocks from the loop in which memref instructions
5965 // can be found.
5966 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5967 // preferably without running any passes.
5968 for (BasicBlock *Block : L->getBlocks()) {
5969 if (Block == CanonicalLoop->getCond() ||
5970 Block == CanonicalLoop->getHeader())
5971 continue;
5972 Reachable.insert(Block);
5973 }
5974
5975 SmallVector<Metadata *> LoopMDList;
5976
5977 // In presence of finite 'safelen', it may be unsafe to mark all
5978 // the memory instructions parallel, because loop-carried
5979 // dependences of 'safelen' iterations are possible.
5980 // If clause order(concurrent) is specified then the memory instructions
5981 // are marked parallel even if 'safelen' is finite.
5982 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent)) {
5983 // Add access group metadata to memory-access instructions.
5984 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5985 for (BasicBlock *BB : Reachable)
5986 addSimdMetadata(BB, AccessGroup, LI);
5987 // TODO: If the loop has existing parallel access metadata, have
5988 // to combine two lists.
5989 LoopMDList.push_back(MDNode::get(
5990 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5991 }
5992
5993 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
5994 // versions so we can't add the loop attributes in that case.
5995 if (IfCond) {
5996 // we can still add llvm.loop.parallel_access
5997 addLoopMetadata(CanonicalLoop, LoopMDList);
5998 return;
5999 }
6000
6001 // Use the above access group metadata to create loop level
6002 // metadata, which should be distinct for each loop.
6003 ConstantAsMetadata *BoolConst =
6005 LoopMDList.push_back(MDNode::get(
6006 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6007
6008 if (Simdlen || Safelen) {
6009 // If both simdlen and safelen clauses are specified, the value of the
6010 // simdlen parameter must be less than or equal to the value of the safelen
6011 // parameter. Therefore, use safelen only in the absence of simdlen.
6012 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6013 LoopMDList.push_back(
6014 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6015 ConstantAsMetadata::get(VectorizeWidth)}));
6016 }
6017
6018 addLoopMetadata(CanonicalLoop, LoopMDList);
6019}
6020
6021/// Create the TargetMachine object to query the backend for optimization
6022/// preferences.
6023///
6024/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6025/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6026/// needed for the LLVM pass pipline. We use some default options to avoid
6027/// having to pass too many settings from the frontend that probably do not
6028/// matter.
6029///
6030/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6031/// method. If we are going to use TargetMachine for more purposes, especially
6032/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6033/// might become be worth requiring front-ends to pass on their TargetMachine,
6034/// or at least cache it between methods. Note that while fontends such as Clang
6035/// have just a single main TargetMachine per translation unit, "target-cpu" and
6036/// "target-features" that determine the TargetMachine are per-function and can
6037/// be overrided using __attribute__((target("OPTIONS"))).
6038static std::unique_ptr<TargetMachine>
6040 Module *M = F->getParent();
6041
6042 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6043 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6044 const llvm::Triple &Triple = M->getTargetTriple();
6045
6046 std::string Error;
6048 if (!TheTarget)
6049 return {};
6050
6052 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6053 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6054 /*CodeModel=*/std::nullopt, OptLevel));
6055}
6056
6057/// Heuristically determine the best-performant unroll factor for \p CLI. This
6058/// depends on the target processor. We are re-using the same heuristics as the
6059/// LoopUnrollPass.
6060static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6061 Function *F = CLI->getFunction();
6062
6063 // Assume the user requests the most aggressive unrolling, even if the rest of
6064 // the code is optimized using a lower setting.
6066 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6067
6069 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6070 FAM.registerPass([]() { return AssumptionAnalysis(); });
6071 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6072 FAM.registerPass([]() { return LoopAnalysis(); });
6073 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6074 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6075 TargetIRAnalysis TIRA;
6076 if (TM)
6077 TIRA = TargetIRAnalysis(
6078 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6079 FAM.registerPass([&]() { return TIRA; });
6080
6081 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6083 ScalarEvolution &&SE = SEA.run(*F, FAM);
6085 DominatorTree &&DT = DTA.run(*F, FAM);
6086 LoopAnalysis LIA;
6087 LoopInfo &&LI = LIA.run(*F, FAM);
6089 AssumptionCache &&AC = ACT.run(*F, FAM);
6091
6092 Loop *L = LI.getLoopFor(CLI->getHeader());
6093 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6094
6096 L, SE, TTI,
6097 /*BlockFrequencyInfo=*/nullptr,
6098 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6099 /*UserThreshold=*/std::nullopt,
6100 /*UserCount=*/std::nullopt,
6101 /*UserAllowPartial=*/true,
6102 /*UserAllowRuntime=*/true,
6103 /*UserUpperBound=*/std::nullopt,
6104 /*UserFullUnrollMaxCount=*/std::nullopt);
6105
6106 UP.Force = true;
6107
6108 // Account for additional optimizations taking place before the LoopUnrollPass
6109 // would unroll the loop.
6112
6113 // Use normal unroll factors even if the rest of the code is optimized for
6114 // size.
6117
6118 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6119 << " Threshold=" << UP.Threshold << "\n"
6120 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6121 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6122 << " PartialOptSizeThreshold="
6123 << UP.PartialOptSizeThreshold << "\n");
6124
6125 // Disable peeling.
6128 /*UserAllowPeeling=*/false,
6129 /*UserAllowProfileBasedPeeling=*/false,
6130 /*UnrollingSpecficValues=*/false);
6131
6133 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6134
6135 // Assume that reads and writes to stack variables can be eliminated by
6136 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6137 // size.
6138 for (BasicBlock *BB : L->blocks()) {
6139 for (Instruction &I : *BB) {
6140 Value *Ptr;
6141 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6142 Ptr = Load->getPointerOperand();
6143 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6144 Ptr = Store->getPointerOperand();
6145 } else
6146 continue;
6147
6148 Ptr = Ptr->stripPointerCasts();
6149
6150 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6151 if (Alloca->getParent() == &F->getEntryBlock())
6152 EphValues.insert(&I);
6153 }
6154 }
6155 }
6156
6157 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6158
6159 // Loop is not unrollable if the loop contains certain instructions.
6160 if (!UCE.canUnroll()) {
6161 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6162 return 1;
6163 }
6164
6165 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6166 << "\n");
6167
6168 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6169 // be able to use it.
6170 int TripCount = 0;
6171 int MaxTripCount = 0;
6172 bool MaxOrZero = false;
6173 unsigned TripMultiple = 0;
6174
6175 bool UseUpperBound = false;
6176 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6177 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6178 UseUpperBound);
6179 unsigned Factor = UP.Count;
6180 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6181
6182 // This function returns 1 to signal to not unroll a loop.
6183 if (Factor == 0)
6184 return 1;
6185 return Factor;
6186}
6187
6188void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6189 int32_t Factor,
6190 CanonicalLoopInfo **UnrolledCLI) {
6191 assert(Factor >= 0 && "Unroll factor must not be negative");
6192
6193 Function *F = Loop->getFunction();
6194 LLVMContext &Ctx = F->getContext();
6195
6196 // If the unrolled loop is not used for another loop-associated directive, it
6197 // is sufficient to add metadata for the LoopUnrollPass.
6198 if (!UnrolledCLI) {
6199 SmallVector<Metadata *, 2> LoopMetadata;
6200 LoopMetadata.push_back(
6201 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6202
6203 if (Factor >= 1) {
6205 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6206 LoopMetadata.push_back(MDNode::get(
6207 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6208 }
6209
6210 addLoopMetadata(Loop, LoopMetadata);
6211 return;
6212 }
6213
6214 // Heuristically determine the unroll factor.
6215 if (Factor == 0)
6217
6218 // No change required with unroll factor 1.
6219 if (Factor == 1) {
6220 *UnrolledCLI = Loop;
6221 return;
6222 }
6223
6224 assert(Factor >= 2 &&
6225 "unrolling only makes sense with a factor of 2 or larger");
6226
6227 Type *IndVarTy = Loop->getIndVarType();
6228
6229 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6230 // unroll the inner loop.
6231 Value *FactorVal =
6232 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6233 /*isSigned=*/false));
6234 std::vector<CanonicalLoopInfo *> LoopNest =
6235 tileLoops(DL, {Loop}, {FactorVal});
6236 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6237 *UnrolledCLI = LoopNest[0];
6238 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6239
6240 // LoopUnrollPass can only fully unroll loops with constant trip count.
6241 // Unroll by the unroll factor with a fallback epilog for the remainder
6242 // iterations if necessary.
6244 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6246 InnerLoop,
6247 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6249 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6250
6251#ifndef NDEBUG
6252 (*UnrolledCLI)->assertOK();
6253#endif
6254}
6255
6256OpenMPIRBuilder::InsertPointTy
6257OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6258 llvm::Value *BufSize, llvm::Value *CpyBuf,
6259 llvm::Value *CpyFn, llvm::Value *DidIt) {
6260 if (!updateToLocation(Loc))
6261 return Loc.IP;
6262
6263 uint32_t SrcLocStrSize;
6264 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6265 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6266 Value *ThreadId = getOrCreateThreadID(Ident);
6267
6268 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6269
6270 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6271
6272 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6273 Builder.CreateCall(Fn, Args);
6274
6275 return Builder.saveIP();
6276}
6277
6278OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6279 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6280 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6282
6283 if (!updateToLocation(Loc))
6284 return Loc.IP;
6285
6286 // If needed allocate and initialize `DidIt` with 0.
6287 // DidIt: flag variable: 1=single thread; 0=not single thread.
6288 llvm::Value *DidIt = nullptr;
6289 if (!CPVars.empty()) {
6290 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6291 Builder.CreateStore(Builder.getInt32(0), DidIt);
6292 }
6293
6294 Directive OMPD = Directive::OMPD_single;
6295 uint32_t SrcLocStrSize;
6296 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6297 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6298 Value *ThreadId = getOrCreateThreadID(Ident);
6299 Value *Args[] = {Ident, ThreadId};
6300
6301 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6302 Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6303
6304 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6305 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6306
6307 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6308 if (Error Err = FiniCB(IP))
6309 return Err;
6310
6311 // The thread that executes the single region must set `DidIt` to 1.
6312 // This is used by __kmpc_copyprivate, to know if the caller is the
6313 // single thread or not.
6314 if (DidIt)
6315 Builder.CreateStore(Builder.getInt32(1), DidIt);
6316
6317 return Error::success();
6318 };
6319
6320 // generates the following:
6321 // if (__kmpc_single()) {
6322 // .... single region ...
6323 // __kmpc_end_single
6324 // }
6325 // __kmpc_copyprivate
6326 // __kmpc_barrier
6327
6328 InsertPointOrErrorTy AfterIP =
6329 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6330 /*Conditional*/ true,
6331 /*hasFinalize*/ true);
6332 if (!AfterIP)
6333 return AfterIP.takeError();
6334
6335 if (DidIt) {
6336 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6337 // NOTE BufSize is currently unused, so just pass 0.
6338 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6339 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6340 CPFuncs[I], DidIt);
6341 // NOTE __kmpc_copyprivate already inserts a barrier
6342 } else if (!IsNowait) {
6343 InsertPointOrErrorTy AfterIP =
6344 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6345 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6346 /* CheckCancelFlag */ false);
6347 if (!AfterIP)
6348 return AfterIP.takeError();
6349 }
6350 return Builder.saveIP();
6351}
6352
6353OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6354 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6355 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6356
6357 if (!updateToLocation(Loc))
6358 return Loc.IP;
6359
6360 Directive OMPD = Directive::OMPD_critical;
6361 uint32_t SrcLocStrSize;
6362 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6363 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6364 Value *ThreadId = getOrCreateThreadID(Ident);
6365 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6366 Value *Args[] = {Ident, ThreadId, LockVar};
6367
6368 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6369 Function *RTFn = nullptr;
6370 if (HintInst) {
6371 // Add Hint to entry Args and create call
6372 EnterArgs.push_back(HintInst);
6373 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6374 } else {
6375 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6376 }
6377 Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
6378
6379 Function *ExitRTLFn =
6380 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6381 Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6382
6383 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6384 /*Conditional*/ false, /*hasFinalize*/ true);
6385}
6386
OpenMPIRBuilder::InsertPointTy
OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
                                     InsertPointTy AllocaIP, unsigned NumLoops,
                                     ArrayRef<llvm::Value *> StoreValues,
                                     const Twine &Name, bool IsDependSource) {
  // The doacross runtime interface requires the dependence vector elements to
  // be 64-bit integers.
  assert(
      llvm::all_of(StoreValues,
                   [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
      "OpenMP runtime requires depend vec with i64 type");

  if (!updateToLocation(Loc))
    return Loc.IP;

  // Allocate space for vector and generate alloc instruction.
  auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
  Builder.restoreIP(AllocaIP);
  AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
  ArgsBase->setAlignment(Align(8));
  // Return to the original code-generation position for the stores and the
  // runtime call; only the alloca goes to AllocaIP.
  updateToLocation(Loc);

  // Store the index value with offset in depend vector.
  for (unsigned I = 0; I < NumLoops; ++I) {
    Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
        ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
    StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
    STInst->setAlignment(Align(8));
  }

  // Address of the first element of the dependence vector, passed to the
  // runtime.
  Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
      ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};

  // depend(source) emits __kmpc_doacross_post; depend(sink) emits
  // __kmpc_doacross_wait.
  Function *RTLFn = nullptr;
  if (IsDependSource)
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
  else
    RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
  Builder.CreateCall(RTLFn, Args);

  return Builder.saveIP();
}
6433
6434OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6435 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6436 FinalizeCallbackTy FiniCB, bool IsThreads) {
6437 if (!updateToLocation(Loc))
6438 return Loc.IP;
6439
6440 Directive OMPD = Directive::OMPD_ordered;
6441 Instruction *EntryCall = nullptr;
6442 Instruction *ExitCall = nullptr;
6443
6444 if (IsThreads) {
6445 uint32_t SrcLocStrSize;
6446 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6447 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6448 Value *ThreadId = getOrCreateThreadID(Ident);
6449 Value *Args[] = {Ident, ThreadId};
6450
6451 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6452 EntryCall = Builder.CreateCall(EntryRTLFn, Args);
6453
6454 Function *ExitRTLFn =
6455 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6456 ExitCall = Builder.CreateCall(ExitRTLFn, Args);
6457 }
6458
6459 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6460 /*Conditional*/ false, /*hasFinalize*/ true);
6461}
6462
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
    Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
    BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
    bool HasFinalize, bool IsCancellable) {

  // Register the finalization callback so cancellation points inside the
  // region can emit the directive's cleanup; popped in
  // emitCommonDirectiveExit.
  if (HasFinalize)
    FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});

  // Create inlined region's entry and body blocks, in preparation
  // for conditional creation
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Instruction *SplitPos = EntryBB->getTerminator();
  // If the entry block lacks a branch terminator, insert a placeholder
  // unreachable so splitBasicBlock has a split point; it is erased below.
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
  BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
  BasicBlock *FiniBB =
      EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");

  Builder.SetInsertPoint(EntryBB->getTerminator());
  emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);

  // generate body
  if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
                            /* CodeGenIP */ Builder.saveIP()))
    return Err;

  // emit exit call and do any needed finalization.
  auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
  assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
         FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
         "Unexpected control flow graph state!!");
  InsertPointOrErrorTy AfterIP =
      emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
  if (!AfterIP)
    return AfterIP.takeError();
  assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
         "Unexpected Control Flow State!");

  // If we are skipping the region of a non conditional, remove the exit
  // block, and clear the builder's insertion point.
  assert(SplitPos->getParent() == ExitBB &&
         "Unexpected Insertion point location!");
  auto merged = MergeBlockIntoPredecessor(ExitBB);
  BasicBlock *ExitPredBB = SplitPos->getParent();
  auto InsertBB = merged ? ExitPredBB : ExitBB;
  // Drop the placeholder terminator created above, if we made one.
  if (!isa_and_nonnull<BranchInst>(SplitPos))
    SplitPos->eraseFromParent();
  Builder.SetInsertPoint(InsertBB);

  return Builder.saveIP();
}
6515
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
    Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
  // if nothing to do, Return current insertion point.
  if (!Conditional || !EntryCall)
    return Builder.saveIP();

  // Guard the region body on the entry call's result being non-zero.
  BasicBlock *EntryBB = Builder.GetInsertBlock();
  Value *CallBool = Builder.CreateIsNotNull(EntryCall);
  auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
  // Placeholder terminator for ThenBB; replaced by the entry block's original
  // terminator below and then erased.
  auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);

  // Emit thenBB and set the Builder's insertion point there for
  // body generation next. Place the block after the current block.
  Function *CurFn = EntryBB->getParent();
  CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);

  // Move Entry branch to end of ThenBB, and replace with conditional
  // branch (If-stmt)
  Instruction *EntryBBTI = EntryBB->getTerminator();
  Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
  EntryBBTI->removeFromParent();
  Builder.SetInsertPoint(UI);
  Builder.Insert(EntryBBTI);
  UI->eraseFromParent();
  Builder.SetInsertPoint(ThenBB->getTerminator());

  // return an insertion point to ExitBB.
  return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
}
6545
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
    omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
    bool HasFinalize) {

  Builder.restoreIP(FinIP);

  // If there is finalization to do, emit it before the exit call
  if (HasFinalize) {
    assert(!FinalizationStack.empty() &&
           "Unexpected finalization stack state!");

    // Pop the entry pushed by EmitOMPInlinedRegion and run its callback at
    // the finalization insertion point.
    FinalizationInfo Fi = FinalizationStack.pop_back_val();
    assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");

    if (Error Err = Fi.FiniCB(FinIP))
      return Err;

    BasicBlock *FiniBB = FinIP.getBlock();
    Instruction *FiniBBTI = FiniBB->getTerminator();

    // set Builder IP for call creation
    Builder.SetInsertPoint(FiniBBTI);
  }

  // Without an exit call there is nothing left to place.
  if (!ExitCall)
    return Builder.saveIP();

  // place the Exitcall as last instruction before Finalization block terminator
  ExitCall->removeFromParent();
  Builder.Insert(ExitCall);

  return IRBuilder<>::InsertPoint(ExitCall->getParent(),
                                  ExitCall->getIterator());
}
6580
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
    InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
    llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
  if (!IP.isSet())
    return IP;

  IRBuilder<>::InsertPointGuard IPG(Builder);

  // creates the following CFG structure
  //   OMP_Entry : (MasterAddr != PrivateAddr)?
  //       F     T
  //       |      \
  //       |     copyin.not.master
  //       |      /
  //       v     /
  //   copyin.not.master.end
  //       |
  //       v
  //   OMP.Entry.Next

  BasicBlock *OMP_Entry = IP.getBlock();
  Function *CurFn = OMP_Entry->getParent();
  BasicBlock *CopyBegin =
      BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
  BasicBlock *CopyEnd = nullptr;

  // If entry block is terminated, split to preserve the branch to following
  // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
  if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
    CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
                                         "copyin.not.master.end");
    OMP_Entry->getTerminator()->eraseFromParent();
  } else {
    CopyEnd =
        BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
  }

  // Compare master and private addresses; only a distinct private copy needs
  // the copyin code placed in CopyBegin.
  Builder.SetInsertPoint(OMP_Entry);
  Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
  Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
  Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
  Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);

  Builder.SetInsertPoint(CopyBegin);
  // With BranchtoEnd the returned insertion point sits before the branch to
  // CopyEnd, so the caller's copy code lands inside copyin.not.master.
  if (BranchtoEnd)
    Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));

  return Builder.saveIP();
}
6630
6631CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6633 std::string Name) {
6634 IRBuilder<>::InsertPointGuard IPG(Builder);
6635 updateToLocation(Loc);
6636
6637 uint32_t SrcLocStrSize;
6638 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6639 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6640 Value *ThreadId = getOrCreateThreadID(Ident);
6641 Value *Args[] = {ThreadId, Size, Allocator};
6642
6643 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6644
6645 return Builder.CreateCall(Fn, Args, Name);
6646}
6647
6648CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6649 Value *Addr, Value *Allocator,
6650 std::string Name) {
6651 IRBuilder<>::InsertPointGuard IPG(Builder);
6652 updateToLocation(Loc);
6653
6654 uint32_t SrcLocStrSize;
6655 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6656 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6657 Value *ThreadId = getOrCreateThreadID(Ident);
6658 Value *Args[] = {ThreadId, Addr, Allocator};
6659 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6660 return Builder.CreateCall(Fn, Args, Name);
6661}
6662
6663CallInst *OpenMPIRBuilder::createOMPInteropInit(
6664 const LocationDescription &Loc, Value *InteropVar,
6665 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6666 Value *DependenceAddress, bool HaveNowaitClause) {
6667 IRBuilder<>::InsertPointGuard IPG(Builder);
6668 updateToLocation(Loc);
6669
6670 uint32_t SrcLocStrSize;
6671 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6672 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6673 Value *ThreadId = getOrCreateThreadID(Ident);
6674 if (Device == nullptr)
6676 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6677 if (NumDependences == nullptr) {
6678 NumDependences = ConstantInt::get(Int32, 0);
6679 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6680 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6681 }
6682 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6683 Value *Args[] = {
6684 Ident, ThreadId, InteropVar, InteropTypeVal,
6685 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6686
6687 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6688
6689 return Builder.CreateCall(Fn, Args);
6690}
6691
6692CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6693 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6694 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6695 IRBuilder<>::InsertPointGuard IPG(Builder);
6696 updateToLocation(Loc);
6697
6698 uint32_t SrcLocStrSize;
6699 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6700 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6701 Value *ThreadId = getOrCreateThreadID(Ident);
6702 if (Device == nullptr)
6704 if (NumDependences == nullptr) {
6705 NumDependences = ConstantInt::get(Int32, 0);
6706 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6707 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6708 }
6709 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6710 Value *Args[] = {
6711 Ident, ThreadId, InteropVar, Device,
6712 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6713
6714 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6715
6716 return Builder.CreateCall(Fn, Args);
6717}
6718
6719CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6720 Value *InteropVar, Value *Device,
6721 Value *NumDependences,
6722 Value *DependenceAddress,
6723 bool HaveNowaitClause) {
6724 IRBuilder<>::InsertPointGuard IPG(Builder);
6725 updateToLocation(Loc);
6726 uint32_t SrcLocStrSize;
6727 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6728 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6729 Value *ThreadId = getOrCreateThreadID(Ident);
6730 if (Device == nullptr)
6732 if (NumDependences == nullptr) {
6733 NumDependences = ConstantInt::get(Int32, 0);
6734 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6735 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6736 }
6737 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6738 Value *Args[] = {
6739 Ident, ThreadId, InteropVar, Device,
6740 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6741
6742 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6743
6744 return Builder.CreateCall(Fn, Args);
6745}
6746
6747CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6748 const LocationDescription &Loc, llvm::Value *Pointer,
6749 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6750 IRBuilder<>::InsertPointGuard IPG(Builder);
6751 updateToLocation(Loc);
6752
6753 uint32_t SrcLocStrSize;
6754 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6755 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6756 Value *ThreadId = getOrCreateThreadID(Ident);
6757 Constant *ThreadPrivateCache =
6758 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6759 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6760
6761 Function *Fn =
6762 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6763
6764 return Builder.CreateCall(Fn, Args);
6765}
6766
6767OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6768 const LocationDescription &Loc,
6769 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6770 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6771 "expected num_threads and num_teams to be specified");
6772
6773 if (!updateToLocation(Loc))
6774 return Loc.IP;
6775
6776 uint32_t SrcLocStrSize;
6777 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6778 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6779 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
6780 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
6781 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
6782 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
6783 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
6784
6785 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
6786 Function *Kernel = DebugKernelWrapper;
6787
6788 // We need to strip the debug prefix to get the correct kernel name.
6789 StringRef KernelName = Kernel->getName();
6790 const std::string DebugPrefix = "_debug__";
6791 if (KernelName.ends_with(DebugPrefix)) {
6792 KernelName = KernelName.drop_back(DebugPrefix.length());
6793 Kernel = M.getFunction(KernelName);
6794 assert(Kernel && "Expected the real kernel to exist");
6795 }
6796
6797 // Manifest the launch configuration in the metadata matching the kernel
6798 // environment.
6799 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
6800 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
6801
6802 // If MaxThreads not set, select the maximum between the default workgroup
6803 // size and the MinThreads value.
6804 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
6805 if (MaxThreadsVal < 0)
6806 MaxThreadsVal = std::max(
6807 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
6808
6809 if (MaxThreadsVal > 0)
6810 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
6811
6812 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
6814 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
6815 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
6816 Constant *ReductionDataSize =
6817 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
6818 Constant *ReductionBufferLength =
6819 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
6820
6821 Function *Fn = getOrCreateRuntimeFunctionPtr(
6822 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
6823 const DataLayout &DL = Fn->getDataLayout();
6824
6825 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
6826 Constant *DynamicEnvironmentInitializer =
6827 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
6828 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
6829 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
6830 DynamicEnvironmentInitializer, DynamicEnvironmentName,
6831 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6832 DL.getDefaultGlobalsAddressSpace());
6833 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6834
6835 Constant *DynamicEnvironment =
6836 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
6837 ? DynamicEnvironmentGV
6838 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
6839 DynamicEnvironmentPtr);
6840
6841 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
6842 ConfigurationEnvironment, {
6843 UseGenericStateMachineVal,
6844 MayUseNestedParallelismVal,
6845 IsSPMDVal,
6846 MinThreads,
6847 MaxThreads,
6848 MinTeams,
6849 MaxTeams,
6850 ReductionDataSize,
6851 ReductionBufferLength,
6852 });
6853 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
6854 KernelEnvironment, {
6855 ConfigurationEnvironmentInitializer,
6856 Ident,
6857 DynamicEnvironment,
6858 });
6859 std::string KernelEnvironmentName =
6860 (KernelName + "_kernel_environment").str();
6861 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
6862 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
6863 KernelEnvironmentInitializer, KernelEnvironmentName,
6864 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
6865 DL.getDefaultGlobalsAddressSpace());
6866 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
6867
6868 Constant *KernelEnvironment =
6869 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
6870 ? KernelEnvironmentGV
6871 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
6872 KernelEnvironmentPtr);
6873 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
6874 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
6875 KernelLaunchEnvironment =
6876 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
6877 ? KernelLaunchEnvironment
6878 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
6879 KernelLaunchEnvParamTy);
6880 CallInst *ThreadKind =
6881 Builder.CreateCall(Fn, {KernelEnvironment, KernelLaunchEnvironment});
6882
6883 Value *ExecUserCode = Builder.CreateICmpEQ(
6884 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
6885 "exec_user_code");
6886
6887 // ThreadKind = __kmpc_target_init(...)
6888 // if (ThreadKind == -1)
6889 // user_code
6890 // else
6891 // return;
6892
6893 auto *UI = Builder.CreateUnreachable();
6894 BasicBlock *CheckBB = UI->getParent();
6895 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
6896
6897 BasicBlock *WorkerExitBB = BasicBlock::Create(
6898 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
6899 Builder.SetInsertPoint(WorkerExitBB);
6900 Builder.CreateRetVoid();
6901
6902 auto *CheckBBTI = CheckBB->getTerminator();
6903 Builder.SetInsertPoint(CheckBBTI);
6904 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
6905
6906 CheckBBTI->eraseFromParent();
6907 UI->eraseFromParent();
6908
6909 // Continue in the "user_code" block, see diagram above and in
6910 // openmp/libomptarget/deviceRTLs/common/include/target.h .
6911 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
6912}
6913
void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
                                         int32_t TeamsReductionDataSize,
                                         int32_t TeamsReductionBufferLength) {
  if (!updateToLocation(Loc))
    return;

  Function *Fn = getOrCreateRuntimeFunctionPtr(
      omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);

  Builder.CreateCall(Fn, {});

  // Without teams-reduction data there is nothing to patch into the kernel
  // environment.
  if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
    return;

  Function *Kernel = Builder.GetInsertBlock()->getParent();
  // We need to strip the debug prefix to get the correct kernel name.
  StringRef KernelName = Kernel->getName();
  const std::string DebugPrefix = "_debug__";
  if (KernelName.ends_with(DebugPrefix))
    KernelName = KernelName.drop_back(DebugPrefix.length());
  // Look up the kernel environment created by createTargetInit.
  auto *KernelEnvironmentGV =
      M.getNamedGlobal((KernelName + "_kernel_environment").str());
  assert(KernelEnvironmentGV && "Expected kernel environment global\n");
  auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
  // Indices {0, 7} / {0, 8} address the ReductionDataSize and
  // ReductionBufferLength fields of the configuration environment (field 0 of
  // the kernel environment) — see the initializer layout in createTargetInit.
  auto *NewInitializer = ConstantFoldInsertValueInstruction(
      KernelEnvironmentInitializer,
      ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
  NewInitializer = ConstantFoldInsertValueInstruction(
      NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
      {0, 8});
  KernelEnvironmentGV->setInitializer(NewInitializer);
}
6946
6947static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
6948 bool Min) {
6949 if (Kernel.hasFnAttribute(Name)) {
6950 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
6951 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
6952 }
6953 Kernel.addFnAttr(Name, llvm::utostr(Value));
6954}
6955
6956std::pair<int32_t, int32_t>
6957OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
6958 int32_t ThreadLimit =
6959 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
6960
6961 if (T.isAMDGPU()) {
6962 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
6963 if (!Attr.isValid() || !Attr.isStringAttribute())
6964 return {0, ThreadLimit};
6965 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
6966 int32_t LB, UB;
6967 if (!llvm::to_integer(UBStr, UB, 10))
6968 return {0, ThreadLimit};
6969 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
6970 if (!llvm::to_integer(LBStr, LB, 10))
6971 return {0, UB};
6972 return {LB, UB};
6973 }
6974
6975 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
6976 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
6977 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
6978 }
6979 return {0, ThreadLimit};
6980}
6981
6982void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
6983 Function &Kernel, int32_t LB,
6984 int32_t UB) {
6985 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
6986
6987 if (T.isAMDGPU()) {
6988 Kernel.addFnAttr("amdgpu-flat-work-group-size",
6989 llvm::utostr(LB) + "," + llvm::utostr(UB));
6990 return;
6991 }
6992
6993 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
6994}
6995
6996std::pair<int32_t, int32_t>
6997OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
6998 // TODO: Read from backend annotations if available.
6999 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7000}
7001
7002void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7003 int32_t LB, int32_t UB) {
7004 if (T.isNVPTX())
7005 if (UB > 0)
7006 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7007 if (T.isAMDGPU())
7008 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7009
7010 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7011}
7012
7013void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7014 Function *OutlinedFn) {
7015 if (Config.isTargetDevice()) {
7017 // TODO: Determine if DSO local can be set to true.
7018 OutlinedFn->setDSOLocal(false);
7020 if (T.isAMDGCN())
7022 else if (T.isNVPTX())
7024 else if (T.isSPIRV())
7026 }
7027}
7028
7029Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7030 StringRef EntryFnIDName) {
7031 if (Config.isTargetDevice()) {
7032 assert(OutlinedFn && "The outlined function must exist if embedded");
7033 return OutlinedFn;
7034 }
7035
7036 return new GlobalVariable(
7037 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7038 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7039}
7040
7041Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7042 StringRef EntryFnName) {
7043 if (OutlinedFn)
7044 return OutlinedFn;
7045
7046 assert(!M.getGlobalVariable(EntryFnName, true) &&
7047 "Named kernel already exists?");
7048 return new GlobalVariable(
7049 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7050 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7051}
7052
7053Error OpenMPIRBuilder::emitTargetRegionFunction(
7054 TargetRegionEntryInfo &EntryInfo,
7055 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7056 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7057
7058 SmallString<64> EntryFnName;
7059 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7060
7061 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7062 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7063 if (!CBResult)
7064 return CBResult.takeError();
7065 OutlinedFn = *CBResult;
7066 } else {
7067 OutlinedFn = nullptr;
7068 }
7069
7070 // If this target outline function is not an offload entry, we don't need to
7071 // register it. This may be in the case of a false if clause, or if there are
7072 // no OpenMP targets.
7073 if (!IsOffloadEntry)
7074 return Error::success();
7075
7076 std::string EntryFnIDName =
7077 Config.isTargetDevice()
7078 ? std::string(EntryFnName)
7079 : createPlatformSpecificName({EntryFnName, "region_id"});
7080
7081 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7082 EntryFnName, EntryFnIDName);
7083 return Error::success();
7084}
7085
7086Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7087 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7088 StringRef EntryFnName, StringRef EntryFnIDName) {
7089 if (OutlinedFn)
7090 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7091 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7092 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7093 OffloadInfoManager.registerTargetRegionEntryInfo(
7094 EntryInfo, EntryAddr, OutlinedFnID,
7095 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7096 return OutlinedFnID;
7097}
7098
7099OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7100 const LocationDescription &Loc, InsertPointTy AllocaIP,
7101 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7102 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7103 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7104 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7105 BodyGenTy BodyGenType)>
7106 BodyGenCB,
7107 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7108 if (!updateToLocation(Loc))
7109 return InsertPointTy();
7110
7111 Builder.restoreIP(CodeGenIP);
7112 // Disable TargetData CodeGen on Device pass.
7113 if (Config.IsTargetDevice.value_or(false)) {
7114 if (BodyGenCB) {
7115 InsertPointOrErrorTy AfterIP =
7116 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7117 if (!AfterIP)
7118 return AfterIP.takeError();
7119 Builder.restoreIP(*AfterIP);
7120 }
7121 return Builder.saveIP();
7122 }
7123
7124 bool IsStandAlone = !BodyGenCB;
7125 MapInfosTy *MapInfo;
7126 // Generate the code for the opening of the data environment. Capture all the
7127 // arguments of the runtime call by reference because they are used in the
7128 // closing of the region.
7129 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7130 InsertPointTy CodeGenIP) -> Error {
7131 MapInfo = &GenMapInfoCB(Builder.saveIP());
7132 if (Error Err = emitOffloadingArrays(
7133 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7134 /*IsNonContiguous=*/true, DeviceAddrCB))
7135 return Err;
7136
7137 TargetDataRTArgs RTArgs;
7138 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7139
7140 // Emit the number of elements in the offloading arrays.
7141 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7142
7143 // Source location for the ident struct
7144 if (!SrcLocInfo) {
7145 uint32_t SrcLocStrSize;
7146 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7147 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7148 }
7149
7150 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7151 SrcLocInfo, DeviceID,
7152 PointerNum, RTArgs.BasePointersArray,
7153 RTArgs.PointersArray, RTArgs.SizesArray,
7154 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7155 RTArgs.MappersArray};
7156
7157 if (IsStandAlone) {
7158 assert(MapperFunc && "MapperFunc missing for standalone target data");
7159
7160 auto TaskBodyCB = [&](Value *, Value *,
7162 if (Info.HasNoWait) {
7163 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7167 }
7168
7169 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7170 OffloadingArgs);
7171
7172 if (Info.HasNoWait) {
7173 BasicBlock *OffloadContBlock =
7174 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7175 Function *CurFn = Builder.GetInsertBlock()->getParent();
7176 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7177 Builder.restoreIP(Builder.saveIP());
7178 }
7179 return Error::success();
7180 };
7181
7182 bool RequiresOuterTargetTask = Info.HasNoWait;
7183 if (!RequiresOuterTargetTask)
7184 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7185 /*TargetTaskAllocaIP=*/{}));
7186 else
7187 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7188 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7189 } else {
7190 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7191 omp::OMPRTL___tgt_target_data_begin_mapper);
7192
7193 Builder.CreateCall(BeginMapperFunc, OffloadingArgs);
7194
7195 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7196 if (isa<AllocaInst>(DeviceMap.second.second)) {
7197 auto *LI =
7198 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7199 Builder.CreateStore(LI, DeviceMap.second.second);
7200 }
7201 }
7202
7203 // If device pointer privatization is required, emit the body of the
7204 // region here. It will have to be duplicated: with and without
7205 // privatization.
7206 InsertPointOrErrorTy AfterIP =
7207 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7208 if (!AfterIP)
7209 return AfterIP.takeError();
7210 Builder.restoreIP(*AfterIP);
7211 }
7212 return Error::success();
7213 };
7214
7215 // If we need device pointer privatization, we need to emit the body of the
7216 // region with no privatization in the 'else' branch of the conditional.
7217 // Otherwise, we don't have to do anything.
7218 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7219 InsertPointTy CodeGenIP) -> Error {
7220 InsertPointOrErrorTy AfterIP =
7221 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7222 if (!AfterIP)
7223 return AfterIP.takeError();
7224 Builder.restoreIP(*AfterIP);
7225 return Error::success();
7226 };
7227
7228 // Generate code for the closing of the data region.
7229 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7230 TargetDataRTArgs RTArgs;
7231 Info.EmitDebug = !MapInfo->Names.empty();
7232 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7233
7234 // Emit the number of elements in the offloading arrays.
7235 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7236
7237 // Source location for the ident struct
7238 if (!SrcLocInfo) {
7239 uint32_t SrcLocStrSize;
7240 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7241 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7242 }
7243
7244 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7245 PointerNum, RTArgs.BasePointersArray,
7246 RTArgs.PointersArray, RTArgs.SizesArray,
7247 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7248 RTArgs.MappersArray};
7249 Function *EndMapperFunc =
7250 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7251
7252 Builder.CreateCall(EndMapperFunc, OffloadingArgs);
7253 return Error::success();
7254 };
7255
7256 // We don't have to do anything to close the region if the if clause evaluates
7257 // to false.
7258 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7259 return Error::success();
7260 };
7261
7262 Error Err = [&]() -> Error {
7263 if (BodyGenCB) {
7264 Error Err = [&]() {
7265 if (IfCond)
7266 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7267 return BeginThenGen(AllocaIP, Builder.saveIP());
7268 }();
7269
7270 if (Err)
7271 return Err;
7272
7273 // If we don't require privatization of device pointers, we emit the body
7274 // in between the runtime calls. This avoids duplicating the body code.
7275 InsertPointOrErrorTy AfterIP =
7276 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7277 if (!AfterIP)
7278 return AfterIP.takeError();
7279 restoreIPandDebugLoc(Builder, *AfterIP);
7280
7281 if (IfCond)
7282 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7283 return EndThenGen(AllocaIP, Builder.saveIP());
7284 }
7285 if (IfCond)
7286 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7287 return BeginThenGen(AllocaIP, Builder.saveIP());
7288 }();
7289
7290 if (Err)
7291 return Err;
7292
7293 return Builder.saveIP();
7294}
7295
7297OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7298 bool IsGPUDistribute) {
7299 assert((IVSize == 32 || IVSize == 64) &&
7300 "IV size is not compatible with the omp runtime");
7302 if (IsGPUDistribute)
7303 Name = IVSize == 32
7304 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7305 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7306 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7307 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7308 else
7309 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7310 : omp::OMPRTL___kmpc_for_static_init_4u)
7311 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7312 : omp::OMPRTL___kmpc_for_static_init_8u);
7313
7314 return getOrCreateRuntimeFunction(M, Name);
7315}
7316
7317FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7318 bool IVSigned) {
7319 assert((IVSize == 32 || IVSize == 64) &&
7320 "IV size is not compatible with the omp runtime");
7321 RuntimeFunction Name = IVSize == 32
7322 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7323 : omp::OMPRTL___kmpc_dispatch_init_4u)
7324 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7325 : omp::OMPRTL___kmpc_dispatch_init_8u);
7326
7327 return getOrCreateRuntimeFunction(M, Name);
7328}
7329
7330FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7331 bool IVSigned) {
7332 assert((IVSize == 32 || IVSize == 64) &&
7333 "IV size is not compatible with the omp runtime");
7334 RuntimeFunction Name = IVSize == 32
7335 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7336 : omp::OMPRTL___kmpc_dispatch_next_4u)
7337 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7338 : omp::OMPRTL___kmpc_dispatch_next_8u);
7339
7340 return getOrCreateRuntimeFunction(M, Name);
7341}
7342
7343FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7344 bool IVSigned) {
7345 assert((IVSize == 32 || IVSize == 64) &&
7346 "IV size is not compatible with the omp runtime");
7347 RuntimeFunction Name = IVSize == 32
7348 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7349 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7350 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7351 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7352
7353 return getOrCreateRuntimeFunction(M, Name);
7354}
7355
/// Return (creating it on first use) the runtime entry point that tears down a
/// dispatch loop. Unlike the init/next/fini entry points above, this routine
/// is not specialized on the induction variable's width or signedness.
FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
  return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
}
7359
// Fixes up the debug metadata of an outlined target function: debug variable
// records inside Func that still reference values of the parent function are
// remapped (via ValueReplacementMap) to the outlined function's replacement
// values/argument numbers, and on the device an artificial parameter variable
// is created for the implicit dyn_ptr argument.
// NOTE(review): the opening line of this definition (return type and name,
// presumably `static void FixupDebugInfoForOutlinedFunction(`) was dropped
// from this copy of the file — restore it from upstream before building.
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
    DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {

  // Nothing to fix up if the outlined function carries no debug info.
  DISubprogram *NewSP = Func->getSubprogram();
  if (!NewSP)
    return;

  // NOTE(review): the declaration of RemappedVariables (an old-variable ->
  // new-variable cache used by the lambda below) was dropped here.
  auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
    DILocalVariable *&NewVar = RemappedVariables[OldVar];
    // Only use cached variable if the arg number matches. This is important
    // so that DIVariable created for privatized variables are not discarded.
    if (NewVar && (arg == NewVar->getArg()))
      return NewVar;

    // Re-create the variable with the new argument number but otherwise
    // identical properties (scope, name, file, line, type, flags, ...).
    // NOTE(review): the head of this call (presumably
    // `NewVar = DILocalVariable::get(`) was dropped here.
        Builder.getContext(), OldVar->getScope(), OldVar->getName(),
        OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
        OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
    return NewVar;
  };

  // Rewrite a single debug record: replace remapped location operands and,
  // if any operand was remapped, switch the record to a variable that carries
  // the matching argument number.
  auto UpdateDebugRecord = [&](auto *DR) {
    DILocalVariable *OldVar = DR->getVariable();
    unsigned ArgNo = 0;
    for (auto Loc : DR->location_ops()) {
      auto Iter = ValueReplacementMap.find(Loc);
      if (Iter != ValueReplacementMap.end()) {
        DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
        // Argument numbers in debug info are 1-based.
        ArgNo = std::get<1>(Iter->second) + 1;
      }
    }
    if (ArgNo != 0)
      DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
  };

  // The location and scope of variable intrinsics and records still point to
  // the parent function of the target region. Update them.
  for (Instruction &I : instructions(Func)) {
    // NOTE(review): the head of this assert (checking for an unexpected debug
    // intrinsic) was dropped here.
           "Unexpected debug intrinsic");
    for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
      UpdateDebugRecord(&DVR);
  }
  // An extra argument is passed to the device. Create the debug data for it.
  if (OMPBuilder.Config.isTargetDevice()) {
    DICompileUnit *CU = NewSP->getUnit();
    Module *M = Func->getParent();
    DIBuilder DB(*M, true, CU);
    // Artificial `void *` type for the implicit dyn_ptr parameter.
    DIType *VoidPtrTy =
        DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
    DILocalVariable *Var = DB.createParameterVariable(
        NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
        VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
    auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
    // Attach the declare to the first argument at the top of the function.
    DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
                     &(*Func->begin()));
  }
}
7421
// Body of a small helper that strips a single addrspacecast: if V is an
// addrspacecast operator (instruction or constant expression), return its
// source operand; otherwise return V unchanged.
// NOTE(review): the signature line of this helper (presumably
// `static Value *...(Value *V) {`) was dropped from this copy of the file —
// restore it from upstream before building.
  if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
    return cast<Operator>(V)->getOperand(0);
  return V;
}
7427
// Outlines a target region into a fresh internal function: builds the
// parameter list (device kernels get an implicit launch-info pointer and
// pointer/i64 parameters), forwards target-cpu/target-features attributes,
// generates the region body via CBFunc, and rewrites all uses of host input
// values inside the new function to the values produced by ArgAccessorFuncCB.
// NOTE(review): several lines were dropped from this copy of the file — the
// opening line of this definition (return type and name), the declaration of
// ValueReplacementMap, the `for (User *... : Users)` loop header inside
// ReplaceValue, and two conditionals in the input-rewriting loop. Restore
// them from upstream before building; review notes mark each elision below.
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
    const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
    StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
  // Build the parameter type list for the outlined function.
  SmallVector<Type *> ParameterTypes;
  if (OMPBuilder.Config.isTargetDevice()) {
    // Add the "implicit" runtime argument we use to provide launch specific
    // information for target devices.
    auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
    ParameterTypes.push_back(Int8PtrTy);

    // All parameters to target devices are passed as pointers
    // or i64. This assumes 64-bit address spaces/pointers.
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType()->isPointerTy()
                                   ? Arg->getType()
                                   : Type::getInt64Ty(Builder.getContext()));
  } else {
    // On the host, arguments keep their original types.
    for (auto &Arg : Inputs)
      ParameterTypes.push_back(Arg->getType());
  }

  auto BB = Builder.GetInsertBlock();
  auto M = BB->getModule();
  auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
                                    /*isVarArg*/ false);
  auto Func =
      Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);

  // Forward target-cpu and target-features function attributes from the
  // original function to the new outlined function.
  Function *ParentFn = Builder.GetInsertBlock()->getParent();

  auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
  if (TargetCpuAttr.isStringAttribute())
    Func->addFnAttr(TargetCpuAttr);

  auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
  if (TargetFeaturesAttr.isStringAttribute())
    Func->addFnAttr(TargetFeaturesAttr);

  if (OMPBuilder.Config.isTargetDevice()) {
    // Record the kernel's execution mode so the device runtime can query it,
    // and keep the global alive via llvm.compiler.used.
    Value *ExecMode =
        OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
    OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
  }

  // Save insert point.
  IRBuilder<>::InsertPointGuard IPG(Builder);
  // We will generate the entries in the outlined function but the debug
  // location may still be pointing to the parent function. Reset it now.
  Builder.SetCurrentDebugLocation(llvm::DebugLoc());

  // Generate the region into the function.
  BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
  Builder.SetInsertPoint(EntryBB);

  // Insert target init call in the device compilation pass.
  if (OMPBuilder.Config.isTargetDevice())
    Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));

  BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();

  // As we embed the user code in the middle of our target region after we
  // generate entry code, we must move what allocas we can into the entry
  // block to avoid possible breaking optimisations for device
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);

  // Insert target deinit call in the device compilation pass.
  BasicBlock *OutlinedBodyBB =
      splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
      Builder.saveIP(),
      OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  if (OMPBuilder.Config.isTargetDevice())
    OMPBuilder.createTargetDeinit(Builder);

  // Insert return instruction.
  Builder.CreateRetVoid();

  // New Alloca IP at entry point of created device function.
  Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
  auto AllocaIP = Builder.saveIP();

  Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());

  // Skip the artificial dyn_ptr on the device.
  const auto &ArgRange =
      OMPBuilder.Config.isTargetDevice()
          ? make_range(Func->arg_begin() + 1, Func->arg_end())
          : Func->args();

  // NOTE(review): the declaration of ValueReplacementMap (mapping each input
  // value to its replacement and argument index; used below and passed to
  // FixupDebugInfoForOutlinedFunction) was dropped here.

  auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
    // Things like GEP's can come in the form of Constants. Constants and
    // ConstantExpr's do not have access to the knowledge of what they're
    // contained in, so we must dig a little to find an instruction so we
    // can tell if they're used inside of the function we're outlining. We
    // also replace the original constant expression with a new instruction
    // equivalent; an instruction as it allows easy modification in the
    // following loop, as we can now know the constant (instruction) is
    // owned by our target function and replaceUsesOfWith can now be invoked
    // on it (cannot do this with constants it seems). A brand new one also
    // allows us to be cautious as it is perhaps possible the old expression
    // was used inside of the function but exists and is used externally
    // (unlikely by the nature of a Constant, but still).
    // NOTE: We cannot remove dead constants that have been rewritten to
    // instructions at this stage, we run the risk of breaking later lowering
    // by doing so as we could still be in the process of lowering the module
    // from MLIR to LLVM-IR and the MLIR lowering may still require the original
    // constants we have created rewritten versions of.
    if (auto *Const = dyn_cast<Constant>(Input))
      convertUsersOfConstantsToInstructions(Const, Func, false);

    // Collect users before iterating over them to avoid invalidating the
    // iteration in case a user uses Input more than once (e.g. a call
    // instruction).
    SetVector<User *> Users(Input->users().begin(), Input->users().end());
    // Collect all the instructions
    // NOTE(review): the loop header iterating over Users was dropped here.
      if (auto *Instr = dyn_cast<Instruction>(User))
        if (Instr->getFunction() == Func)
          Instr->replaceUsesOfWith(Input, InputCopy);
  };

  SmallVector<std::pair<Value *, Value *>> DeferredReplacement;

  // Rewrite uses of input values to parameters.
  for (auto InArg : zip(Inputs, ArgRange)) {
    Value *Input = std::get<0>(InArg);
    Argument &Arg = std::get<1>(InArg);
    Value *InputCopy = nullptr;

    // Let the caller materialize the in-function copy of this input.
    llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
        ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);
    ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());

    // In certain cases a Global may be set up for replacement, however, this
    // Global may be used in multiple arguments to the kernel, just segmented
    // apart, for example, if we have a global array, that is sectioned into
    // multiple mappings (technically not legal in OpenMP, but there is a case
    // in Fortran for Common Blocks where this is necessary), we will end up
    // with GEP's into this array inside the kernel, that refer to the Global
    // but are technically separate arguments to the kernel for all intents and
    // purposes. If we have mapped a segment that requires a GEP into the 0-th
    // index, it will fold into an referal to the Global, if we then encounter
    // this folded GEP during replacement all of the references to the
    // Global in the kernel will be replaced with the argument we have generated
    // that corresponds to it, including any other GEP's that refer to the
    // Global that may be other arguments. This will invalidate all of the other
    // preceding mapped arguments that refer to the same global that may be
    // separate segments. To prevent this, we defer global processing until all
    // other processing has been performed.
    // NOTE(review): the conditional (checking for a global input) that opens
    // this deferred-replacement branch was dropped here.
      DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
      continue;
    }

    // NOTE(review): the condition guarding this `continue` was dropped here.
      continue;

    ReplaceValue(Input, InputCopy, Func);
  }

  // Replace all of our deferred Input values, currently just Globals.
  for (auto Deferred : DeferredReplacement)
    ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);

  // Point debug records at the outlined function's values/arguments.
  FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
                                    ValueReplacementMap);
  return Func;
}
7611/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7612/// of pointers containing shared data between the parent task and the created
7613/// task.
7614static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7615 IRBuilderBase &Builder,
7616 Value *TaskWithPrivates,
7617 Type *TaskWithPrivatesTy) {
7618
7619 Type *TaskTy = OMPIRBuilder.Task;
7620 LLVMContext &Ctx = Builder.getContext();
7621 Value *TaskT =
7622 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7623 Value *Shareds = TaskT;
7624 // TaskWithPrivatesTy can be one of the following
7625 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7626 // %struct.privates }
7627 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7628 //
7629 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7630 // its first member has to be the task descriptor. TaskTy is the type of the
7631 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7632 // first member of TaskT, gives us the pointer to shared data.
7633 if (TaskWithPrivatesTy != TaskTy)
7634 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7635 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7636}
/// Create an entry point for a target task with the following.
/// It'll have the following signature
/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
/// This function is called from emitTargetTask once the
/// code to launch the target kernel has been outlined already.
/// NumOffloadingArrays is the number of offloading arrays that we need to copy
/// into the task structure so that the deferred target task can access this
/// data even after the stack frame of the generating task has been rolled
/// back. Offloading arrays contain base pointers, pointers, sizes etc
/// of the data that the target kernel will access. These in effect are the
/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
// NOTE(review): the opening line of this definition (return type and name,
// presumably `static Function *emitTargetTaskProxyFunction(`) was dropped
// from this copy of the file — restore it from upstream before building.
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
    StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
    const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {

  // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
  // This is because PrivatesTy is the type of the structure in which
  // we pass the offloading arrays to the deferred target task.
  assert((!NumOffloadingArrays || PrivatesTy) &&
         "PrivatesTy cannot be nullptr when there are offloadingArrays"
         "to privatize");

  Module &M = OMPBuilder.M;
  // KernelLaunchFunction is the target launch function, i.e.
  // the function that sets up kernel arguments and calls
  // __tgt_target_kernel to launch the kernel on the device.
  //
  Function *KernelLaunchFunction = StaleCI->getCalledFunction();

  // StaleCI is the CallInst which is the call to the outlined
  // target kernel launch function. If there are local live-in values
  // that the outlined function uses then these are aggregated into a structure
  // which is passed as the second argument. If there are no local live-in
  // values or if all values used by the outlined kernel are global variables,
  // then there's only one argument, the threadID. So, StaleCI can be
  //
  // %structArg = alloca { ptr, ptr }, align 8
  // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
  // store ptr %20, ptr %gep_, align 8
  // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
  // store ptr %21, ptr %gep_8, align 8
  // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
  //
  // OR
  //
  // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
  OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
                                    StaleCI->getIterator());

  LLVMContext &Ctx = StaleCI->getParent()->getContext();

  Type *ThreadIDTy = Type::getInt32Ty(Ctx);
  Type *TaskPtrTy = OMPBuilder.TaskPtr;
  [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;

  // The proxy's signature is fixed by the tasking runtime: (i32, ptr).
  auto ProxyFnTy =
      FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
                        /* isVarArg */ false);
  auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
                                  ".omp_target_task_proxy_func",
                                  Builder.GetInsertBlock()->getModule());
  Value *ThreadId = ProxyFn->getArg(0);
  Value *TaskWithPrivates = ProxyFn->getArg(1);
  ThreadId->setName("thread.id");
  TaskWithPrivates->setName("task");

  bool HasShareds = SharedArgsOperandNo > 0;
  bool HasOffloadingArrays = NumOffloadingArrays > 0;
  BasicBlock *EntryBB =
      BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
  Builder.SetInsertPoint(EntryBB);

  SmallVector<Value *> KernelLaunchArgs;
  KernelLaunchArgs.reserve(StaleCI->arg_size());
  KernelLaunchArgs.push_back(ThreadId);

  if (HasOffloadingArrays) {
    assert(TaskTy != TaskWithPrivatesTy &&
           "If there are offloading arrays to pass to the target"
           "TaskTy cannot be the same as TaskWithPrivatesTy");
    (void)TaskTy;
    // Forward a GEP to each privatized offloading array (second member of the
    // task-with-privates struct) to the launch function.
    Value *Privates =
        Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
    for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
      KernelLaunchArgs.push_back(
          Builder.CreateStructGEP(PrivatesTy, Privates, i));
  }

  if (HasShareds) {
    auto *ArgStructAlloca =
        dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
    assert(ArgStructAlloca &&
           "Unable to find the alloca instruction corresponding to arguments "
           "for extracted function");
    auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());

    // Copy the shared-argument struct out of the task descriptor into a local
    // alloca before handing it to the launch function.
    AllocaInst *NewArgStructAlloca =
        Builder.CreateAlloca(ArgStructType, nullptr, "structArg");

    Value *SharedsSize =
        Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));

    // NOTE(review): the line declaring LoadShared (the head of the call to
    // loadSharedDataFromTaskDescriptor) was dropped here in this copy.
        OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);

    Builder.CreateMemCpy(
        NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
        LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
    KernelLaunchArgs.push_back(NewArgStructAlloca);
  }
  Builder.CreateCall(KernelLaunchFunction, KernelLaunchArgs);
  Builder.CreateRetVoid();
  return ProxyFn;
}
7753
// Body of getOffloadingArrayType: given a pointer to an offloading array
// (either a GEP into it or the alloca that created it), return the array's
// element type; any other instruction kind is a logic error.
// NOTE(review): the signature line of this helper (presumably
// `static Type *getOffloadingArrayType(Value *V) {`) was dropped from this
// copy of the file — restore it from upstream before building.
  if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
    return GEP->getSourceElementType();
  if (auto *Alloca = dyn_cast<AllocaInst>(V))
    return Alloca->getAllocatedType();

  // Only GEPs and allocas are produced for offloading arrays; anything else
  // indicates a caller bug.
  llvm_unreachable("Unhandled Instruction type");
  return nullptr;
}
7762// This function returns a struct that has at most two members.
7763// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7764// descriptor. The second member, if needed, is a struct containing arrays
7765// that need to be passed to the offloaded target kernel. For example,
7766// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7767// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7768// respectively, then the types created by this function are
7769//
7770// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7771// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7772// %struct.privates }
7773// %struct.task_with_privates is returned by this function.
7774// If there aren't any offloading arrays to pass to the target kernel,
7775// %struct.kmp_task_ompbuilder_t is returned.
7776static StructType *
7777createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
7778 ArrayRef<Value *> OffloadingArraysToPrivatize) {
7779
7780 if (OffloadingArraysToPrivatize.empty())
7781 return OMPIRBuilder.Task;
7782
7783 SmallVector<Type *, 4> StructFieldTypes;
7784 for (Value *V : OffloadingArraysToPrivatize) {
7785 assert(V->getType()->isPointerTy() &&
7786 "Expected pointer to array to privatize. Got a non-pointer value "
7787 "instead");
7788 Type *ArrayTy = getOffloadingArrayType(V);
7789 assert(ArrayTy && "ArrayType cannot be nullptr");
7790 StructFieldTypes.push_back(ArrayTy);
7791 }
7792 StructType *PrivatesStructTy =
7793 StructType::create(StructFieldTypes, "struct.privates");
7794 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
7795 "struct.task_with_privates");
7796}
// Registers the target region as an offload entry: wraps
// createOutlinedFunction in a generator callback and hands it to
// emitTargetRegionFunction, which fills in OutlinedFn and OutlinedFnID.
// NOTE(review): the opening line of this definition (return type and name)
// and the `Inputs` parameter line were dropped from this copy of the file —
// restore them from upstream before building.
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
    TargetRegionEntryInfo &EntryInfo,
    const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
    Function *&OutlinedFn, Constant *&OutlinedFnID,
    // NOTE(review): the `Inputs` parameter declaration was dropped here.
    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {

  // Deferred generator: only invoked by emitTargetRegionFunction when the
  // outlined function actually needs to be created.
  OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
      [&](StringRef EntryFnName) {
        return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
                                      EntryFnName, Inputs, CBFunc,
                                      ArgAccessorFuncCB);
      };

  return OMPBuilder.emitTargetRegionFunction(
      EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
      OutlinedFnID);
}
7817
7818OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
7819 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
7820 OpenMPIRBuilder::InsertPointTy AllocaIP,
7822 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
7823
7824 // The following explains the code-gen scenario for the `target` directive. A
7825 // similar scneario is followed for other device-related directives (e.g.
7826 // `target enter data`) but in similar fashion since we only need to emit task
7827 // that encapsulates the proper runtime call.
7828 //
7829 // When we arrive at this function, the target region itself has been
7830 // outlined into the function OutlinedFn.
7831 // So at ths point, for
7832 // --------------------------------------------------------------
7833 // void user_code_that_offloads(...) {
7834 // omp target depend(..) map(from:a) map(to:b) private(i)
7835 // do i = 1, 10
7836 // a(i) = b(i) + n
7837 // }
7838 //
7839 // --------------------------------------------------------------
7840 //
7841 // we have
7842 //
7843 // --------------------------------------------------------------
7844 //
7845 // void user_code_that_offloads(...) {
7846 // %.offload_baseptrs = alloca [2 x ptr], align 8
7847 // %.offload_ptrs = alloca [2 x ptr], align 8
7848 // %.offload_mappers = alloca [2 x ptr], align 8
7849 // ;; target region has been outlined and now we need to
7850 // ;; offload to it via a target task.
7851 // }
7852 // void outlined_device_function(ptr a, ptr b, ptr n) {
7853 // n = *n_ptr;
7854 // do i = 1, 10
7855 // a(i) = b(i) + n
7856 // }
7857 //
7858 // We have to now do the following
7859 // (i) Make an offloading call to outlined_device_function using the OpenMP
7860 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
7861 // emitted by emitKernelLaunch
7862 // (ii) Create a task entry point function that calls kernel_launch_function
7863 // and is the entry point for the target task. See
7864 // '@.omp_target_task_proxy_func in the pseudocode below.
7865 // (iii) Create a task with the task entry point created in (ii)
7866 //
7867 // That is we create the following
7868 // struct task_with_privates {
7869 // struct kmp_task_ompbuilder_t task_struct;
7870 // struct privates {
7871 // [2 x ptr] ; baseptrs
7872 // [2 x ptr] ; ptrs
7873 // [2 x i64] ; sizes
7874 // }
7875 // }
7876 // void user_code_that_offloads(...) {
7877 // %.offload_baseptrs = alloca [2 x ptr], align 8
7878 // %.offload_ptrs = alloca [2 x ptr], align 8
7879 // %.offload_sizes = alloca [2 x i64], align 8
7880 //
7881 // %structArg = alloca { ptr, ptr, ptr }, align 8
7882 // %strucArg[0] = a
7883 // %strucArg[1] = b
7884 // %strucArg[2] = &n
7885 //
7886 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
7887 // sizeof(kmp_task_ompbuilder_t),
7888 // sizeof(structArg),
7889 // @.omp_target_task_proxy_func,
7890 // ...)
7891 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
7892 // sizeof(structArg))
7893 // memcpy(target_task_with_privates->privates->baseptrs,
7894 // offload_baseptrs, sizeof(offload_baseptrs)
7895 // memcpy(target_task_with_privates->privates->ptrs,
7896 // offload_ptrs, sizeof(offload_ptrs)
7897 // memcpy(target_task_with_privates->privates->sizes,
7898 // offload_sizes, sizeof(offload_sizes)
7899 // dependencies_array = ...
7900 // ;; if nowait not present
7901 // call @__kmpc_omp_wait_deps(..., dependencies_array)
7902 // call @__kmpc_omp_task_begin_if0(...)
7903 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
7904 // %target_task_with_privates)
7905 // call @__kmpc_omp_task_complete_if0(...)
7906 // }
7907 //
7908 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
7909 // ptr %task) {
7910 // %structArg = alloca {ptr, ptr, ptr}
7911 // %task_ptr = getelementptr(%task, 0, 0)
7912 // %shared_data = load (getelementptr %task_ptr, 0, 0)
7913 // mempcy(%structArg, %shared_data, sizeof(%structArg))
7914 //
7915 // %offloading_arrays = getelementptr(%task, 0, 1)
7916 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
7917 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
7918 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
7919 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
7920 // %offload_sizes, %structArg)
7921 // }
7922 //
7923 // We need the proxy function because the signature of the task entry point
7924 // expected by kmpc_omp_task is always the same and will be different from
7925 // that of the kernel_launch function.
7926 //
7927 // kernel_launch_function is generated by emitKernelLaunch and has the
7928 // always_inline attribute. For this example, it'll look like so:
7929 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
7930 // %offload_sizes, %structArg) alwaysinline {
7931 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
7932 // ; load aggregated data from %structArg
7933 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
7934 // ; offload_sizes
7935 // call i32 @__tgt_target_kernel(...,
7936 // outlined_device_function,
7937 // ptr %kernel_args)
7938 // }
7939 // void outlined_device_function(ptr a, ptr b, ptr n) {
7940 // n = *n_ptr;
7941 // do i = 1, 10
7942 // a(i) = b(i) + n
7943 // }
7944 //
7945 BasicBlock *TargetTaskBodyBB =
7946 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
7947 BasicBlock *TargetTaskAllocaBB =
7948 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
7949
7950 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
7951 TargetTaskAllocaBB->begin());
7952 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
7953
7954 OutlineInfo OI;
7955 OI.EntryBB = TargetTaskAllocaBB;
7956 OI.OuterAllocaBB = AllocaIP.getBlock();
7957
7958 // Add the thread ID argument.
7960 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
7961 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
7962
7963 // Generate the task body which will subsequently be outlined.
7964 Builder.restoreIP(TargetTaskBodyIP);
7965 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
7966 return Err;
7967
7968 // The outliner (CodeExtractor) extract a sequence or vector of blocks that
7969 // it is given. These blocks are enumerated by
7970 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
7971 // to be outside the region. In other words, OI.ExitBlock is expected to be
7972 // the start of the region after the outlining. We used to set OI.ExitBlock
7973 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
7974 // except when the task body is a single basic block. In that case,
7975 // OI.ExitBlock is set to the single task body block and will get left out of
7976 // the outlining process. So, simply create a new empty block to which we
  // unconditionally branch from where TaskBodyCB left off.
7978 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
7979 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
7980 /*IsFinished=*/true);
7981
7982 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
7983 bool NeedsTargetTask = HasNoWait && DeviceID;
7984 if (NeedsTargetTask) {
7985 for (auto *V :
7986 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
7987 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
7988 RTArgs.SizesArray}) {
7990 OffloadingArraysToPrivatize.push_back(V);
7991 OI.ExcludeArgsFromAggregate.push_back(V);
7992 }
7993 }
7994 }
7995 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
7996 DeviceID, OffloadingArraysToPrivatize](
7997 Function &OutlinedFn) mutable {
7998 assert(OutlinedFn.hasOneUse() &&
7999 "there must be a single user for the outlined function");
8000
8001 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8002
8003 // The first argument of StaleCI is always the thread id.
8004 // The next few arguments are the pointers to offloading arrays
8005 // if any. (see OffloadingArraysToPrivatize)
8006 // Finally, all other local values that are live-in into the outlined region
8007 // end up in a structure whose pointer is passed as the last argument. This
8008 // piece of data is passed in the "shared" field of the task structure. So,
8009 // we know we have to pass shareds to the task if the number of arguments is
8010 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
8011 // thread id. Further, for safety, we assert that the number of arguments of
8012 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8013 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8014 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8015 assert((!HasShareds ||
8016 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8017 "Wrong number of arguments for StaleCI when shareds are present");
8018 int SharedArgOperandNo =
8019 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8020
8021 StructType *TaskWithPrivatesTy =
8022 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8023 StructType *PrivatesTy = nullptr;
8024
8025 if (!OffloadingArraysToPrivatize.empty())
8026 PrivatesTy =
8027 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8028
8030 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8031 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8032
8033 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8034 << "\n");
8035
8036 Builder.SetInsertPoint(StaleCI);
8037
8038 // Gather the arguments for emitting the runtime call.
8039 uint32_t SrcLocStrSize;
8040 Constant *SrcLocStr =
8041 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8042 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8043
8044 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8045 //
8046 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8047 // the DeviceID to the deferred task and also since
8048 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8049 Function *TaskAllocFn =
8050 !NeedsTargetTask
8051 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8052 : getOrCreateRuntimeFunctionPtr(
8053 OMPRTL___kmpc_omp_target_task_alloc);
8054
    // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
    // call.
8057 Value *ThreadID = getOrCreateThreadID(Ident);
8058
8059 // Argument - `sizeof_kmp_task_t` (TaskSize)
8060 // Tasksize refers to the size in bytes of kmp_task_t data structure
8061 // plus any other data to be passed to the target task, if any, which
8062 // is packed into a struct. kmp_task_t and the struct so created are
8063 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8064 Value *TaskSize = Builder.getInt64(
8065 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8066
8067 // Argument - `sizeof_shareds` (SharedsSize)
8068 // SharedsSize refers to the shareds array size in the kmp_task_t data
8069 // structure.
8070 Value *SharedsSize = Builder.getInt64(0);
8071 if (HasShareds) {
8072 auto *ArgStructAlloca =
8073 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8074 assert(ArgStructAlloca &&
8075 "Unable to find the alloca instruction corresponding to arguments "
8076 "for extracted function");
8077 auto *ArgStructType =
8078 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8079 assert(ArgStructType && "Unable to find struct type corresponding to "
8080 "arguments for extracted function");
8081 SharedsSize =
8082 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8083 }
8084
8085 // Argument - `flags`
8086 // Task is tied iff (Flags & 1) == 1.
8087 // Task is untied iff (Flags & 1) == 0.
8088 // Task is final iff (Flags & 2) == 2.
8089 // Task is not final iff (Flags & 2) == 0.
8090 // A target task is not final and is untied.
8091 Value *Flags = Builder.getInt32(0);
8092
8093 // Emit the @__kmpc_omp_task_alloc runtime call
8094 // The runtime call returns a pointer to an area where the task captured
8095 // variables must be copied before the task is run (TaskData)
8096 CallInst *TaskData = nullptr;
8097
8098 SmallVector<llvm::Value *> TaskAllocArgs = {
8099 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8100 /*flags=*/Flags,
8101 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8102 /*task_func=*/ProxyFn};
8103
8104 if (NeedsTargetTask) {
8105 assert(DeviceID && "Expected non-empty device ID.");
8106 TaskAllocArgs.push_back(DeviceID);
8107 }
8108
8109 TaskData = Builder.CreateCall(TaskAllocFn, TaskAllocArgs);
8110
8111 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8112 if (HasShareds) {
8113 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8115 *this, Builder, TaskData, TaskWithPrivatesTy);
8116 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8117 SharedsSize);
8118 }
8119 if (!OffloadingArraysToPrivatize.empty()) {
8120 Value *Privates =
8121 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8122 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8123 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8124 [[maybe_unused]] Type *ArrayType =
8125 getOffloadingArrayType(PtrToPrivatize);
8126 assert(ArrayType && "ArrayType cannot be nullptr");
8127
8128 Type *ElementType = PrivatesTy->getElementType(i);
8129 assert(ElementType == ArrayType &&
8130 "ElementType should match ArrayType");
8131 (void)ArrayType;
8132
8133 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8134 Builder.CreateMemCpy(
8135 Dst, Alignment, PtrToPrivatize, Alignment,
8136 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8137 }
8138 }
8139
8140 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8141
8142 // ---------------------------------------------------------------
8143 // V5.2 13.8 target construct
8144 // If the nowait clause is present, execution of the target task
8145 // may be deferred. If the nowait clause is not present, the target task is
8146 // an included task.
8147 // ---------------------------------------------------------------
8148 // The above means that the lack of a nowait on the target construct
8149 // translates to '#pragma omp task if(0)'
8150 if (!NeedsTargetTask) {
8151 if (DepArray) {
8152 Function *TaskWaitFn =
8153 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8154 Builder.CreateCall(
8155 TaskWaitFn,
8156 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8157 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8158 /*dep_list=*/DepArray,
8159 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8160 /*noalias_dep_list=*/
8162 }
8163 // Included task.
8164 Function *TaskBeginFn =
8165 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8166 Function *TaskCompleteFn =
8167 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8168 Builder.CreateCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8169 CallInst *CI = Builder.CreateCall(ProxyFn, {ThreadID, TaskData});
8170 CI->setDebugLoc(StaleCI->getDebugLoc());
8171 Builder.CreateCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8172 } else if (DepArray) {
8173 // HasNoWait - meaning the task may be deferred. Call
8174 // __kmpc_omp_task_with_deps if there are dependencies,
8175 // else call __kmpc_omp_task
8176 Function *TaskFn =
8177 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8178 Builder.CreateCall(
8179 TaskFn,
8180 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8181 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8183 } else {
8184 // Emit the @__kmpc_omp_task runtime call to spawn the task
8185 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8186 Builder.CreateCall(TaskFn, {Ident, ThreadID, TaskData});
8187 }
8188
8189 StaleCI->eraseFromParent();
8190 for (Instruction *I : llvm::reverse(ToBeDeleted))
8191 I->eraseFromParent();
8192 };
8193 addOutlineInfo(std::move(OI));
8194
8195 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8196 << *(Builder.GetInsertBlock()) << "\n");
8197 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8198 << *(Builder.GetInsertBlock()->getParent()->getParent())
8199 << "\n");
8200 return Builder.saveIP();
8201}
8202
8203Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8204 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8205 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8206 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8207 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8208 if (Error Err =
8209 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8210 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8211 return Err;
8212 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8213 return Error::success();
8214}
8215
// Emit the host-side code for an OpenMP `target` construct: either the actual
// kernel launch (wrapped in an explicit target task when `nowait` or `depend`
// clauses are present) or the host fallback, optionally guarded by an `if`
// clause. When no offload entry exists (OutlinedFnID == nullptr), only the
// host fallback path is generated.
static void emitTargetCall(
    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
    OpenMPIRBuilder::InsertPointTy AllocaIP,
    OpenMPIRBuilder::TargetDataInfo &Info,
    const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
    const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
    Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
    OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
    OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
    bool HasNoWait) {
  // Generate a function call to the host fallback implementation of the target
  // region. This is called by the host when no offload entry was generated for
  // the target region and when the offloading call fails at runtime.
  auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
      -> OpenMPIRBuilder::InsertPointOrErrorTy {
    Builder.restoreIP(IP);
    Builder.CreateCall(OutlinedFn, Args);
    return Builder.saveIP();
  };

  // A `nowait` clause or any `depend` clause forces the kernel launch to be
  // wrapped in an explicit target task (see emitTargetTask).
  bool HasDependencies = Dependencies.size() > 0;
  bool RequiresOuterTargetTask = HasNoWait || HasDependencies;

  // Kernel arguments; filled in by EmitTargetCallThen and captured by
  // reference in TaskBodyCB below.
  OpenMPIRBuilder::TargetKernelArgs KArgs;

  // Body generator for the target task: performs the kernel launch (or the
  // host fallback when there is nothing to offload).
  auto TaskBodyCB =
      [&](Value *DeviceID, Value *RTLoc,
          IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
    // Assume no error was returned because EmitTargetCallFallbackCB doesn't
    // produce any.
    llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
      // emitKernelLaunch makes the necessary runtime call to offload the
      // kernel. We then outline all that code into a separate function
      // ('kernel_launch_function' in the pseudo code above). This function is
      // then called by the target task proxy function (see
      // '@.omp_target_task_proxy_func' in the pseudo code above)
      // "@.omp_target_task_proxy_func' is generated by
      // emitTargetTaskProxyFunction.
      if (OutlinedFnID && DeviceID)
        return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                           EmitTargetCallFallbackCB, KArgs,
                                           DeviceID, RTLoc, TargetTaskAllocaIP);

      // We only need to do the outlining if `DeviceID` is set to avoid calling
      // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
      // generating the `else` branch of an `if` clause.
      //
      // When OutlinedFnID is set to nullptr, then it's not an offloading call.
      // In this case, we execute the host implementation directly.
      return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
    }());

    OMPBuilder.Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // Host-fallback ('else') path: run the outlined host version, still inside
  // a target task if nowait/depend semantics require one.
  auto &&EmitTargetCallElse =
      [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
          OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
    // Assume no error was returned because EmitTargetCallFallbackCB doesn't
    // produce any.
    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
      if (RequiresOuterTargetTask) {
        // Arguments that are intended to be directly forwarded to an
        // emitKernelLaunch call are passed as nullptr, since
        // OutlinedFnID=nullptr results in that call not being done.
        OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
        return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
                                         /*RTLoc=*/nullptr, AllocaIP,
                                         Dependencies, EmptyRTArgs, HasNoWait);
      }
      return EmitTargetCallFallbackCB(Builder.saveIP());
    }());

    Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // Offload ('then') path: set up the offloading arrays and kernel arguments,
  // then emit the kernel launch (possibly wrapped in a target task).
  auto &&EmitTargetCallThen =
      [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
          OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
    Info.HasNoWait = HasNoWait;
    OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
    OpenMPIRBuilder::TargetDataRTArgs RTArgs;
    if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
            AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
            /*IsNonContiguous=*/true,
            /*ForEndCall=*/false))
      return Err;

    // One entry per teams dimension: runtime value if present, otherwise the
    // compile-time default.
    SmallVector<Value *, 3> NumTeamsC;
    for (auto [DefaultVal, RuntimeVal] :
         zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
      NumTeamsC.push_back(RuntimeVal ? RuntimeVal
                                     : Builder.getInt32(DefaultVal));

    // Calculate number of threads: 0 if no clauses specified, otherwise it is
    // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
    auto InitMaxThreadsClause = [&Builder](Value *Clause) {
      if (Clause)
        Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
                                       /*isSigned=*/false);
      return Clause;
    };
    auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
      if (Clause)
        Result =
            Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
                                          Result, Clause)
                   : Clause;
    };

    // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
    // the NUM_THREADS clause is overridden by THREAD_LIMIT.
    SmallVector<Value *, 3> NumThreadsC;
    Value *MaxThreadsClause =
        RuntimeAttrs.TeamsThreadLimit.size() == 1
            ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
            : nullptr;

    for (auto [TeamsVal, TargetVal] : zip_equal(
             RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
      Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
      Value *NumThreads = InitMaxThreadsClause(TargetVal);

      CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
      CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);

      NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
    }

    unsigned NumTargetItems = Info.NumberOfPtrs;
    // TODO: Use correct device ID
    Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
    Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                               llvm::omp::IdentFlag(0), 0);

    // Trip count (zero-extended to i64) enables runtime loop scheduling
    // heuristics; 0 means "unknown".
    Value *TripCount = RuntimeAttrs.LoopTripCount
                           ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
                                                   Builder.getInt64Ty(),
                                                   /*isSigned=*/false)
                           : Builder.getInt64(0);

    // TODO: Use correct DynCGGroupMem
    Value *DynCGGroupMem = Builder.getInt32(0);

    KArgs = OpenMPIRBuilder::TargetKernelArgs(NumTargetItems, RTArgs, TripCount,
                                              NumTeamsC, NumThreadsC,
                                              DynCGGroupMem, HasNoWait);

    // Assume no error was returned because TaskBodyCB and
    // EmitTargetCallFallbackCB don't produce any.
    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
      // The presence of certain clauses on the target directive require the
      // explicit generation of the target task.
      if (RequiresOuterTargetTask)
        return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
                                         Dependencies, KArgs.RTArgs,
                                         Info.HasNoWait);

      return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
                                         EmitTargetCallFallbackCB, KArgs,
                                         DeviceID, RTLoc, AllocaIP);
    }());

    Builder.restoreIP(AfterIP);
    return Error::success();
  };

  // If we don't have an ID for the target region, it means an offload entry
  // wasn't created. In this case we just run the host fallback directly and
  // ignore any potential 'if' clauses.
  if (!OutlinedFnID) {
    cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
    return;
  }

  // If there's no 'if' clause, only generate the kernel launch code path.
  if (!IfCond) {
    cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
    return;
  }

  cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
                                   EmitTargetCallElse, AllocaIP));
}
8406
// Create the code for an OpenMP `target` region: outline the region into its
// own function (using CBFunc/ArgAccessorFuncCB to generate the body) and, when
// compiling for the host, emit the offloading call that launches it.
// Returns the updated insert point, or the default-constructed insert point
// if \p Loc is invalid.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
    const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, TargetDataInfo &Info,
    TargetRegionEntryInfo &EntryInfo,
    const TargetKernelDefaultAttrs &DefaultAttrs,
    const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
    SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
    OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
    CustomMapperCallbackTy CustomMapperCB,
    const SmallVector<DependData> &Dependencies, bool HasNowait) {

  if (!updateToLocation(Loc))
    return InsertPointTy();

  Builder.restoreIP(CodeGenIP);

  Function *OutlinedFn;
  Constant *OutlinedFnID = nullptr;
  // The target region is outlined into its own function. The LLVM IR for
  // the target region itself is generated using the callbacks CBFunc
  // and ArgAccessorFuncCB
      *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
      OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
    return Err;

  // If we are not on the target device, then we need to generate code
  // to make a remote call (offload) to the previously outlined function
  // that represents the target region. Do that now.
  if (!Config.isTargetDevice())
    emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
                   IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
                   CustomMapperCB, Dependencies, HasNowait);
  return Builder.saveIP();
}
8443
8444std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8445 StringRef FirstSeparator,
8446 StringRef Separator) {
8447 SmallString<128> Buffer;
8448 llvm::raw_svector_ostream OS(Buffer);
8449 StringRef Sep = FirstSeparator;
8450 for (StringRef Part : Parts) {
8451 OS << Sep << Part;
8452 Sep = Separator;
8453 }
8454 return OS.str().str();
8455}
8456
8457std::string
8458OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8459 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8460 Config.separator());
8461}
8462
// Return the OpenMP-internal global variable with the given \p Name, creating
// it (zero-initialized, with an alignment that is the max of the type's ABI
// alignment and the pointer ABI alignment) on first request. Variables are
// deduplicated by name; requesting an existing name with a different type is
// a programming error caught by the assert below.
OpenMPIRBuilder::getOrCreateInternalVariable(Type *Ty, const StringRef &Name,
                                             unsigned AddressSpace) {
  // try_emplace either finds the cached variable or inserts a null slot that
  // is filled in below.
  auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
  if (Elem.second) {
    assert(Elem.second->getValueType() == Ty &&
           "OMP internal variable has different type than requested");
  } else {
    // TODO: investigate the appropriate linkage type used for the global
    // variable for possibly changing that to internal or private, or maybe
    // create different versions of the function for different OMP internal
    // variables.
    // NOTE(review): wasm32 gets special-cased linkage here — confirm the
    // intended linkage pair against upstream.
    auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
    auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
                                  Constant::getNullValue(Ty), Elem.first(),
                                  /*InsertBefore=*/nullptr,
    const DataLayout &DL = M.getDataLayout();
    const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
    const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpace);
    // Use the stricter of the two alignments.
    GV->setAlignment(std::max(TypeAlign, PtrAlign));
    Elem.second = GV;
  }

  return Elem.second;
}
8491
8492Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8493 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8494 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8495 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8496}
8497
// Compute the store size of the pointee of \p BasePtr as an i64 value using
// the classic "GEP index 1 off a null pointer, then ptrtoint" idiom.
Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
  LLVMContext &Ctx = Builder.getContext();
  Value *Null =
  Value *SizeGep =
      Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
  // The address of element 1 relative to null equals sizeof(element type).
  Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
  return SizePtrToInt;
}
8507
// Create a private, constant global array named \p VarName holding the
// 64-bit map-type flags in \p Mappings (one entry per map operand).
OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                       std::string VarName) {
  llvm::Constant *MaptypesArrayInit =
      llvm::ConstantDataArray::get(M.getContext(), Mappings);
  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
      M, MaptypesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
      VarName);
  // Only the contents matter, not the address, so identical arrays may be
  // merged by the linker/optimizer.
  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  return MaptypesArrayGlobal;
}
8520
8521void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8522 InsertPointTy AllocaIP,
8523 unsigned NumOperands,
8524 struct MapperAllocas &MapperAllocas) {
8525 if (!updateToLocation(Loc))
8526 return;
8527
8528 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8529 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8530 Builder.restoreIP(AllocaIP);
8531 AllocaInst *ArgsBase = Builder.CreateAlloca(
8532 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8533 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8534 ".offload_ptrs");
8535 AllocaInst *ArgSizes = Builder.CreateAlloca(
8536 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8537 updateToLocation(Loc);
8538 MapperAllocas.ArgsBase = ArgsBase;
8539 MapperAllocas.Args = Args;
8540 MapperAllocas.ArgSizes = ArgSizes;
8541}
8542
8543void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8544 Function *MapperFunc, Value *SrcLocInfo,
8545 Value *MaptypesArg, Value *MapnamesArg,
8546 struct MapperAllocas &MapperAllocas,
8547 int64_t DeviceID, unsigned NumOperands) {
8548 if (!updateToLocation(Loc))
8549 return;
8550
8551 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8552 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8553 Value *ArgsBaseGEP =
8554 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8555 {Builder.getInt32(0), Builder.getInt32(0)});
8556 Value *ArgsGEP =
8557 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8558 {Builder.getInt32(0), Builder.getInt32(0)});
8559 Value *ArgSizesGEP =
8560 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8561 {Builder.getInt32(0), Builder.getInt32(0)});
8562 Value *NullPtr =
8563 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8564 Builder.CreateCall(MapperFunc,
8565 {SrcLocInfo, Builder.getInt64(DeviceID),
8566 Builder.getInt32(NumOperands), ArgsBaseGEP, ArgsGEP,
8567 ArgSizesGEP, MaptypesArg, MapnamesArg, NullPtr});
8568}
8569
8570void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8571 TargetDataRTArgs &RTArgs,
8572 TargetDataInfo &Info,
8573 bool ForEndCall) {
8574 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8575 "expected region end call to runtime only when end call is separate");
8576 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8577 auto VoidPtrTy = UnqualPtrTy;
8578 auto VoidPtrPtrTy = UnqualPtrTy;
8579 auto Int64Ty = Type::getInt64Ty(M.getContext());
8580 auto Int64PtrTy = UnqualPtrTy;
8581
8582 if (!Info.NumberOfPtrs) {
8583 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8584 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8585 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8586 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8587 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8588 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8589 return;
8590 }
8591
8592 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8593 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8594 Info.RTArgs.BasePointersArray,
8595 /*Idx0=*/0, /*Idx1=*/0);
8596 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8597 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8598 /*Idx0=*/0,
8599 /*Idx1=*/0);
8600 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8601 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8602 /*Idx0=*/0, /*Idx1=*/0);
8603 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8604 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8605 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8606 : Info.RTArgs.MapTypesArray,
8607 /*Idx0=*/0,
8608 /*Idx1=*/0);
8609
8610 // Only emit the mapper information arrays if debug information is
8611 // requested.
8612 if (!Info.EmitDebug)
8613 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8614 else
8615 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8616 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8617 /*Idx0=*/0,
8618 /*Idx1=*/0);
8619 // If there is no user-defined mapper, set the mapper array to nullptr to
8620 // avoid an unnecessary data privatization
8621 if (!Info.HasMapper)
8622 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8623 else
8624 RTArgs.MappersArray =
8625 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8626}
8627
// Emit the descriptor_dim arrays that describe non-contiguous map sections
// and store their addresses into the corresponding slots of the offloading
// pointers array.
void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
                                                  InsertPointTy CodeGenIP,
                                                  MapInfosTy &CombinedInfo,
                                                  TargetDataInfo &Info) {
  MapInfosTy::StructNonContiguousInfo &NonContigInfo =
      CombinedInfo.NonContigInfo;

  // Build an array of struct descriptor_dim and then assign it to
  // offload_args.
  //
  // struct descriptor_dim {
  //  uint64_t offset;
  //  uint64_t count;
  //  uint64_t stride
  // };
  Type *Int64Ty = Builder.getInt64Ty();
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variable here since the size of "Dims" is the same as
  // the size of Components, however, the size of offset, count, and stride is
  // equal to the size of base declaration that is non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting ir if dimension size is 1 since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    // The dims array lives in the alloca block; the stores go at CodeGenIP.
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    Builder.restoreIP(CodeGenIP);
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      // Dimensions are written in reverse order.
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          DimsAddr->getAllocatedType(), DimsAddr,
          {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      // NOTE(review): the alignment below is taken from CountLVal's type
      // rather than StrideLVal's; both are i64 here so the result is the
      // same, but confirm this is intentional.
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getPtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
    // L only advances for dimensions actually emitted (see comment above).
    ++L;
  }
}
8695
// Emit the conditional allocation (IsInit == true) or deletion
// (IsInit == false) prologue/epilogue of a user-defined mapper for array
// sections: when the guard condition holds, the whole array is registered
// with the runtime via __tgt_push_mapper_component using a map type stripped
// of TO/FROM and marked IMPLICIT.
void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
    Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
    Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
    BasicBlock *ExitBB, bool IsInit) {
  StringRef Prefix = IsInit ? ".init" : ".del";

  // Evaluate if this is an array section.
      M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
  Value *IsArray =
      Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
  // Test the OMP_MAP_DELETE bit of the map type.
  Value *DeleteBit = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
  Value *DeleteCond;
  Value *Cond;
  if (IsInit) {
    // base != begin?
    Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
    // IsPtrAndObj?
    Value *PtrAndObjBit = Builder.CreateAnd(
        MapType,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
    PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
    BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
    Cond = Builder.CreateOr(IsArray, BaseIsBegin);
    // Init path requires the DELETE bit to be clear.
    DeleteCond = Builder.CreateIsNull(
        DeleteBit,
        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
  } else {
    Cond = IsArray;
    // Del path requires the DELETE bit to be set.
    DeleteCond = Builder.CreateIsNotNull(
        DeleteBit,
        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
  }
  Cond = Builder.CreateAnd(Cond, DeleteCond);
  Builder.CreateCondBr(Cond, BodyBB, ExitBB);

  emitBlock(BodyBB, MapperFn);
  // Get the array size by multiplying element size and element number (i.e., \p
  // Size).
  Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
  // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
  // memory allocation/deletion purpose only.
  Value *MapTypeArg = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_TO |
              OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
  MapTypeArg = Builder.CreateOr(
      MapTypeArg,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));

  // Call the runtime API __tgt_push_mapper_component to fill up the runtime
  // data structure.
  Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
                             ArraySize, MapTypeArg, MapName};
  Builder.CreateCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
      OffloadingArgs);
}
8764
// Emits the outlined function implementing an OpenMP declare-mapper: a loop
// over all elements of the mapped array section that (1) optionally performs
// initial allocation, (2) pushes one runtime component per map clause (with
// map-type decay applied), delegating to a child mapper when \p CustomMapperCB
// provides one, and (3) optionally performs deletion at the end.
// \p GenMapInfoCB produces the per-element map information; \p ElemTy is the
// element type used for pointer arithmetic; \p FuncName names the emitted
// function. Returns the created function or a propagated error.
8765Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
8766    function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
8767                                   llvm::Value *BeginArg)>
8768        GenMapInfoCB,
8769    Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
  // The mapper signature fixed by the offloading runtime:
  // (handle, base, begin, size, map type, map name).
8770  SmallVector<Type *> Params;
8771  Params.emplace_back(Builder.getPtrTy());
8772  Params.emplace_back(Builder.getPtrTy());
8773  Params.emplace_back(Builder.getPtrTy());
8774  Params.emplace_back(Builder.getInt64Ty());
8775  Params.emplace_back(Builder.getInt64Ty());
8776  Params.emplace_back(Builder.getPtrTy());
8777
8778  auto *FnTy =
8779      FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
8780
8781  SmallString<64> TyStr;
8782  raw_svector_ostream Out(TyStr);
8783  Function *MapperFn =
  // NOTE(review): the Function::Create(...) call initializing MapperFn was
  // dropped by this extraction — verify against the upstream file.
8785  MapperFn->addFnAttr(Attribute::NoInline);
8786  MapperFn->addFnAttr(Attribute::NoUnwind);
8787  MapperFn->addParamAttr(0, Attribute::NoUndef);
8788  MapperFn->addParamAttr(1, Attribute::NoUndef);
8789  MapperFn->addParamAttr(2, Attribute::NoUndef);
8790  MapperFn->addParamAttr(3, Attribute::NoUndef);
8791  MapperFn->addParamAttr(4, Attribute::NoUndef);
8792  MapperFn->addParamAttr(5, Attribute::NoUndef);
8793
8794  // Start the mapper function code generation.
8795  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
8796  auto SavedIP = Builder.saveIP();
8797  Builder.SetInsertPoint(EntryBB);
8798
8799  Value *MapperHandle = MapperFn->getArg(0);
8800  Value *BaseIn = MapperFn->getArg(1);
8801  Value *BeginIn = MapperFn->getArg(2);
8802  Value *Size = MapperFn->getArg(3);
8803  Value *MapType = MapperFn->getArg(4);
8804  Value *MapName = MapperFn->getArg(5);
8805
8806  // Compute the starting and end addresses of array elements.
8807  // Prepare common arguments for array initiation and deletion.
8808  // Convert the size in bytes into the number of array elements.
8809  TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
8810  Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
8811  Value *PtrBegin = BeginIn;
8812  Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
8813
8814  // Emit array initiation if this is an array section and \p MapType indicates
8815  // that memory allocation is required.
8816  BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
8817  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8818                             MapType, MapName, ElementSize, HeadBB,
8819                             /*IsInit=*/true);
8820
8821  // Emit a for loop to iterate through SizeArg of elements and map all of them.
8822
8823  // Emit the loop header block.
8824  emitBlock(HeadBB, MapperFn)
8825  BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
8826  BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
8827  // Evaluate whether the initial condition is satisfied.
8828  Value *IsEmpty =
8829      Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
8830  Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
8831
8832  // Emit the loop body block.
8833  emitBlock(BodyBB, MapperFn);
8834  BasicBlock *LastBB = BodyBB;
8835  PHINode *PtrPHI =
8836      Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
8837  PtrPHI->addIncoming(PtrBegin, HeadBB);
8838
8839  // Get map clause information. Fill up the arrays with all mapped variables.
8840  MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
8841  if (!Info)
8842    return Info.takeError();
8843
8844  // Call the runtime API __tgt_mapper_num_components to get the number of
8845  // pre-existing components.
8846  Value *OffloadingArgs[] = {MapperHandle};
8847  Value *PreviousSize = Builder.CreateCall(
8848      getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
8849      OffloadingArgs);
  // Shift the pre-existing component count into the MEMBER_OF bit-field
  // position so it can be added onto each member's map type below.
8850  Value *ShiftedPreviousSize =
8851      Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
8852
8853  // Fill up the runtime mapper handle for all components.
8854  for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
8855    Value *CurBaseArg = Info->BasePointers[I];
8856    Value *CurBeginArg = Info->Pointers[I];
8857    Value *CurSizeArg = Info->Sizes[I];
8858    Value *CurNameArg = Info->Names.size()
8859                            ? Info->Names[I]
8860                            : Constant::getNullValue(Builder.getPtrTy());
8861
8862    // Extract the MEMBER_OF field from the map type.
8863    Value *OriMapType = Builder.getInt64(
8864        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8865            Info->Types[I]));
8866    Value *MemberMapType =
8867        Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
8868
8869    // Combine the map type inherited from user-defined mapper with that
8870    // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
8871    // bits of the \a MapType, which is the input argument of the mapper
8872    // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
8873    // bits of MemberMapType.
8874    // [OpenMP 5.0], 1.2.6. map-type decay.
8875    //        | alloc |  to   | from  | tofrom | release | delete
8876    // ----------------------------------------------------------
8877    // alloc  | alloc | alloc | alloc | alloc  | release | delete
8878    // to     | alloc |  to   | alloc |   to   | release | delete
8879    // from   | alloc | alloc | from  |  from  | release | delete
8880    // tofrom | alloc |  to   | from  | tofrom | release | delete
8881    Value *LeftToFrom = Builder.CreateAnd(
8882        MapType,
8883        Builder.getInt64(
8884            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8885                OpenMPOffloadMappingFlags::OMP_MAP_TO |
8886                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8887    BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
8888    BasicBlock *AllocElseBB =
8889        BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
8890    BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
8891    BasicBlock *ToElseBB =
8892        BasicBlock::Create(M.getContext(), "omp.type.to.else");
8893    BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
8894    BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
8895    Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
8896    Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
8897    // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
8898    emitBlock(AllocBB, MapperFn);
8899    Value *AllocMapType = Builder.CreateAnd(
8900        MemberMapType,
8901        Builder.getInt64(
8902            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8903                OpenMPOffloadMappingFlags::OMP_MAP_TO |
8904                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8905    Builder.CreateBr(EndBB);
8906    emitBlock(AllocElseBB, MapperFn);
8907    Value *IsTo = Builder.CreateICmpEQ(
8908        LeftToFrom,
8909        Builder.getInt64(
8910            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8911                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8912    Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
8913    // In case of to, clear OMP_MAP_FROM.
8914    emitBlock(ToBB, MapperFn);
8915    Value *ToMapType = Builder.CreateAnd(
8916        MemberMapType,
8917        Builder.getInt64(
8918            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8919                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8920    Builder.CreateBr(EndBB);
8921    emitBlock(ToElseBB, MapperFn);
8922    Value *IsFrom = Builder.CreateICmpEQ(
8923        LeftToFrom,
8924        Builder.getInt64(
8925            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8926                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
8927    Builder.CreateCondBr(IsFrom, FromBB, EndBB);
8928    // In case of from, clear OMP_MAP_TO.
8929    emitBlock(FromBB, MapperFn);
8930    Value *FromMapType = Builder.CreateAnd(
8931        MemberMapType,
8932        Builder.getInt64(
8933            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
8934                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
8935    // In case of tofrom, do nothing.
8936    emitBlock(EndBB, MapperFn);
8937    LastBB = EndBB;
    // Merge the four decayed map types; the tofrom (unchanged) value flows in
    // from ToElseBB via the IsFrom == false edge.
8938    PHINode *CurMapType =
8939        Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
8940    CurMapType->addIncoming(AllocMapType, AllocBB);
8941    CurMapType->addIncoming(ToMapType, ToBB);
8942    CurMapType->addIncoming(FromMapType, FromBB);
8943    CurMapType->addIncoming(MemberMapType, ToElseBB);
8944
8945    Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
8946                               CurSizeArg,  CurMapType, CurNameArg};
8947
8948    auto ChildMapperFn = CustomMapperCB(I);
8949    if (!ChildMapperFn)
8950      return ChildMapperFn.takeError();
8951    if (*ChildMapperFn) {
8952      // Call the corresponding mapper function.
8953      Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow();
8954    } else {
8955      // Call the runtime API __tgt_push_mapper_component to fill up the runtime
8956      // data structure.
8957      Builder.CreateCall(
8958          getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
8959          OffloadingArgs);
8960    }
8961  }
8962
8963  // Update the pointer to point to the next element that needs to be mapped,
8964  // and check whether we have mapped all elements.
8965  Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
8966                                              "omp.arraymap.next");
8967  PtrPHI->addIncoming(PtrNext, LastBB);
8968  Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
8969  BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
8970  Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
8971
8972  emitBlock(ExitBB, MapperFn);
8973  // Emit array deletion if this is an array section and \p MapType indicates
8974  // that deletion is required.
8975  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
8976                             MapType, MapName, ElementSize, DoneBB,
8977                             /*IsInit=*/false);
8978
8979  // Emit the function exit block.
8980  emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
8981
8982  Builder.CreateRetVoid();
  // Restore the caller's insertion point before handing back the function.
8983  Builder.restoreIP(SavedIP);
8984  return MapperFn;
8985}
8986
// Materializes the per-region offloading argument arrays (base pointers,
// pointers, sizes, map types, map names, mappers) described by \p CombinedInfo
// into allocas/globals, recording the results in \p Info.RTArgs. Sizes that
// are compile-time constants go into a private constant global; runtime sizes
// are stored into a stack array. \p DeviceAddrCB, if provided, is invoked for
// entries that require device pointer/address bookkeeping.
8987Error OpenMPIRBuilder::emitOffloadingArrays(
8988    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
8989    TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
8990    bool IsNonContiguous,
8991    function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8992
8993  // Reset the array information.
8994  Info.clearArrayInfo();
8995  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
8996
8997  if (Info.NumberOfPtrs == 0)
8998    return Error::success();
8999
9000  Builder.restoreIP(AllocaIP);
9001  // Detect if we have any capture size requiring runtime evaluation of the
9002  // size so that a constant array could be eventually used.
9003  ArrayType *PointerArrayType =
9004      ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9005
9006  Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9007      PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9008
9009  Info.RTArgs.PointersArray = Builder.CreateAlloca(
9010      PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9011  AllocaInst *MappersArray = Builder.CreateAlloca(
9012      PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9013  Info.RTArgs.MappersArray = MappersArray;
9014
9015  // If we don't have any VLA types or other types that require runtime
9016  // evaluation, we can use a constant array for the map sizes, otherwise we
9017  // need to fill up the arrays as we do for the pointers.
9018  Type *Int64Ty = Builder.getInt64Ty();
9019  SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9020                                     ConstantInt::get(Int64Ty, 0));
9021  SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9022  for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9023    if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9024      if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
        // Non-contiguous entries store the dimension count in the size slot.
9025        if (IsNonContiguous &&
9026            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9027                CombinedInfo.Types[I] &
9028                OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9029          ConstSizes[I] =
9030              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9031        else
9032          ConstSizes[I] = CI;
9033        continue;
9034      }
9035    }
9036    RuntimeSizes.set(I);
9037  }
9038
9039  if (RuntimeSizes.all()) {
    // All sizes are runtime values: just reserve a stack array; the per-entry
    // loop below fills it in.
9040    ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9041    Info.RTArgs.SizesArray = Builder.CreateAlloca(
9042        SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9043    restoreIPandDebugLoc(Builder, CodeGenIP);
9044  } else {
9045    auto *SizesArrayInit = ConstantArray::get(
9046        ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9047    std::string Name = createPlatformSpecificName({"offload_sizes"});
9048    auto *SizesArrayGbl =
9049        new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9050                           GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9051    SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9052
9053    if (!RuntimeSizes.any()) {
      // Fully constant: the global itself serves as the sizes array.
9054      Info.RTArgs.SizesArray = SizesArrayGbl;
9055    } else {
      // Mixed case: copy the constant global into a stack buffer, then let the
      // loop below overwrite the runtime-sized entries.
9056      unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9057      Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9058      ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9059      AllocaInst *Buffer = Builder.CreateAlloca(
9060          SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9061      Buffer->setAlignment(OffloadSizeAlign);
9062      restoreIPandDebugLoc(Builder, CodeGenIP);
9063      Builder.CreateMemCpy(
9064          Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9065          SizesArrayGbl, OffloadSizeAlign,
9066          Builder.getIntN(
9067              IndexSize,
9068              Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9069
9070      Info.RTArgs.SizesArray = Buffer;
9071    }
9072    restoreIPandDebugLoc(Builder, CodeGenIP);
9073  }
9074
9075  // The map types are always constant so we don't need to generate code to
9076  // fill arrays. Instead, we create an array constant.
  // NOTE(review): the declaration of `Mapping` (presumably a
  // SmallVector<uint64_t>) was dropped by this extraction — verify upstream.
9078  for (auto mapFlag : CombinedInfo.Types)
9079    Mapping.push_back(
9080        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9081            mapFlag));
9082  std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9083  auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9084  Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9085
9086  // The information types are only built if provided.
9087  if (!CombinedInfo.Names.empty()) {
9088    auto *MapNamesArrayGbl = createOffloadMapnames(
9089        CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9090    Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9091    Info.EmitDebug = true;
9092  } else {
9093    Info.RTArgs.MapNamesArray =
9094        Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9095    Info.EmitDebug = false;
9096  }
9097
9098  // If there's a present map type modifier, it must not be applied to the end
9099  // of a region, so generate a separate map type array in that case.
9100  if (Info.separateBeginEndCalls()) {
9101    bool EndMapTypesDiffer = false;
9102    for (uint64_t &Type : Mapping) {
9103      if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9104                     OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9105        Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9106            OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9107        EndMapTypesDiffer = true;
9108      }
9109    }
9110    if (EndMapTypesDiffer) {
9111      MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9112      Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9113    }
9114  }
9115
9116  PointerType *PtrTy = Builder.getPtrTy();
9117  for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9118    Value *BPVal = CombinedInfo.BasePointers[I];
9119    Value *BP = Builder.CreateConstInBoundsGEP2_32(
9120        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9121        0, I);
9122    Builder.CreateAlignedStore(BPVal, BP,
9123                               M.getDataLayout().getPrefTypeAlign(PtrTy));
9124
9125    if (Info.requiresDevicePointerInfo()) {
9126      if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
        // Reserve an alloca (at AllocaIP) to receive the device pointer later.
9127        CodeGenIP = Builder.saveIP();
9128        Builder.restoreIP(AllocaIP);
9129        Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9130        Builder.restoreIP(CodeGenIP);
9131        if (DeviceAddrCB)
9132          DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9133      } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9134        Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9135        if (DeviceAddrCB)
9136          DeviceAddrCB(I, BP);
9137      }
9138    }
9139
9140    Value *PVal = CombinedInfo.Pointers[I];
9141    Value *P = Builder.CreateConstInBoundsGEP2_32(
9142        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9143        I);
9144    // TODO: Check alignment correct.
9145    Builder.CreateAlignedStore(PVal, P,
9146                               M.getDataLayout().getPrefTypeAlign(PtrTy));
9147
9148    if (RuntimeSizes.test(I)) {
9149      Value *S = Builder.CreateConstInBoundsGEP2_32(
9150          ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9151          /*Idx0=*/0,
9152          /*Idx1=*/I);
9153      Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9154                                                       Int64Ty,
9155                                                       /*isSigned=*/true),
9156                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9157    }
9158    // Fill up the mapper array.
9159    unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9160    Value *MFunc = ConstantPointerNull::get(PtrTy);
9161
9162    auto CustomMFunc = CustomMapperCB(I);
9163    if (!CustomMFunc)
9164      return CustomMFunc.takeError();
9165    if (*CustomMFunc)
9166      MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9167
9168    Value *MAddr = Builder.CreateInBoundsGEP(
9169        MappersArray->getAllocatedType(), MappersArray,
9170        {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9171    Builder.CreateAlignedStore(
9172        MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9173  }
9174
9175  if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9176      Info.NumberOfPtrs == 0)
9177    return Error::success();
9178  emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9179  return Error::success();
9180}
9181
9182void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9183 BasicBlock *CurBB = Builder.GetInsertBlock();
9184
9185 if (!CurBB || CurBB->getTerminator()) {
9186 // If there is no insert point or the previous block is already
9187 // terminated, don't touch it.
9188 } else {
9189 // Otherwise, create a fall-through branch.
9190 Builder.CreateBr(Target);
9191 }
9192
9193 Builder.ClearInsertionPoint();
9194}
9195
9196void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9197 bool IsFinished) {
9198 BasicBlock *CurBB = Builder.GetInsertBlock();
9199
9200 // Fall out of the current block (if necessary).
9201 emitBranch(BB);
9202
9203 if (IsFinished && BB->use_empty()) {
9204 BB->eraseFromParent();
9205 return;
9206 }
9207
9208 // Place the block after the current block, if possible, or else at
9209 // the end of the function.
9210 if (CurBB && CurBB->getParent())
9211 CurFn->insert(std::next(CurBB->getIterator()), BB);
9212 else
9213 CurFn->insert(CurFn->end(), BB);
9214 Builder.SetInsertPoint(BB);
9215}
9216
9217Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9218 BodyGenCallbackTy ElseGen,
9219 InsertPointTy AllocaIP) {
9220 // If the condition constant folds and can be elided, try to avoid emitting
9221 // the condition and the dead arm of the if/else.
9222 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9223 auto CondConstant = CI->getSExtValue();
9224 if (CondConstant)
9225 return ThenGen(AllocaIP, Builder.saveIP());
9226
9227 return ElseGen(AllocaIP, Builder.saveIP());
9228 }
9229
9230 Function *CurFn = Builder.GetInsertBlock()->getParent();
9231
9232 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9233 // emit the conditional branch.
9234 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9235 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9236 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9237 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9238 // Emit the 'then' code.
9239 emitBlock(ThenBlock, CurFn);
9240 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9241 return Err;
9242 emitBranch(ContBlock);
9243 // Emit the 'else' code if present.
9244 // There is no need to emit line number for unconditional branch.
9245 emitBlock(ElseBlock, CurFn);
9246 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9247 return Err;
9248 // There is no need to emit line number for unconditional branch.
9249 emitBranch(ContBlock);
9250 // Emit the continuation block for code after the if.
9251 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9252 return Error::success();
9253}
9254
// Decides, based on the atomic kind \p AK and ordering \p AO, whether an
// OpenMP flush must follow the atomic operation, emits it via emitFlush if so,
// and returns whether a flush was emitted.
// NOTE(review): several lines are missing from this extraction (the assert
// condition, the declaration of FlushAO, and the `if (...)`/`switch case`
// condition lines inside the switch) — verify against the upstream file.
9255bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9256    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9259         "Unexpected Atomic Ordering.");
9260
9261  bool Flush = false;
9263
9264  switch (AK) {
9265  case Read:
9268    FlushAO = AtomicOrdering::Acquire;
9269    Flush = true;
9270  }
9271    break;
9272  case Write:
9273  case Compare:
9274  case Update:
9277    FlushAO = AtomicOrdering::Release;
9278    Flush = true;
9279  }
9280    break;
9281  case Capture:
9282    switch (AO) {
9284      FlushAO = AtomicOrdering::Acquire;
9285      Flush = true;
9286      break;
9288      FlushAO = AtomicOrdering::Release;
9289      Flush = true;
9290      break;
9294      Flush = true;
9295      break;
9296    default:
9297      // do nothing - leave silently.
9298      break;
9299    }
9300  }
9301
9302  if (Flush) {
9303    // Currently Flush RT call still doesn't take memory_ordering, so for when
9304    // that happens, this tries to do the resolution of which atomic ordering
9305    // to use with but issue the flush call
9306    // TODO: pass `FlushAO` after memory ordering support is added
9307    (void)FlushAO;
9308    emitFlush(Loc);
9309  }
9310
9311  // for AO == AtomicOrdering::Monotonic and all other case combinations
9312  // do nothing
9313  return Flush;
9314}
9315
9316OpenMPIRBuilder::InsertPointTy
9317OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9318 AtomicOpValue &X, AtomicOpValue &V,
9319 AtomicOrdering AO, InsertPointTy AllocaIP) {
9320 if (!updateToLocation(Loc))
9321 return Loc.IP;
9322
9323 assert(X.Var->getType()->isPointerTy() &&
9324 "OMP Atomic expects a pointer to target memory");
9325 Type *XElemTy = X.ElemTy;
9326 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9327 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9328 "OMP atomic read expected a scalar type");
9329
9330 Value *XRead = nullptr;
9331
9332 if (XElemTy->isIntegerTy()) {
9333 LoadInst *XLD =
9334 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9335 XLD->setAtomic(AO);
9336 XRead = cast<Value>(XLD);
9337 } else if (XElemTy->isStructTy()) {
9338 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9339 // target does not support `atomicrmw` of the size of the struct
9340 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9341 OldVal->setAtomic(AO);
9342 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9343 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9344 OpenMPIRBuilder::AtomicInfo atomicInfo(
9345 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9346 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9347 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9348 XRead = AtomicLoadRes.first;
9349 OldVal->eraseFromParent();
9350 } else {
9351 // We need to perform atomic op as integer
9352 IntegerType *IntCastTy =
9353 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9354 LoadInst *XLoad =
9355 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9356 XLoad->setAtomic(AO);
9357 if (XElemTy->isFloatingPointTy()) {
9358 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9359 } else {
9360 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9361 }
9362 }
9363 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9364 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9365 return Builder.saveIP();
9366}
9367
9368OpenMPIRBuilder::InsertPointTy
9369OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9370 AtomicOpValue &X, Value *Expr,
9371 AtomicOrdering AO, InsertPointTy AllocaIP) {
9372 if (!updateToLocation(Loc))
9373 return Loc.IP;
9374
9375 assert(X.Var->getType()->isPointerTy() &&
9376 "OMP Atomic expects a pointer to target memory");
9377 Type *XElemTy = X.ElemTy;
9378 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9379 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9380 "OMP atomic write expected a scalar type");
9381
9382 if (XElemTy->isIntegerTy()) {
9383 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9384 XSt->setAtomic(AO);
9385 } else if (XElemTy->isStructTy()) {
9386 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9387 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9388 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9389 OpenMPIRBuilder::AtomicInfo atomicInfo(
9390 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9391 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9392 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9393 OldVal->eraseFromParent();
9394 } else {
9395 // We need to bitcast and perform atomic op as integers
9396 IntegerType *IntCastTy =
9397 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9398 Value *ExprCast =
9399 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9400 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9401 XSt->setAtomic(AO);
9402 }
9403
9404 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9405 return Builder.saveIP();
9406}
9407
9408OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9409 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9410 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9411 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9412 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9413 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9414 if (!updateToLocation(Loc))
9415 return Loc.IP;
9416
9417 LLVM_DEBUG({
9418 Type *XTy = X.Var->getType();
9419 assert(XTy->isPointerTy() &&
9420 "OMP Atomic expects a pointer to target memory");
9421 Type *XElemTy = X.ElemTy;
9422 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9423 XElemTy->isPointerTy()) &&
9424 "OMP atomic update expected a scalar type");
9425 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9426 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9427 "OpenMP atomic does not support LT or GT operations");
9428 });
9429
9430 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9431 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9432 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9433 if (!AtomicResult)
9434 return AtomicResult.takeError();
9435 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9436 return Builder.saveIP();
9437}
9438
9439// FIXME: Duplicating AtomicExpand
9440Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9441 AtomicRMWInst::BinOp RMWOp) {
9442 switch (RMWOp) {
9443 case AtomicRMWInst::Add:
9444 return Builder.CreateAdd(Src1, Src2);
9445 case AtomicRMWInst::Sub:
9446 return Builder.CreateSub(Src1, Src2);
9447 case AtomicRMWInst::And:
9448 return Builder.CreateAnd(Src1, Src2);
9450 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9451 case AtomicRMWInst::Or:
9452 return Builder.CreateOr(Src1, Src2);
9453 case AtomicRMWInst::Xor:
9454 return Builder.CreateXor(Src1, Src2);
9459 case AtomicRMWInst::Max:
9460 case AtomicRMWInst::Min:
9471 llvm_unreachable("Unsupported atomic update operation");
9472 }
9473 llvm_unreachable("Unsupported atomic update operation");
9474}
9475
9476Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9477 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9479 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9480 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9481 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9482 // or a complex datatype.
9483 bool emitRMWOp = false;
9484 switch (RMWOp) {
9485 case AtomicRMWInst::Add:
9486 case AtomicRMWInst::And:
9488 case AtomicRMWInst::Or:
9489 case AtomicRMWInst::Xor:
9491 emitRMWOp = XElemTy;
9492 break;
9493 case AtomicRMWInst::Sub:
9494 emitRMWOp = (IsXBinopExpr && XElemTy);
9495 break;
9496 default:
9497 emitRMWOp = false;
9498 }
9499 emitRMWOp &= XElemTy->isIntegerTy();
9500
9501 std::pair<Value *, Value *> Res;
9502 if (emitRMWOp) {
9503 AtomicRMWInst *RMWInst =
9504 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9505 if (T.isAMDGPU()) {
9506 if (IsIgnoreDenormalMode)
9507 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9508 llvm::MDNode::get(Builder.getContext(), {}));
9509 if (!IsFineGrainedMemory)
9510 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9511 llvm::MDNode::get(Builder.getContext(), {}));
9512 if (!IsRemoteMemory)
9513 RMWInst->setMetadata("amdgpu.no.remote.memory",
9514 llvm::MDNode::get(Builder.getContext(), {}));
9515 }
9516 Res.first = RMWInst;
9517 // not needed except in case of postfix captures. Generate anyway for
9518 // consistency with the else part. Will be removed with any DCE pass.
9519 // AtomicRMWInst::Xchg does not have a coressponding instruction.
9520 if (RMWOp == AtomicRMWInst::Xchg)
9521 Res.second = Res.first;
9522 else
9523 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9524 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9525 XElemTy->isStructTy()) {
9526 LoadInst *OldVal =
9527 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9528 OldVal->setAtomic(AO);
9529 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9530 unsigned LoadSize =
9531 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9532
9533 OpenMPIRBuilder::AtomicInfo atomicInfo(
9534 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9535 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9536 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9537 BasicBlock *CurBB = Builder.GetInsertBlock();
9538 Instruction *CurBBTI = CurBB->getTerminator();
9539 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9540 BasicBlock *ExitBB =
9541 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9542 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9543 X->getName() + ".atomic.cont");
9544 ContBB->getTerminator()->eraseFromParent();
9545 Builder.restoreIP(AllocaIP);
9546 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9547 NewAtomicAddr->setName(X->getName() + "x.new.val");
9548 Builder.SetInsertPoint(ContBB);
9549 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9550 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9551 Value *OldExprVal = PHI;
9552 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9553 if (!CBResult)
9554 return CBResult.takeError();
9555 Value *Upd = *CBResult;
9556 Builder.CreateStore(Upd, NewAtomicAddr);
9559 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9560 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9561 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9562 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9563 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9564 OldVal->eraseFromParent();
9565 Res.first = OldExprVal;
9566 Res.second = Upd;
9567
9568 if (UnreachableInst *ExitTI =
9570 CurBBTI->eraseFromParent();
9571 Builder.SetInsertPoint(ExitBB);
9572 } else {
9573 Builder.SetInsertPoint(ExitTI);
9574 }
9575 } else {
9576 IntegerType *IntCastTy =
9577 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9578 LoadInst *OldVal =
9579 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9580 OldVal->setAtomic(AO);
9581 // CurBB
9582 // | /---\
9583 // ContBB |
9584 // | \---/
9585 // ExitBB
9586 BasicBlock *CurBB = Builder.GetInsertBlock();
9587 Instruction *CurBBTI = CurBB->getTerminator();
9588 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9589 BasicBlock *ExitBB =
9590 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9591 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9592 X->getName() + ".atomic.cont");
9593 ContBB->getTerminator()->eraseFromParent();
9594 Builder.restoreIP(AllocaIP);
9595 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9596 NewAtomicAddr->setName(X->getName() + "x.new.val");
9597 Builder.SetInsertPoint(ContBB);
9598 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9599 PHI->addIncoming(OldVal, CurBB);
9600 bool IsIntTy = XElemTy->isIntegerTy();
9601 Value *OldExprVal = PHI;
9602 if (!IsIntTy) {
9603 if (XElemTy->isFloatingPointTy()) {
9604 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9605 X->getName() + ".atomic.fltCast");
9606 } else {
9607 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9608 X->getName() + ".atomic.ptrCast");
9609 }
9610 }
9611
9612 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9613 if (!CBResult)
9614 return CBResult.takeError();
9615 Value *Upd = *CBResult;
9616 Builder.CreateStore(Upd, NewAtomicAddr);
9617 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9620 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9621 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9622 Result->setVolatile(VolatileX);
9623 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9624 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9625 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9626 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9627
9628 Res.first = OldExprVal;
9629 Res.second = Upd;
9630
9631 // set Insertion point in exit block
9632 if (UnreachableInst *ExitTI =
9634 CurBBTI->eraseFromParent();
9635 Builder.SetInsertPoint(ExitBB);
9636 } else {
9637 Builder.SetInsertPoint(ExitTI);
9638 }
9639 }
9640
9641 return Res;
9642}
9643
9644OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9645 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9646 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9647 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9648 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9649 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9650 if (!updateToLocation(Loc))
9651 return Loc.IP;
9652
9653 LLVM_DEBUG({
9654 Type *XTy = X.Var->getType();
9655 assert(XTy->isPointerTy() &&
9656 "OMP Atomic expects a pointer to target memory");
9657 Type *XElemTy = X.ElemTy;
9658 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9659 XElemTy->isPointerTy()) &&
9660 "OMP atomic capture expected a scalar type");
9661 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9662 "OpenMP atomic does not support LT or GT operations");
9663 });
9664
9665 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9666 // 'x' is simply atomically rewritten with 'expr'.
9667 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9668 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9669 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9670 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9671 if (!AtomicResult)
9672 return AtomicResult.takeError();
9673 Value *CapturedVal =
9674 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9675 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9676
9677 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9678 return Builder.saveIP();
9679}
9680
9681OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9682 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9683 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9684 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9685 bool IsFailOnly) {
9686
9688 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9689 IsPostfixUpdate, IsFailOnly, Failure);
9690}
9691
9692OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9693 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9694 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9695 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9696 bool IsFailOnly, AtomicOrdering Failure) {
9697
9698 if (!updateToLocation(Loc))
9699 return Loc.IP;
9700
9701 assert(X.Var->getType()->isPointerTy() &&
9702 "OMP atomic expects a pointer to target memory");
9703 // compare capture
9704 if (V.Var) {
9705 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9706 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9707 }
9708
9709 bool IsInteger = E->getType()->isIntegerTy();
9710
9711 if (Op == OMPAtomicCompareOp::EQ) {
9712 AtomicCmpXchgInst *Result = nullptr;
9713 if (!IsInteger) {
9714 IntegerType *IntCastTy =
9715 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9716 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9717 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9718 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9719 AO, Failure);
9720 } else {
9721 Result =
9722 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9723 }
9724
9725 if (V.Var) {
9726 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9727 if (!IsInteger)
9728 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9729 assert(OldValue->getType() == V.ElemTy &&
9730 "OldValue and V must be of same type");
9731 if (IsPostfixUpdate) {
9732 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9733 } else {
9734 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9735 if (IsFailOnly) {
9736 // CurBB----
9737 // | |
9738 // v |
9739 // ContBB |
9740 // | |
9741 // v |
9742 // ExitBB <-
9743 //
9744 // where ContBB only contains the store of old value to 'v'.
9745 BasicBlock *CurBB = Builder.GetInsertBlock();
9746 Instruction *CurBBTI = CurBB->getTerminator();
9747 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9748 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9749 CurBBTI, X.Var->getName() + ".atomic.exit");
9750 BasicBlock *ContBB = CurBB->splitBasicBlock(
9751 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9752 ContBB->getTerminator()->eraseFromParent();
9753 CurBB->getTerminator()->eraseFromParent();
9754
9755 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9756
9757 Builder.SetInsertPoint(ContBB);
9758 Builder.CreateStore(OldValue, V.Var);
9759 Builder.CreateBr(ExitBB);
9760
9761 if (UnreachableInst *ExitTI =
9763 CurBBTI->eraseFromParent();
9764 Builder.SetInsertPoint(ExitBB);
9765 } else {
9766 Builder.SetInsertPoint(ExitTI);
9767 }
9768 } else {
9769 Value *CapturedValue =
9770 Builder.CreateSelect(SuccessOrFail, E, OldValue);
9771 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9772 }
9773 }
9774 }
9775 // The comparison result has to be stored.
9776 if (R.Var) {
9777 assert(R.Var->getType()->isPointerTy() &&
9778 "r.var must be of pointer type");
9779 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
9780
9781 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9782 Value *ResultCast = R.IsSigned
9783 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
9784 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
9785 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
9786 }
9787 } else {
9788 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
9789 "Op should be either max or min at this point");
9790 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
9791
9792 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
9793 // Let's take max as example.
9794 // OpenMP form:
9795 // x = x > expr ? expr : x;
9796 // LLVM form:
9797 // *ptr = *ptr > val ? *ptr : val;
9798 // We need to transform to LLVM form.
9799 // x = x <= expr ? x : expr;
9801 if (IsXBinopExpr) {
9802 if (IsInteger) {
9803 if (X.IsSigned)
9804 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
9806 else
9807 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
9809 } else {
9810 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
9812 }
9813 } else {
9814 if (IsInteger) {
9815 if (X.IsSigned)
9816 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
9818 else
9819 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
9821 } else {
9822 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
9824 }
9825 }
9826
9827 AtomicRMWInst *OldValue =
9828 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
9829 if (V.Var) {
9830 Value *CapturedValue = nullptr;
9831 if (IsPostfixUpdate) {
9832 CapturedValue = OldValue;
9833 } else {
9834 CmpInst::Predicate Pred;
9835 switch (NewOp) {
9836 case AtomicRMWInst::Max:
9837 Pred = CmpInst::ICMP_SGT;
9838 break;
9840 Pred = CmpInst::ICMP_UGT;
9841 break;
9843 Pred = CmpInst::FCMP_OGT;
9844 break;
9845 case AtomicRMWInst::Min:
9846 Pred = CmpInst::ICMP_SLT;
9847 break;
9849 Pred = CmpInst::ICMP_ULT;
9850 break;
9852 Pred = CmpInst::FCMP_OLT;
9853 break;
9854 default:
9855 llvm_unreachable("unexpected comparison op");
9856 }
9857 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
9858 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
9859 }
9860 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
9861 }
9862 }
9863
9864 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
9865
9866 return Builder.saveIP();
9867}
9868
9869OpenMPIRBuilder::InsertPointOrErrorTy
9870OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
9871 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
9872 Value *NumTeamsUpper, Value *ThreadLimit,
9873 Value *IfExpr) {
9874 if (!updateToLocation(Loc))
9875 return InsertPointTy();
9876
9877 uint32_t SrcLocStrSize;
9878 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
9879 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9880 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
9881
9882 // Outer allocation basicblock is the entry block of the current function.
9883 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
9884 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
9885 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
9886 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
9887 }
9888
9889 // The current basic block is split into four basic blocks. After outlining,
9890 // they will be mapped as follows:
9891 // ```
9892 // def current_fn() {
9893 // current_basic_block:
9894 // br label %teams.exit
9895 // teams.exit:
9896 // ; instructions after teams
9897 // }
9898 //
9899 // def outlined_fn() {
9900 // teams.alloca:
9901 // br label %teams.body
9902 // teams.body:
9903 // ; instructions within teams body
9904 // }
9905 // ```
9906 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
9907 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
9908 BasicBlock *AllocaBB =
9909 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
9910
9911 bool SubClausesPresent =
9912 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
9913 // Push num_teams
9914 if (!Config.isTargetDevice() && SubClausesPresent) {
9915 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
9916 "if lowerbound is non-null, then upperbound must also be non-null "
9917 "for bounds on num_teams");
9918
9919 if (NumTeamsUpper == nullptr)
9920 NumTeamsUpper = Builder.getInt32(0);
9921
9922 if (NumTeamsLower == nullptr)
9923 NumTeamsLower = NumTeamsUpper;
9924
9925 if (IfExpr) {
9926 assert(IfExpr->getType()->isIntegerTy() &&
9927 "argument to if clause must be an integer value");
9928
9929 // upper = ifexpr ? upper : 1
9930 if (IfExpr->getType() != Int1)
9931 IfExpr = Builder.CreateICmpNE(IfExpr,
9932 ConstantInt::get(IfExpr->getType(), 0));
9933 NumTeamsUpper = Builder.CreateSelect(
9934 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
9935
9936 // lower = ifexpr ? lower : 1
9937 NumTeamsLower = Builder.CreateSelect(
9938 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
9939 }
9940
9941 if (ThreadLimit == nullptr)
9942 ThreadLimit = Builder.getInt32(0);
9943
9944 Value *ThreadNum = getOrCreateThreadID(Ident);
9945 Builder.CreateCall(
9946 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
9947 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
9948 }
9949 // Generate the body of teams.
9950 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
9951 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
9952 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
9953 return Err;
9954
9955 OutlineInfo OI;
9956 OI.EntryBB = AllocaBB;
9957 OI.ExitBB = ExitBB;
9958 OI.OuterAllocaBB = &OuterAllocaBB;
9959
9960 // Insert fake values for global tid and bound tid.
9962 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
9963 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9964 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
9965 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9966 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
9967
9968 auto HostPostOutlineCB = [this, Ident,
9969 ToBeDeleted](Function &OutlinedFn) mutable {
9970 // The stale call instruction will be replaced with a new call instruction
9971 // for runtime call with the outlined function.
9972
9973 assert(OutlinedFn.hasOneUse() &&
9974 "there must be a single user for the outlined function");
9975 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9976 ToBeDeleted.push_back(StaleCI);
9977
9978 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
9979 "Outlined function must have two or three arguments only");
9980
9981 bool HasShared = OutlinedFn.arg_size() == 3;
9982
9983 OutlinedFn.getArg(0)->setName("global.tid.ptr");
9984 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
9985 if (HasShared)
9986 OutlinedFn.getArg(2)->setName("data");
9987
9988 // Call to the runtime function for teams in the current function.
9989 assert(StaleCI && "Error while outlining - no CallInst user found for the "
9990 "outlined function.");
9991 Builder.SetInsertPoint(StaleCI);
9993 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
9994 if (HasShared)
9995 Args.push_back(StaleCI->getArgOperand(2));
9996 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
9997 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
9998 Args);
9999
10000 for (Instruction *I : llvm::reverse(ToBeDeleted))
10001 I->eraseFromParent();
10002 };
10003
10004 if (!Config.isTargetDevice())
10005 OI.PostOutlineCB = HostPostOutlineCB;
10006
10007 addOutlineInfo(std::move(OI));
10008
10009 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10010
10011 return Builder.saveIP();
10012}
10013
10014OpenMPIRBuilder::InsertPointOrErrorTy
10015OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10016 InsertPointTy OuterAllocaIP,
10017 BodyGenCallbackTy BodyGenCB) {
10018 if (!updateToLocation(Loc))
10019 return InsertPointTy();
10020
10021 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10022
10023 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10024 BasicBlock *BodyBB =
10025 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10026 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10027 }
10028 BasicBlock *ExitBB =
10029 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10030 BasicBlock *BodyBB =
10031 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10032 BasicBlock *AllocaBB =
10033 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10034
10035 // Generate the body of distribute clause
10036 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10037 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10038 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10039 return Err;
10040
10041 // When using target we use different runtime functions which require a
10042 // callback.
10043 if (Config.isTargetDevice()) {
10044 OutlineInfo OI;
10045 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10046 OI.EntryBB = AllocaBB;
10047 OI.ExitBB = ExitBB;
10048
10049 addOutlineInfo(std::move(OI));
10050 }
10051 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10052
10053 return Builder.saveIP();
10054}
10055
10057OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10058 std::string VarName) {
10059 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10061 Names.size()),
10062 Names);
10063 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10064 M, MapNamesArrayInit->getType(),
10065 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10066 VarName);
10067 return MapNamesArrayGlobal;
10068}
10069
10070// Create all simple and struct types exposed by the runtime and remember
10071// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  // Scratch variable reused by the OMP_STRUCT_TYPE expansion below.
  StructType *T;
  // Address spaces used for data pointers and function pointers respectively.
  unsigned DefaultTargetAS = Config.getDefaultTargetAS();
  unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
// Each macro below is expanded once per entry in OMPKinds.def, filling in the
// corresponding member (and its pointer-type companion) of this builder.
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
// Array types additionally record an opaque pointer in the target AS.
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize);                             \
  VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
// Function pointer companions live in the program address space.
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);            \
  VarName##Ptr = PointerType::get(Ctx, ProgramAS);
// Struct types are looked up by name first so an existing definition in the
// module is reused rather than redefined.
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...)                      \
  T = StructType::getTypeByName(Ctx, StructName);                              \
  if (!T)                                                                      \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed);            \
  VarName = T;                                                                 \
  VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
10092
10093void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10095 SmallVectorImpl<BasicBlock *> &BlockVector) {
10097 BlockSet.insert(EntryBB);
10098 BlockSet.insert(ExitBB);
10099
10100 Worklist.push_back(EntryBB);
10101 while (!Worklist.empty()) {
10102 BasicBlock *BB = Worklist.pop_back_val();
10103 BlockVector.push_back(BB);
10104 for (BasicBlock *SuccBB : successors(BB))
10105 if (BlockSet.insert(SuccBB).second)
10106 Worklist.push_back(SuccBB);
10107 }
10108}
10109
10110void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10111 uint64_t Size, int32_t Flags,
10113 StringRef Name) {
10114 if (!Config.isGPU()) {
10117 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10118 return;
10119 }
10120 // TODO: Add support for global variables on the device after declare target
10121 // support.
10122 Function *Fn = dyn_cast<Function>(Addr);
10123 if (!Fn)
10124 return;
10125
10126 // Add a function attribute for the kernel.
10127 Fn->addFnAttr("kernel");
10128 if (T.isAMDGCN())
10129 Fn->addFnAttr("uniform-work-group-size", "true");
10130 Fn->addFnAttr(Attribute::MustProgress);
10131}
10132
10133// We only generate metadata for function that contain target regions.
10134void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10135 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10136
10137 // If there are no entries, we don't need to do anything.
10138 if (OffloadInfoManager.empty())
10139 return;
10140
10141 LLVMContext &C = M.getContext();
10142 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10143 TargetRegionEntryInfo>,
10144 16>
10145 OrderedEntries(OffloadInfoManager.size());
10146
10147 // Auxiliary methods to create metadata values and strings.
10148 auto &&GetMDInt = [this](unsigned V) {
10149 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10150 };
10151
10152 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10153
10154 // Create the offloading info metadata node.
10155 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10156 auto &&TargetRegionMetadataEmitter =
10157 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10158 const TargetRegionEntryInfo &EntryInfo,
10159 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10160 // Generate metadata for target regions. Each entry of this metadata
10161 // contains:
10162 // - Entry 0 -> Kind of this type of metadata (0).
10163 // - Entry 1 -> Device ID of the file where the entry was identified.
10164 // - Entry 2 -> File ID of the file where the entry was identified.
10165 // - Entry 3 -> Mangled name of the function where the entry was
10166 // identified.
10167 // - Entry 4 -> Line in the file where the entry was identified.
10168 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10169 // - Entry 6 -> Order the entry was created.
10170 // The first element of the metadata node is the kind.
10171 Metadata *Ops[] = {
10172 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10173 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10174 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10175 GetMDInt(E.getOrder())};
10176
10177 // Save this entry in the right position of the ordered entries array.
10178 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10179
10180 // Add metadata to the named metadata node.
10181 MD->addOperand(MDNode::get(C, Ops));
10182 };
10183
10184 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10185
10186 // Create function that emits metadata for each device global variable entry;
10187 auto &&DeviceGlobalVarMetadataEmitter =
10188 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10189 StringRef MangledName,
10190 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10191 // Generate metadata for global variables. Each entry of this metadata
10192 // contains:
10193 // - Entry 0 -> Kind of this type of metadata (1).
10194 // - Entry 1 -> Mangled name of the variable.
10195 // - Entry 2 -> Declare target kind.
10196 // - Entry 3 -> Order the entry was created.
10197 // The first element of the metadata node is the kind.
10198 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10199 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10200
10201 // Save this entry in the right position of the ordered entries array.
10202 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10203 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10204
10205 // Add metadata to the named metadata node.
10206 MD->addOperand(MDNode::get(C, Ops));
10207 };
10208
10209 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10210 DeviceGlobalVarMetadataEmitter);
10211
10212 for (const auto &E : OrderedEntries) {
10213 assert(E.first && "All ordered entries must exist!");
10214 if (const auto *CE =
10216 E.first)) {
10217 if (!CE->getID() || !CE->getAddress()) {
10218 // Do not blame the entry if the parent funtion is not emitted.
10219 TargetRegionEntryInfo EntryInfo = E.second;
10220 StringRef FnName = EntryInfo.ParentName;
10221 if (!M.getNamedValue(FnName))
10222 continue;
10223 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10224 continue;
10225 }
10226 createOffloadEntry(CE->getID(), CE->getAddress(),
10227 /*Size=*/0, CE->getFlags(),
10229 } else if (const auto *CE = dyn_cast<
10230 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10231 E.first)) {
10232 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10233 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10234 CE->getFlags());
10235 switch (Flags) {
10236 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10237 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10238 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10239 continue;
10240 if (!CE->getAddress()) {
10241 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10242 continue;
10243 }
10244 // The vaiable has no definition - no need to add the entry.
10245 if (CE->getVarSize() == 0)
10246 continue;
10247 break;
10248 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10249 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10250 (!Config.isTargetDevice() && CE->getAddress())) &&
10251 "Declaret target link address is set.");
10252 if (Config.isTargetDevice())
10253 continue;
10254 if (!CE->getAddress()) {
10255 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10256 continue;
10257 }
10258 break;
10259 default:
10260 break;
10261 }
10262
10263 // Hidden or internal symbols on the device are not externally visible.
10264 // We should not attempt to register them by creating an offloading
10265 // entry. Indirect variables are handled separately on the device.
10266 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10267 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10268 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10269 continue;
10270
10271 // Indirect globals need to use a special name that doesn't match the name
10272 // of the associated host global.
10273 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10274 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10275 Flags, CE->getLinkage(), CE->getVarName());
10276 else
10277 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10278 Flags, CE->getLinkage());
10279
10280 } else {
10281 llvm_unreachable("Unsupported entry kind.");
10282 }
10283 }
10284
10285 // Emit requires directive globals to a special entry so the runtime can
10286 // register them when the device image is loaded.
10287 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10288 // entries should be redesigned to better suit this use-case.
10289 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10293 ".requires", /*Size=*/0,
10294 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10295 Config.getRequiresFlags());
10296}
10297
10298void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10299 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10300 unsigned FileID, unsigned Line, unsigned Count) {
10301 raw_svector_ostream OS(Name);
10302 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10303 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10304 if (Count)
10305 OS << "_" << Count;
10306}
10307
10308void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10309 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10310 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10311 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10312 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10313 EntryInfo.Line, NewCount);
10314}
10315
10316TargetRegionEntryInfo
10317OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10318 vfs::FileSystem &VFS,
10319 StringRef ParentName) {
10320 sys::fs::UniqueID ID(0xdeadf17e, 0);
10321 auto FileIDInfo = CallBack();
10322 uint64_t FileID = 0;
10323 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10324 ID = Status->getUniqueID();
10325 FileID = Status->getUniqueID().getFile();
10326 } else {
10327 // If the inode ID could not be determined, create a hash value
10328 // the current file name and use that as an ID.
10329 FileID = hash_value(std::get<0>(FileIDInfo));
10330 }
10331
10332 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10333 std::get<1>(FileIDInfo));
10334}
10335
10336unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10337 unsigned Offset = 0;
10338 for (uint64_t Remain =
10339 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10341 !(Remain & 1); Remain = Remain >> 1)
10342 Offset++;
10343 return Offset;
10344}
10345
10347OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10348 // Rotate by getFlagMemberOffset() bits.
10349 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10350 << getFlagMemberOffset());
10351}
10352
10353void OpenMPIRBuilder::setCorrectMemberOfFlag(
10355 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10356 // If the entry is PTR_AND_OBJ but has not been marked with the special
10357 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10358 // marked as MEMBER_OF.
10359 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10361 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10364 return;
10365
10366 // Reset the placeholder value to prepare the flag for the assignment of the
10367 // proper MEMBER_OF value.
10368 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10369 Flags |= MemberOfFlag;
10370}
10371
10372Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10373 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10374 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10375 bool IsDeclaration, bool IsExternallyVisible,
10376 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10377 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10378 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10379 std::function<Constant *()> GlobalInitializer,
10380 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10381 // TODO: convert this to utilise the IRBuilder Config rather than
10382 // a passed down argument.
10383 if (OpenMPSIMD)
10384 return nullptr;
10385
10386 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10387 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10388 CaptureClause ==
10389 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10390 Config.hasRequiresUnifiedSharedMemory())) {
10391 SmallString<64> PtrName;
10392 {
10393 raw_svector_ostream OS(PtrName);
10394 OS << MangledName;
10395 if (!IsExternallyVisible)
10396 OS << format("_%x", EntryInfo.FileID);
10397 OS << "_decl_tgt_ref_ptr";
10398 }
10399
10400 Value *Ptr = M.getNamedValue(PtrName);
10401
10402 if (!Ptr) {
10403 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10404 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10405
10406 auto *GV = cast<GlobalVariable>(Ptr);
10407 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10408
10409 if (!Config.isTargetDevice()) {
10410 if (GlobalInitializer)
10411 GV->setInitializer(GlobalInitializer());
10412 else
10413 GV->setInitializer(GlobalValue);
10414 }
10415
10416 registerTargetGlobalVariable(
10417 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10418 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10419 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10420 }
10421
10422 return cast<Constant>(Ptr);
10423 }
10424
10425 return nullptr;
10426}
10427
10428void OpenMPIRBuilder::registerTargetGlobalVariable(
10429 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10430 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10431 bool IsDeclaration, bool IsExternallyVisible,
10432 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10433 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10434 std::vector<Triple> TargetTriple,
10435 std::function<Constant *()> GlobalInitializer,
10436 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10437 Constant *Addr) {
10438 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10439 (TargetTriple.empty() && !Config.isTargetDevice()))
10440 return;
10441
10442 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10444 int64_t VarSize;
10446
10447 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10448 CaptureClause ==
10449 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10450 !Config.hasRequiresUnifiedSharedMemory()) {
10451 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10452 VarName = MangledName;
10453 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10454
10455 if (!IsDeclaration)
10456 VarSize = divideCeil(
10457 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10458 else
10459 VarSize = 0;
10460 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10461
10462 // This is a workaround carried over from Clang which prevents undesired
10463 // optimisation of internal variables.
10464 if (Config.isTargetDevice() &&
10465 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10466 // Do not create a "ref-variable" if the original is not also available
10467 // on the host.
10468 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10469 return;
10470
10471 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10472
10473 if (!M.getNamedValue(RefName)) {
10474 Constant *AddrRef =
10475 getOrCreateInternalVariable(Addr->getType(), RefName);
10476 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10477 GvAddrRef->setConstant(true);
10478 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10479 GvAddrRef->setInitializer(Addr);
10480 GeneratedRefs.push_back(GvAddrRef);
10481 }
10482 }
10483 } else {
10484 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10485 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10486 else
10487 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10488
10489 if (Config.isTargetDevice()) {
10490 VarName = (Addr) ? Addr->getName() : "";
10491 Addr = nullptr;
10492 } else {
10493 Addr = getAddrOfDeclareTargetVar(
10494 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10495 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10496 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10497 VarName = (Addr) ? Addr->getName() : "";
10498 }
10499 VarSize = M.getDataLayout().getPointerSize();
10501 }
10502
10503 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10504 Flags, Linkage);
10505}
10506
10507/// Loads all the offload entries information from the host IR
10508/// metadata.
10509void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10510 // If we are in target mode, load the metadata from the host IR. This code has
10511 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10512
10513 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10514 if (!MD)
10515 return;
10516
10517 for (MDNode *MN : MD->operands()) {
10518 auto &&GetMDInt = [MN](unsigned Idx) {
10519 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10520 return cast<ConstantInt>(V->getValue())->getZExtValue();
10521 };
10522
10523 auto &&GetMDString = [MN](unsigned Idx) {
10524 auto *V = cast<MDString>(MN->getOperand(Idx));
10525 return V->getString();
10526 };
10527
10528 switch (GetMDInt(0)) {
10529 default:
10530 llvm_unreachable("Unexpected metadata!");
10531 break;
10532 case OffloadEntriesInfoManager::OffloadEntryInfo::
10533 OffloadingEntryInfoTargetRegion: {
10534 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10535 /*DeviceID=*/GetMDInt(1),
10536 /*FileID=*/GetMDInt(2),
10537 /*Line=*/GetMDInt(4),
10538 /*Count=*/GetMDInt(5));
10539 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10540 /*Order=*/GetMDInt(6));
10541 break;
10542 }
10543 case OffloadEntriesInfoManager::OffloadEntryInfo::
10544 OffloadingEntryInfoDeviceGlobalVar:
10545 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10546 /*MangledName=*/GetMDString(1),
10547 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10548 /*Flags=*/GetMDInt(2)),
10549 /*Order=*/GetMDInt(3));
10550 break;
10551 }
10552 }
10553}
10554
10555void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10556 StringRef HostFilePath) {
10557 if (HostFilePath.empty())
10558 return;
10559
10560 auto Buf = VFS.getBufferForFile(HostFilePath);
10561 if (std::error_code Err = Buf.getError()) {
10562 report_fatal_error(("error opening host file from host file path inside of "
10563 "OpenMPIRBuilder: " +
10564 Err.message())
10565 .c_str());
10566 }
10567
10568 LLVMContext Ctx;
10570 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10571 if (std::error_code Err = M.getError()) {
10573 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10574 .c_str());
10575 }
10576
10577 loadOffloadInfoMetadata(*M.get());
10578}
10579
10580//===----------------------------------------------------------------------===//
10581// OffloadEntriesInfoManager
10582//===----------------------------------------------------------------------===//
10583
10584bool OffloadEntriesInfoManager::empty() const {
10585 return OffloadEntriesTargetRegion.empty() &&
10586 OffloadEntriesDeviceGlobalVar.empty();
10587}
10588
10589unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10590 const TargetRegionEntryInfo &EntryInfo) const {
10591 auto It = OffloadEntriesTargetRegionCount.find(
10592 getTargetRegionEntryCountKey(EntryInfo));
10593 if (It == OffloadEntriesTargetRegionCount.end())
10594 return 0;
10595 return It->second;
10596}
10597
10598void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10599 const TargetRegionEntryInfo &EntryInfo) {
10600 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10601 EntryInfo.Count + 1;
10602}
10603
10604/// Initialize target region entry.
10605void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10606 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10607 OffloadEntriesTargetRegion[EntryInfo] =
10608 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10609 OMPTargetRegionEntryTargetRegion);
10610 ++OffloadingEntriesNum;
10611}
10612
10613void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10614 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10615 OMPTargetRegionEntryKind Flags) {
10616 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10617
10618 // Update the EntryInfo with the next available count for this location.
10619 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10620
10621 // If we are emitting code for a target, the entry is already initialized,
10622 // only has to be registered.
10623 if (OMPBuilder->Config.isTargetDevice()) {
10624 // This could happen if the device compilation is invoked standalone.
10625 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10626 return;
10627 }
10628 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10629 Entry.setAddress(Addr);
10630 Entry.setID(ID);
10631 Entry.setFlags(Flags);
10632 } else {
10633 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10634 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10635 return;
10636 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10637 "Target region entry already registered!");
10638 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10639 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10640 ++OffloadingEntriesNum;
10641 }
10642 incrementTargetRegionEntryInfoCount(EntryInfo);
10643}
10644
10645bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10646 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10647
10648 // Update the EntryInfo with the next available count for this location.
10649 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10650
10651 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10652 if (It == OffloadEntriesTargetRegion.end()) {
10653 return false;
10654 }
10655 // Fail if this entry is already registered.
10656 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10657 return false;
10658 return true;
10659}
10660
10661void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10662 const OffloadTargetRegionEntryInfoActTy &Action) {
10663 // Scan all target region entries and perform the provided action.
10664 for (const auto &It : OffloadEntriesTargetRegion) {
10665 Action(It.first, It.second);
10666 }
10667}
10668
// Pre-create a slot for a declare-target global variable; address, size and
// linkage are filled in later by registerDeviceGlobalVarEntryInfo.
void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
    StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
  OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
  ++OffloadingEntriesNum;
}
10674
// Register (or complete) the entry for a declare-target global variable.
// Device side: only fills in entries that were pre-initialized from the host
// metadata. Host side: creates a fresh entry, or completes the size/linkage of
// an existing one.
void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
    StringRef VarName, Constant *Addr, int64_t VarSize,
    OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
  if (OMPBuilder->Config.isTargetDevice()) {
    // This could happen if the device compilation is invoked standalone.
    if (!hasDeviceGlobalVarEntryInfo(VarName))
      return;
    auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
    // Entry already has an address: only fill in a previously unknown size
    // (and the matching linkage); never overwrite the recorded address.
    if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    // First time we see a concrete address for this entry.
    Entry.setVarSize(VarSize);
    Entry.setLinkage(Linkage);
    Entry.setAddress(Addr);
  } else {
    if (hasDeviceGlobalVarEntryInfo(VarName)) {
      auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
      assert(Entry.isValid() && Entry.getFlags() == Flags &&
             "Entry not initialized!");
      // Complete a pre-initialized entry whose size was still unknown.
      if (Entry.getVarSize() == 0) {
        Entry.setVarSize(VarSize);
        Entry.setLinkage(Linkage);
      }
      return;
    }
    // Indirect entries additionally record the variable name for the runtime.
    if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
      OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
                                                Addr, VarSize, Flags, Linkage,
                                                VarName.str());
    else
      OffloadEntriesDeviceGlobalVar.try_emplace(
          VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
    ++OffloadingEntriesNum;
  }
}
10714
10715void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10716 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10717 // Scan all target region entries and perform the provided action.
10718 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10719 Action(E.getKey(), E.getValue());
10720}
10721
10722//===----------------------------------------------------------------------===//
10723// CanonicalLoopInfo
10724//===----------------------------------------------------------------------===//
10725
10726void CanonicalLoopInfo::collectControlBlocks(
10728 // We only count those BBs as control block for which we do not need to
10729 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
10730 // flow. For consistency, this also means we do not add the Body block, which
10731 // is just the entry to the body code.
10732 BBs.reserve(BBs.size() + 6);
10733 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
10734}
10735
10736BasicBlock *CanonicalLoopInfo::getPreheader() const {
10737 assert(isValid() && "Requires a valid canonical loop");
10738 for (BasicBlock *Pred : predecessors(Header)) {
10739 if (Pred != Latch)
10740 return Pred;
10741 }
10742 llvm_unreachable("Missing preheader");
10743}
10744
10745void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10746 assert(isValid() && "Requires a valid canonical loop");
10747
10748 Instruction *CmpI = &getCond()->front();
10749 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10750 CmpI->setOperand(1, TripCount);
10751
10752#ifndef NDEBUG
10753 assertOK();
10754#endif
10755}
10756
10757void CanonicalLoopInfo::mapIndVar(
10758 llvm::function_ref<Value *(Instruction *)> Updater) {
10759 assert(isValid() && "Requires a valid canonical loop");
10760
10761 Instruction *OldIV = getIndVar();
10762
10763 // Record all uses excluding those introduced by the updater. Uses by the
10764 // CanonicalLoopInfo itself to keep track of the number of iterations are
10765 // excluded.
10766 SmallVector<Use *> ReplacableUses;
10767 for (Use &U : OldIV->uses()) {
10768 auto *User = dyn_cast<Instruction>(U.getUser());
10769 if (!User)
10770 continue;
10771 if (User->getParent() == getCond())
10772 continue;
10773 if (User->getParent() == getLatch())
10774 continue;
10775 ReplacableUses.push_back(&U);
10776 }
10777
10778 // Run the updater that may introduce new uses
10779 Value *NewIV = Updater(OldIV);
10780
10781 // Replace the old uses with the value returned by the updater.
10782 for (Use *U : ReplacableUses)
10783 U->set(NewIV);
10784
10785#ifndef NDEBUG
10786 assertOK();
10787#endif
10788}
10789
10790void CanonicalLoopInfo::assertOK() const {
10791#ifndef NDEBUG
10792 // No constraints if this object currently does not describe a loop.
10793 if (!isValid())
10794 return;
10795
10796 BasicBlock *Preheader = getPreheader();
10797 BasicBlock *Body = getBody();
10798 BasicBlock *After = getAfter();
10799
10800 // Verify standard control-flow we use for OpenMP loops.
10801 assert(Preheader);
10802 assert(isa<BranchInst>(Preheader->getTerminator()) &&
10803 "Preheader must terminate with unconditional branch");
10804 assert(Preheader->getSingleSuccessor() == Header &&
10805 "Preheader must jump to header");
10806
10807 assert(Header);
10808 assert(isa<BranchInst>(Header->getTerminator()) &&
10809 "Header must terminate with unconditional branch");
10810 assert(Header->getSingleSuccessor() == Cond &&
10811 "Header must jump to exiting block");
10812
10813 assert(Cond);
10814 assert(Cond->getSinglePredecessor() == Header &&
10815 "Exiting block only reachable from header");
10816
10817 assert(isa<BranchInst>(Cond->getTerminator()) &&
10818 "Exiting block must terminate with conditional branch");
10819 assert(size(successors(Cond)) == 2 &&
10820 "Exiting block must have two successors");
10821 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
10822 "Exiting block's first successor jump to the body");
10823 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
10824 "Exiting block's second successor must exit the loop");
10825
10826 assert(Body);
10827 assert(Body->getSinglePredecessor() == Cond &&
10828 "Body only reachable from exiting block");
10829 assert(!isa<PHINode>(Body->front()));
10830
10831 assert(Latch);
10833 "Latch must terminate with unconditional branch");
10834 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
10835 // TODO: To support simple redirecting of the end of the body code that has
10836 // multiple; introduce another auxiliary basic block like preheader and after.
10837 assert(Latch->getSinglePredecessor() != nullptr);
10838 assert(!isa<PHINode>(Latch->front()));
10839
10840 assert(Exit);
10841 assert(isa<BranchInst>(Exit->getTerminator()) &&
10842 "Exit block must terminate with unconditional branch");
10843 assert(Exit->getSingleSuccessor() == After &&
10844 "Exit block must jump to after block");
10845
10846 assert(After);
10847 assert(After->getSinglePredecessor() == Exit &&
10848 "After block only reachable from exit block");
10849 assert(After->empty() || !isa<PHINode>(After->front()));
10850
10851 Instruction *IndVar = getIndVar();
10852 assert(IndVar && "Canonical induction variable not found?");
10853 assert(isa<IntegerType>(IndVar->getType()) &&
10854 "Induction variable must be an integer");
10855 assert(cast<PHINode>(IndVar)->getParent() == Header &&
10856 "Induction variable must be a PHI in the loop header");
10857 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
10858 assert(
10859 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
10860 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
10861
10862 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
10863 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
10864 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
10865 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
10866 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
10867 ->isOne());
10868
10869 Value *TripCount = getTripCount();
10870 assert(TripCount && "Loop trip count not found?");
10871 assert(IndVar->getType() == TripCount->getType() &&
10872 "Trip count and induction variable must have the same type");
10873
10874 auto *CmpI = cast<CmpInst>(&Cond->front());
10875 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
10876 "Exit condition must be a signed less-than comparison");
10877 assert(CmpI->getOperand(0) == IndVar &&
10878 "Exit condition must compare the induction variable");
10879 assert(CmpI->getOperand(1) == TripCount &&
10880 "Exit condition must compare with the trip count");
10881#endif
10882}
10883
// Drop the loop description: null out the control blocks so that isValid()
// (and hence assertOK()) treats this object as not describing a loop anymore.
void CanonicalLoopInfo::invalidate() {
  Header = nullptr;
  Cond = nullptr;
  Latch = nullptr;
  Exit = nullptr;
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:55
#define I(x, y, z)
Definition MD5.cpp:58
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause)
Determine the schedule type using schedule and ordering clause arguments.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition ArrayRef.h:143
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:138
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:284
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:165
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:637
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:444
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:762
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:665
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:524
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:319
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:59
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:229
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:414
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:620
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1050
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1112
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:413
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1128
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:298
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:297
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:281
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:294
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:301
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:382
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:538
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:948
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:546
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:180
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:314
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:477
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:345
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...