LLVM 22.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
29#include "llvm/IR/Attributes.h"
30#include "llvm/IR/BasicBlock.h"
31#include "llvm/IR/CFG.h"
32#include "llvm/IR/CallingConv.h"
33#include "llvm/IR/Constant.h"
34#include "llvm/IR/Constants.h"
35#include "llvm/IR/DIBuilder.h"
38#include "llvm/IR/Function.h"
40#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/LLVMContext.h"
44#include "llvm/IR/MDBuilder.h"
45#include "llvm/IR/Metadata.h"
47#include "llvm/IR/PassManager.h"
49#include "llvm/IR/Value.h"
62
63#include <cstdint>
64#include <optional>
65
66#define DEBUG_TYPE "openmp-ir-builder"
67
68using namespace llvm;
69using namespace omp;
70
/// Hidden command-line flag. When enabled, runtime-call declarations are
/// annotated with optimistic "as-if" attributes that describe how the calls
/// behave conceptually, even if a particular runtime implementation may not
/// strictly satisfy them. Off by default.
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));
76
78 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
79 cl::desc("Factor for the unroll threshold to account for code "
80 "simplifications still taking place"),
81 cl::init(1.5));
82
83#ifndef NDEBUG
84/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
85/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
86/// an InsertPoint stores the instruction before something is inserted. For
87/// instance, if both point to the same instruction, two IRBuilders alternating
88/// creating instruction will cause the instructions to be interleaved.
91 if (!IP1.isSet() || !IP2.isSet())
92 return false;
93 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
94}
95
97 // Valid ordered/unordered and base algorithm combinations.
98 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
99 case OMPScheduleType::UnorderedStaticChunked:
100 case OMPScheduleType::UnorderedStatic:
101 case OMPScheduleType::UnorderedDynamicChunked:
102 case OMPScheduleType::UnorderedGuidedChunked:
103 case OMPScheduleType::UnorderedRuntime:
104 case OMPScheduleType::UnorderedAuto:
105 case OMPScheduleType::UnorderedTrapezoidal:
106 case OMPScheduleType::UnorderedGreedy:
107 case OMPScheduleType::UnorderedBalanced:
108 case OMPScheduleType::UnorderedGuidedIterativeChunked:
109 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
110 case OMPScheduleType::UnorderedSteal:
111 case OMPScheduleType::UnorderedStaticBalancedChunked:
112 case OMPScheduleType::UnorderedGuidedSimd:
113 case OMPScheduleType::UnorderedRuntimeSimd:
114 case OMPScheduleType::OrderedStaticChunked:
115 case OMPScheduleType::OrderedStatic:
116 case OMPScheduleType::OrderedDynamicChunked:
117 case OMPScheduleType::OrderedGuidedChunked:
118 case OMPScheduleType::OrderedRuntime:
119 case OMPScheduleType::OrderedAuto:
120 case OMPScheduleType::OrderdTrapezoidal:
121 case OMPScheduleType::NomergeUnorderedStaticChunked:
122 case OMPScheduleType::NomergeUnorderedStatic:
123 case OMPScheduleType::NomergeUnorderedDynamicChunked:
124 case OMPScheduleType::NomergeUnorderedGuidedChunked:
125 case OMPScheduleType::NomergeUnorderedRuntime:
126 case OMPScheduleType::NomergeUnorderedAuto:
127 case OMPScheduleType::NomergeUnorderedTrapezoidal:
128 case OMPScheduleType::NomergeUnorderedGreedy:
129 case OMPScheduleType::NomergeUnorderedBalanced:
130 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
131 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
132 case OMPScheduleType::NomergeUnorderedSteal:
133 case OMPScheduleType::NomergeOrderedStaticChunked:
134 case OMPScheduleType::NomergeOrderedStatic:
135 case OMPScheduleType::NomergeOrderedDynamicChunked:
136 case OMPScheduleType::NomergeOrderedGuidedChunked:
137 case OMPScheduleType::NomergeOrderedRuntime:
138 case OMPScheduleType::NomergeOrderedAuto:
139 case OMPScheduleType::NomergeOrderedTrapezoidal:
140 case OMPScheduleType::OrderedDistributeChunked:
141 case OMPScheduleType::OrderedDistribute:
142 break;
143 default:
144 return false;
145 }
146
147 // Must not set both monotonicity modifiers at the same time.
148 OMPScheduleType MonotonicityFlags =
149 SchedType & OMPScheduleType::MonotonicityMask;
150 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
151 return false;
152
153 return true;
154}
155#endif
156
157/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
158/// debug location to the last instruction in the specified basic block if the
159/// insert point points to the end of the block.
162 Builder.restoreIP(IP);
163 llvm::BasicBlock *BB = Builder.GetInsertBlock();
164 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
165 if (!BB->empty() && I == BB->end())
166 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
167}
168
169static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
170 if (T.isAMDGPU()) {
171 StringRef Features =
172 Kernel->getFnAttribute("target-features").getValueAsString();
173 if (Features.count("+wavefrontsize64"))
176 }
177 if (T.isNVPTX())
179 if (T.isSPIRV())
181 llvm_unreachable("No grid value available for this architecture!");
182}
183
184/// Determine which scheduling algorithm to use, determined from schedule clause
185/// arguments.
186static OMPScheduleType
187getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
188 bool HasSimdModifier, bool HasDistScheduleChunks) {
189 // Currently, the default schedule it static.
190 switch (ClauseKind) {
191 case OMP_SCHEDULE_Default:
192 case OMP_SCHEDULE_Static:
193 return HasChunks ? OMPScheduleType::BaseStaticChunked
194 : OMPScheduleType::BaseStatic;
195 case OMP_SCHEDULE_Dynamic:
196 return OMPScheduleType::BaseDynamicChunked;
197 case OMP_SCHEDULE_Guided:
198 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
199 : OMPScheduleType::BaseGuidedChunked;
200 case OMP_SCHEDULE_Auto:
202 case OMP_SCHEDULE_Runtime:
203 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
204 : OMPScheduleType::BaseRuntime;
205 case OMP_SCHEDULE_Distribute:
206 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
207 : OMPScheduleType::BaseDistribute;
208 }
209 llvm_unreachable("unhandled schedule clause argument");
210}
211
212/// Adds ordering modifier flags to schedule type.
213static OMPScheduleType
215 bool HasOrderedClause) {
216 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
217 OMPScheduleType::None &&
218 "Must not have ordering nor monotonicity flags already set");
219
220 OMPScheduleType OrderingModifier = HasOrderedClause
221 ? OMPScheduleType::ModifierOrdered
222 : OMPScheduleType::ModifierUnordered;
223 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
224
225 // Unsupported combinations
226 if (OrderingScheduleType ==
227 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
228 return OMPScheduleType::OrderedGuidedChunked;
229 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
230 OMPScheduleType::ModifierOrdered))
231 return OMPScheduleType::OrderedRuntime;
232
233 return OrderingScheduleType;
234}
235
236/// Adds monotonicity modifier flags to schedule type.
237static OMPScheduleType
239 bool HasSimdModifier, bool HasMonotonic,
240 bool HasNonmonotonic, bool HasOrderedClause) {
241 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
242 OMPScheduleType::None &&
243 "Must not have monotonicity flags already set");
244 assert((!HasMonotonic || !HasNonmonotonic) &&
245 "Monotonic and Nonmonotonic are contradicting each other");
246
247 if (HasMonotonic) {
248 return ScheduleType | OMPScheduleType::ModifierMonotonic;
249 } else if (HasNonmonotonic) {
250 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
251 } else {
252 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
253 // If the static schedule kind is specified or if the ordered clause is
254 // specified, and if the nonmonotonic modifier is not specified, the
255 // effect is as if the monotonic modifier is specified. Otherwise, unless
256 // the monotonic modifier is specified, the effect is as if the
257 // nonmonotonic modifier is specified.
258 OMPScheduleType BaseScheduleType =
259 ScheduleType & ~OMPScheduleType::ModifierMask;
260 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
261 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
262 HasOrderedClause) {
263 // The monotonic is used by default in openmp runtime library, so no need
264 // to set it.
265 return ScheduleType;
266 } else {
267 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
268 }
269 }
270}
271
272/// Determine the schedule type using schedule and ordering clause arguments.
273static OMPScheduleType
274computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
275 bool HasSimdModifier, bool HasMonotonicModifier,
276 bool HasNonmonotonicModifier, bool HasOrderedClause,
277 bool HasDistScheduleChunks) {
279 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
280 OMPScheduleType OrderedSchedule =
281 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
283 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
284 HasNonmonotonicModifier, HasOrderedClause);
285
287 return Result;
288}
289
290/// Make \p Source branch to \p Target.
291///
292/// Handles two situations:
293/// * \p Source already has an unconditional branch.
294/// * \p Source is a degenerate block (no terminator because the BB is
295/// the current head of the IR construction).
297 if (Instruction *Term = Source->getTerminator()) {
298 auto *Br = cast<BranchInst>(Term);
299 assert(!Br->isConditional() &&
300 "BB's terminator must be an unconditional branch (or degenerate)");
301 BasicBlock *Succ = Br->getSuccessor(0);
302 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
303 Br->setSuccessor(0, Target);
304 return;
305 }
306
307 auto *NewBr = BranchInst::Create(Target, Source);
308 NewBr->setDebugLoc(DL);
309}
310
311void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
312 bool CreateBranch, DebugLoc DL) {
313 assert(New->getFirstInsertionPt() == New->begin() &&
314 "Target BB must not have PHI nodes");
315
316 // Move instructions to new block.
317 BasicBlock *Old = IP.getBlock();
318 // If the `Old` block is empty then there are no instructions to move. But in
319 // the new debug scheme, it could have trailing debug records which will be
320 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
321 // reasons:
322 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
323 // 2. Even if `New` is not empty, the rationale to move those records to `New`
324 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
325 // assumes that `Old` is optimized out and is going away. This is not the case
326 // here. The `Old` block is still being used e.g. a branch instruction is
327 // added to it later in this function.
328 // So we call `BasicBlock::splice` only when `Old` is not empty.
329 if (!Old->empty())
330 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
331
332 if (CreateBranch) {
333 auto *NewBr = BranchInst::Create(New, Old);
334 NewBr->setDebugLoc(DL);
335 }
336}
337
338void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
339 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
340 BasicBlock *Old = Builder.GetInsertBlock();
341
342 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
343 if (CreateBranch)
344 Builder.SetInsertPoint(Old->getTerminator());
345 else
346 Builder.SetInsertPoint(Old);
347
348 // SetInsertPoint also updates the Builder's debug location, but we want to
349 // keep the one the Builder was configured to use.
350 Builder.SetCurrentDebugLocation(DebugLoc);
351}
352
353BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
354 DebugLoc DL, llvm::Twine Name) {
355 BasicBlock *Old = IP.getBlock();
357 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
358 Old->getParent(), Old->getNextNode());
359 spliceBB(IP, New, CreateBranch, DL);
360 New->replaceSuccessorsPhiUsesWith(Old, New);
361 return New;
362}
363
364BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
365 llvm::Twine Name) {
366 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
367 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
368 if (CreateBranch)
369 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
370 else
371 Builder.SetInsertPoint(Builder.GetInsertBlock());
372 // SetInsertPoint also updates the Builder's debug location, but we want to
373 // keep the one the Builder was configured to use.
374 Builder.SetCurrentDebugLocation(DebugLoc);
375 return New;
376}
377
378BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
379 llvm::Twine Name) {
380 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
381 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
382 if (CreateBranch)
383 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
384 else
385 Builder.SetInsertPoint(Builder.GetInsertBlock());
386 // SetInsertPoint also updates the Builder's debug location, but we want to
387 // keep the one the Builder was configured to use.
388 Builder.SetCurrentDebugLocation(DebugLoc);
389 return New;
390}
391
392BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
393 llvm::Twine Suffix) {
394 BasicBlock *Old = Builder.GetInsertBlock();
395 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
396}
397
398// This function creates a fake integer value and a fake use for the integer
399// value. It returns the fake value created. This is useful in modeling the
400// extra arguments to the outlined functions.
402 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
404 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
405 const Twine &Name = "", bool AsPtr = true) {
406 Builder.restoreIP(OuterAllocaIP);
407 Instruction *FakeVal;
408 AllocaInst *FakeValAddr =
409 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
410 ToBeDeleted.push_back(FakeValAddr);
411
412 if (AsPtr) {
413 FakeVal = FakeValAddr;
414 } else {
415 FakeVal =
416 Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
417 ToBeDeleted.push_back(FakeVal);
418 }
419
420 // Generate a fake use of this value
421 Builder.restoreIP(InnerAllocaIP);
422 Instruction *UseFakeVal;
423 if (AsPtr) {
424 UseFakeVal =
425 Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
426 } else {
427 UseFakeVal =
428 cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
429 }
430 ToBeDeleted.push_back(UseFakeVal);
431 return FakeVal;
432}
433
434//===----------------------------------------------------------------------===//
435// OpenMPIRBuilderConfig
436//===----------------------------------------------------------------------===//
437
namespace {

/// Values for bit flags for marking which requires clauses have been used.
/// These bits are accumulated into OpenMPIRBuilderConfig::RequiresFlags and
/// reported via getRequiresFlags().
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
458
/// Default configuration: no requires clauses have been registered yet.
OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}
461
462OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
463 bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
464 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
465 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
466 : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
467 OpenMPOffloadMandatory(OpenMPOffloadMandatory),
468 RequiresFlags(OMP_REQ_UNDEFINED) {
469 if (HasRequiresReverseOffload)
470 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
471 if (HasRequiresUnifiedAddress)
472 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
473 if (HasRequiresUnifiedSharedMemory)
474 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
475 if (HasRequiresDynamicAllocators)
476 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
477}
478
479bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
480 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
481}
482
483bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
484 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
485}
486
487bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
488 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
489}
490
491bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
492 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
493}
494
495int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
496 return hasRequiresFlags() ? RequiresFlags
497 : static_cast<int64_t>(OMP_REQ_NONE);
498}
499
500void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
501 if (Value)
502 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
503 else
504 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
505}
506
507void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
508 if (Value)
509 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
510 else
511 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
512}
513
514void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
515 if (Value)
516 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
517 else
518 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
519}
520
521void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
522 if (Value)
523 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
524 else
525 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
526}
527
528//===----------------------------------------------------------------------===//
529// OpenMPIRBuilder
530//===----------------------------------------------------------------------===//
531
/// Populate \p ArgsVector with the values that make up the kernel-arguments
/// struct passed to the offload runtime for a target kernel launch.
void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  // Version tag of the argument layout understood by the runtime.
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  // Teams/threads are passed as 3-element i32 arrays; unspecified trailing
  // dimensions remain zero.
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  // Flags word: bit 0 carries HasNoWait; the fallback mode is shifted into
  // the bits starting at position 2 before being OR'ed in.
  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  // Fill dimension 0 unconditionally, then any additional provided
  // dimensions up to MaxDim.
  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  // NOTE(review): the element order below presumably mirrors the runtime's
  // kernel-arguments struct layout -- keep it in sync with the runtime.
  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}
577
578void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
579 LLVMContext &Ctx = Fn.getContext();
580
581 // Get the function's current attributes.
582 auto Attrs = Fn.getAttributes();
583 auto FnAttrs = Attrs.getFnAttrs();
584 auto RetAttrs = Attrs.getRetAttrs();
586 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
587 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
588
589 // Add AS to FnAS while taking special care with integer extensions.
590 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
591 bool Param = true) -> void {
592 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
593 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
594 if (HasSignExt || HasZeroExt) {
595 assert(AS.getNumAttributes() == 1 &&
596 "Currently not handling extension attr combined with others.");
597 if (Param) {
598 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
599 FnAS = FnAS.addAttribute(Ctx, AK);
600 } else if (auto AK =
601 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
602 FnAS = FnAS.addAttribute(Ctx, AK);
603 } else {
604 FnAS = FnAS.addAttributes(Ctx, AS);
605 }
606 };
607
608#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
609#include "llvm/Frontend/OpenMP/OMPKinds.def"
610
611 // Add attributes to the function declaration.
612 switch (FnID) {
613#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
614 case Enum: \
615 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
616 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
617 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
618 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
619 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
620 break;
621#include "llvm/Frontend/OpenMP/OMPKinds.def"
622 default:
623 // Attributes are optional.
624 break;
625 }
626}
627
629OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
630 FunctionType *FnTy = nullptr;
631 Function *Fn = nullptr;
632
633 // Try to find the declation in the module first.
634 switch (FnID) {
635#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
636 case Enum: \
637 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
638 IsVarArg); \
639 Fn = M.getFunction(Str); \
640 break;
641#include "llvm/Frontend/OpenMP/OMPKinds.def"
642 }
643
644 if (!Fn) {
645 // Create a new declaration if we need one.
646 switch (FnID) {
647#define OMP_RTL(Enum, Str, ...) \
648 case Enum: \
649 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
650 break;
651#include "llvm/Frontend/OpenMP/OMPKinds.def"
652 }
653 Fn->setCallingConv(Config.getRuntimeCC());
654 // Add information if the runtime function takes a callback function
655 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
656 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
657 LLVMContext &Ctx = Fn->getContext();
658 MDBuilder MDB(Ctx);
659 // Annotate the callback behavior of the runtime function:
660 // - The callback callee is argument number 2 (microtask).
661 // - The first two arguments of the callback callee are unknown (-1).
662 // - All variadic arguments to the runtime function are passed to the
663 // callback callee.
664 Fn->addMetadata(
665 LLVMContext::MD_callback,
666 *MDNode::get(Ctx, {MDB.createCallbackEncoding(
667 2, {-1, -1}, /* VarArgsArePassed */ true)}));
668 }
669 }
670
671 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
672 << " with type " << *Fn->getFunctionType() << "\n");
673 addAttributes(FnID, *Fn);
674
675 } else {
676 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
677 << " with type " << *Fn->getFunctionType() << "\n");
678 }
679
680 assert(Fn && "Failed to create OpenMP runtime function");
681
682 return {FnTy, Fn};
683}
684
685Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
686 FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
687 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
688 assert(Fn && "Failed to create OpenMP runtime function pointer");
689 return Fn;
690}
691
692CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(FunctionCallee Callee,
694 StringRef Name) {
695 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
696 Call->setCallingConv(Config.getRuntimeCC());
697 return Call;
698}
699
700void OpenMPIRBuilder::initialize() { initializeTypes(M); }
701
704 BasicBlock &EntryBlock = Function->getEntryBlock();
705 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
706
707 // Loop over blocks looking for constant allocas, skipping the entry block
708 // as any allocas there are already in the desired location.
709 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
710 Block++) {
711 for (auto Inst = Block->getReverseIterator()->begin();
712 Inst != Block->getReverseIterator()->end();) {
714 Inst++;
716 continue;
717 AllocaInst->moveBeforePreserving(MoveLocInst);
718 } else {
719 Inst++;
720 }
721 }
722 }
723}
724
725void OpenMPIRBuilder::finalize(Function *Fn) {
726 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
728 SmallVector<OutlineInfo, 16> DeferredOutlines;
729 for (OutlineInfo &OI : OutlineInfos) {
730 // Skip functions that have not finalized yet; may happen with nested
731 // function generation.
732 if (Fn && OI.getFunction() != Fn) {
733 DeferredOutlines.push_back(OI);
734 continue;
735 }
736
737 ParallelRegionBlockSet.clear();
738 Blocks.clear();
739 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
740
741 Function *OuterFn = OI.getFunction();
742 CodeExtractorAnalysisCache CEAC(*OuterFn);
743 // If we generate code for the target device, we need to allocate
744 // struct for aggregate params in the device default alloca address space.
745 // OpenMP runtime requires that the params of the extracted functions are
746 // passed as zero address space pointers. This flag ensures that
747 // CodeExtractor generates correct code for extracted functions
748 // which are used by OpenMP runtime.
749 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
750 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
751 /* AggregateArgs */ true,
752 /* BlockFrequencyInfo */ nullptr,
753 /* BranchProbabilityInfo */ nullptr,
754 /* AssumptionCache */ nullptr,
755 /* AllowVarArgs */ true,
756 /* AllowAlloca */ true,
757 /* AllocaBlock*/ OI.OuterAllocaBB,
758 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
759
760 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
761 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
762 << " Exit: " << OI.ExitBB->getName() << "\n");
763 assert(Extractor.isEligible() &&
764 "Expected OpenMP outlining to be possible!");
765
766 for (auto *V : OI.ExcludeArgsFromAggregate)
767 Extractor.excludeArgFromAggregate(V);
768
769 Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
770
771 // Forward target-cpu, target-features attributes to the outlined function.
772 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
773 if (TargetCpuAttr.isStringAttribute())
774 OutlinedFn->addFnAttr(TargetCpuAttr);
775
776 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
777 if (TargetFeaturesAttr.isStringAttribute())
778 OutlinedFn->addFnAttr(TargetFeaturesAttr);
779
780 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
781 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
782 assert(OutlinedFn->getReturnType()->isVoidTy() &&
783 "OpenMP outlined functions should not return a value!");
784
785 // For compability with the clang CG we move the outlined function after the
786 // one with the parallel region.
787 OutlinedFn->removeFromParent();
788 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
789
790 // Remove the artificial entry introduced by the extractor right away, we
791 // made our own entry block after all.
792 {
793 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
794 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
795 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
796 // Move instructions from the to-be-deleted ArtificialEntry to the entry
797 // basic block of the parallel region. CodeExtractor generates
798 // instructions to unwrap the aggregate argument and may sink
799 // allocas/bitcasts for values that are solely used in the outlined region
800 // and do not escape.
801 assert(!ArtificialEntry.empty() &&
802 "Expected instructions to add in the outlined region entry");
803 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
804 End = ArtificialEntry.rend();
805 It != End;) {
806 Instruction &I = *It;
807 It++;
808
809 if (I.isTerminator()) {
810 // Absorb any debug value that terminator may have
811 if (OI.EntryBB->getTerminator())
812 OI.EntryBB->getTerminator()->adoptDbgRecords(
813 &ArtificialEntry, I.getIterator(), false);
814 continue;
815 }
816
817 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
818 }
819
820 OI.EntryBB->moveBefore(&ArtificialEntry);
821 ArtificialEntry.eraseFromParent();
822 }
823 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
824 assert(OutlinedFn && OutlinedFn->hasNUses(1));
825
826 // Run a user callback, e.g. to add attributes.
827 if (OI.PostOutlineCB)
828 OI.PostOutlineCB(*OutlinedFn);
829 }
830
831 // Remove work items that have been completed.
832 OutlineInfos = std::move(DeferredOutlines);
833
834 // The createTarget functions embeds user written code into
835 // the target region which may inject allocas which need to
836 // be moved to the entry block of our target or risk malformed
837 // optimisations by later passes, this is only relevant for
838 // the device pass which appears to be a little more delicate
839 // when it comes to optimisations (however, we do not block on
840 // that here, it's up to the inserter to the list to do so).
841 // This notbaly has to occur after the OutlinedInfo candidates
842 // have been extracted so we have an end product that will not
843 // be implicitly adversely affected by any raises unless
844 // intentionally appended to the list.
845 // NOTE: This only does so for ConstantData, it could be extended
846 // to ConstantExpr's with further effort, however, they should
847 // largely be folded when they get here. Extending it to runtime
848 // defined/read+writeable allocation sizes would be non-trivial
849 // (need to factor in movement of any stores to variables the
850 // allocation size depends on, as well as the usual loads,
851 // otherwise it'll yield the wrong result after movement) and
852 // likely be more suitable as an LLVM optimisation pass.
853 for (Function *F : ConstantAllocaRaiseCandidates)
855
856 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
857 [](EmitMetadataErrorKind Kind,
858 const TargetRegionEntryInfo &EntryInfo) -> void {
859 errs() << "Error of kind: " << Kind
860 << " when emitting offload entries and metadata during "
861 "OMPIRBuilder finalization \n";
862 };
863
864 if (!OffloadInfoManager.empty())
865 createOffloadEntriesAndInfoMetadata(ErrorReportFn);
866
867 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
868 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
869 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
870 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
871 }
872
873 IsFinalized = true;
874}
875
876bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
877
OpenMPIRBuilder::~OpenMPIRBuilder() {
  // Pending outline requests are consumed by finalize(); reaching the
  // destructor with any left indicates a missed finalize() call, which would
  // silently drop codegen.
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}
881
882GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
883 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
884 auto *GV =
885 new GlobalVariable(M, I32Ty,
886 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
887 ConstantInt::get(I32Ty, Value), Name);
888 GV->setVisibility(GlobalValue::HiddenVisibility);
889
890 return GV;
891}
892
893void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
894 if (List.empty())
895 return;
896
897 // Convert List to what ConstantArray needs.
899 UsedArray.resize(List.size());
900 for (unsigned I = 0, E = List.size(); I != E; ++I)
902 cast<Constant>(&*List[I]), Builder.getPtrTy());
903
904 if (UsedArray.empty())
905 return;
906 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
907
908 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
909 ConstantArray::get(ATy, UsedArray), Name);
910
911 GV->setSection("llvm.metadata");
912}
913
915OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
917 auto *Int8Ty = Builder.getInt8Ty();
918 auto *GVMode = new GlobalVariable(
919 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
920 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
921 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
922 return GVMode;
923}
924
925Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
926 uint32_t SrcLocStrSize,
927 IdentFlag LocFlags,
928 unsigned Reserve2Flags) {
929 // Enable "C-mode".
930 LocFlags |= OMP_IDENT_FLAG_KMPC;
931
932 Constant *&Ident =
933 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
934 if (!Ident) {
935 Constant *I32Null = ConstantInt::getNullValue(Int32);
936 Constant *IdentData[] = {I32Null,
937 ConstantInt::get(Int32, uint32_t(LocFlags)),
938 ConstantInt::get(Int32, Reserve2Flags),
939 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
940
941 size_t SrcLocStrArgIdx = 4;
942 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
944 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
945 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
946 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
947 Constant *Initializer =
948 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
949
950 // Look for existing encoding of the location + flags, not needed but
951 // minimizes the difference to the existing solution while we transition.
952 for (GlobalVariable &GV : M.globals())
953 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
954 if (GV.getInitializer() == Initializer)
955 Ident = &GV;
956
957 if (!Ident) {
958 auto *GV = new GlobalVariable(
959 M, OpenMPIRBuilder::Ident,
960 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
962 M.getDataLayout().getDefaultGlobalsAddressSpace());
963 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
964 GV->setAlignment(Align(8));
965 Ident = GV;
966 }
967 }
968
970}
971
/// Return (and cache) a constant char array holding \p LocStr, reusing an
/// existing module global with an identical initializer when one exists.
/// \p SrcLocStrSize is set to the string's length (without terminator).
Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  // Cache reference: one entry per distinct location string; filling it in
  // below also populates the map.
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes the
    // difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        // Assign-and-return stores the found global into the cache slot.
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}
993
994Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
995 StringRef FileName,
996 unsigned Line, unsigned Column,
997 uint32_t &SrcLocStrSize) {
998 SmallString<128> Buffer;
999 Buffer.push_back(';');
1000 Buffer.append(FileName);
1001 Buffer.push_back(';');
1002 Buffer.append(FunctionName);
1003 Buffer.push_back(';');
1004 Buffer.append(std::to_string(Line));
1005 Buffer.push_back(';');
1006 Buffer.append(std::to_string(Column));
1007 Buffer.push_back(';');
1008 Buffer.push_back(';');
1009 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1010}
1011
1012Constant *
1013OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
1014 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1015 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1016}
1017
1018Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
1019 uint32_t &SrcLocStrSize,
1020 Function *F) {
1021 DILocation *DIL = DL.get();
1022 if (!DIL)
1023 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1024 StringRef FileName = M.getName();
1025 if (DIFile *DIF = DIL->getFile())
1026 if (std::optional<StringRef> Source = DIF->getSource())
1027 FileName = *Source;
1028 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1029 if (Function.empty() && F)
1030 Function = F->getName();
1031 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1032 DIL->getColumn(), SrcLocStrSize);
1033}
1034
1035Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
1036 uint32_t &SrcLocStrSize) {
1037 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1038 Loc.IP.getBlock()->getParent());
1039}
1040
1041Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
1042 return createRuntimeFunctionCall(
1043 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1044 "omp_global_thread_num");
1045}
1046
1047OpenMPIRBuilder::InsertPointOrErrorTy
1048OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
1049 bool ForceSimpleCall, bool CheckCancelFlag) {
1050 if (!updateToLocation(Loc))
1051 return Loc.IP;
1052
1053 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1054 // __kmpc_barrier(loc, thread_id);
1055
1056 IdentFlag BarrierLocFlags;
1057 switch (Kind) {
1058 case OMPD_for:
1059 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1060 break;
1061 case OMPD_sections:
1062 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1063 break;
1064 case OMPD_single:
1065 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1066 break;
1067 case OMPD_barrier:
1068 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1069 break;
1070 default:
1071 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1072 break;
1073 }
1074
1075 uint32_t SrcLocStrSize;
1076 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1077 Value *Args[] = {
1078 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1079 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1080
1081 // If we are in a cancellable parallel region, barriers are cancellation
1082 // points.
1083 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1084 bool UseCancelBarrier =
1085 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1086
1087 Value *Result = createRuntimeFunctionCall(
1088 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1089 ? OMPRTL___kmpc_cancel_barrier
1090 : OMPRTL___kmpc_barrier),
1091 Args);
1092
1093 if (UseCancelBarrier && CheckCancelFlag)
1094 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1095 return Err;
1096
1097 return Builder.saveIP();
1098}
1099
// Emit a __kmpc_cancel(loc, tid, kind) call, optionally guarded by
// \p IfCondition, and branch on its result so that a successful cancellation
// runs the finalization code of the cancelled region.
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  // With an if-condition only the "then" side performs the cancellation.
  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition)
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
  Builder.SetInsertPoint(ThenTI);

  // Map the directive to the runtime's cancel-kind constant via OMPKinds.def.
  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
  // Cancelling a parallel region is followed by a barrier; that barrier must
  // not re-check the cancel flag (CheckCancelFlag is false).
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1155
// Emit a __kmpc_cancellationpoint(loc, tid, kind) call and branch on its
// result so that a pending cancellation runs the finalization code of the
// cancelled region. Mirrors createCancel but without an if-condition guard.
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  // Map the directive to the runtime's cancel-kind constant via OMPKinds.def.
  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
  // Cancelling a parallel region is followed by a barrier; that barrier must
  // not re-check the cancel flag (CheckCancelFlag is false).
  auto ExitCB = [this, CanceledDirective, Loc](InsertPointTy IP) -> Error {
    if (CanceledDirective == OMPD_parallel) {
      IRBuilder<>::InsertPointGuard IPG(Builder);
      Builder.restoreIP(IP);
      return createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                           omp::Directive::OMPD_unknown,
                           /* ForceSimpleCall */ false,
                           /* CheckCancelFlag */ false)
          .takeError();
    }
    return Error::success();
  };

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective, ExitCB))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}
1206
1207OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
1208 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1209 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1210 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1211 if (!updateToLocation(Loc))
1212 return Loc.IP;
1213
1214 Builder.restoreIP(AllocaIP);
1215 auto *KernelArgsPtr =
1216 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1217 updateToLocation(Loc);
1218
1219 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1220 llvm::Value *Arg =
1221 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1222 Builder.CreateAlignedStore(
1223 KernelArgs[I], Arg,
1224 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1225 }
1226
1227 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1228 NumThreads, HostPtr, KernelArgsPtr};
1229
1230 Return = createRuntimeFunctionCall(
1231 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1232 OffloadingArgs);
1233
1234 return Builder.saveIP();
1235}
1236
// Emit a call to __tgt_target_kernel launching the outlined target region
// identified by \p OutlinedFnID; on a non-zero (failed) return the host
// fallback produced by \p EmitTargetCallFallbackCB is executed instead.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  // A non-zero return value signals the offload attempt failed.
  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  // Failure path: run the host fallback, then rejoin the continuation block.
  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
1299
1300Error OpenMPIRBuilder::emitCancelationCheckImpl(
1301 Value *CancelFlag, omp::Directive CanceledDirective,
1302 FinalizeCallbackTy ExitCB) {
1303 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1304 "Unexpected cancellation!");
1305
1306 // For a cancel barrier we create two new blocks.
1307 BasicBlock *BB = Builder.GetInsertBlock();
1308 BasicBlock *NonCancellationBlock;
1309 if (Builder.GetInsertPoint() == BB->end()) {
1310 // TODO: This branch will not be needed once we moved to the
1311 // OpenMPIRBuilder codegen completely.
1312 NonCancellationBlock = BasicBlock::Create(
1313 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1314 } else {
1315 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1317 Builder.SetInsertPoint(BB);
1318 }
1319 BasicBlock *CancellationBlock = BasicBlock::Create(
1320 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1321
1322 // Jump to them based on the return value.
1323 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1324 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1325 /* TODO weight */ nullptr, nullptr);
1326
1327 // From the cancellation block we finalize all variables and go to the
1328 // post finalization block that is known to the FiniCB callback.
1329 Builder.SetInsertPoint(CancellationBlock);
1330 if (ExitCB)
1331 if (Error Err = ExitCB(Builder.saveIP()))
1332 return Err;
1333 auto &FI = FinalizationStack.back();
1334 if (Error Err = FI.FiniCB(Builder.saveIP()))
1335 return Err;
1336
1337 // The continuation block is where code generation continues.
1338 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1339 return Error::success();
1340}
1341
1342// Callback used to create OpenMP runtime calls to support
1343// omp parallel clause for the device.
1344// We need to use this callback to replace call to the OutlinedFn in OuterFn
1345// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_51)
1347 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1348 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1349 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1350 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1351 // Add some known attributes.
1352 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1353 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1354 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1355 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1356 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1357 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1358
1359 assert(OutlinedFn.arg_size() >= 2 &&
1360 "Expected at least tid and bounded tid as arguments");
1361 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1362
1363 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1364 assert(CI && "Expected call instruction to outlined function");
1365 CI->getParent()->setName("omp_parallel");
1366
1367 Builder.SetInsertPoint(CI);
1368 Type *PtrTy = OMPIRBuilder->VoidPtr;
1369 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1370
1371 // Add alloca for kernel args
1372 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1373 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1374 AllocaInst *ArgsAlloca =
1375 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1376 Value *Args = ArgsAlloca;
1377 // Add address space cast if array for storing arguments is not allocated
1378 // in address space 0
1379 if (ArgsAlloca->getAddressSpace())
1380 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1381 Builder.restoreIP(CurrentIP);
1382
1383 // Store captured vars which are used by kmpc_parallel_51
1384 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1385 Value *V = *(CI->arg_begin() + 2 + Idx);
1386 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1387 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1388 Builder.CreateStore(V, StoreAddress);
1389 }
1390
1391 Value *Cond =
1392 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1393 : Builder.getInt32(1);
1394
1395 // Build kmpc_parallel_51 call
1396 Value *Parallel51CallArgs[] = {
1397 /* identifier*/ Ident,
1398 /* global thread num*/ ThreadID,
1399 /* if expression */ Cond,
1400 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1401 /* Proc bind */ Builder.getInt32(-1),
1402 /* outlined function */ &OutlinedFn,
1403 /* wrapper function */ NullPtrValue,
1404 /* arguments of the outlined funciton*/ Args,
1405 /* number of arguments */ Builder.getInt64(NumCapturedVars)};
1406
1407 FunctionCallee RTLFn =
1408 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_51);
1409
1410 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel51CallArgs);
1411
1412 LLVM_DEBUG(dbgs() << "With kmpc_parallel_51 placed: "
1413 << *Builder.GetInsertBlock()->getParent() << "\n");
1414
1415 // Initialize the local TID stack location with the argument value.
1416 Builder.SetInsertPoint(PrivTID);
1417 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1418 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1419 PrivTIDAddr);
1420
1421 // Remove redundant call to the outlined function.
1422 CI->eraseFromParent();
1423
1424 for (Instruction *I : ToBeDeleted) {
1425 I->eraseFromParent();
1426 }
1427}
1428
1429// Callback used to create OpenMP runtime calls to support
1430// omp parallel clause for the host.
1431// We need to use this callback to replace call to the OutlinedFn in OuterFn
1432// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1433static void
1434hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
1435 Function *OuterFn, Value *Ident, Value *IfCondition,
1436 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1437 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1438 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1439 FunctionCallee RTLFn;
1440 if (IfCondition) {
1441 RTLFn =
1442 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1443 } else {
1444 RTLFn =
1445 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1446 }
1447 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1448 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1449 LLVMContext &Ctx = F->getContext();
1450 MDBuilder MDB(Ctx);
1451 // Annotate the callback behavior of the __kmpc_fork_call:
1452 // - The callback callee is argument number 2 (microtask).
1453 // - The first two arguments of the callback callee are unknown (-1).
1454 // - All variadic arguments to the __kmpc_fork_call are passed to the
1455 // callback callee.
1456 F->addMetadata(LLVMContext::MD_callback,
1458 2, {-1, -1},
1459 /* VarArgsArePassed */ true)}));
1460 }
1461 }
1462 // Add some known attributes.
1463 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1464 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1465 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1466
1467 assert(OutlinedFn.arg_size() >= 2 &&
1468 "Expected at least tid and bounded tid as arguments");
1469 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1470
1471 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1472 CI->getParent()->setName("omp_parallel");
1473 Builder.SetInsertPoint(CI);
1474
1475 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1476 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1477 &OutlinedFn};
1478
1479 SmallVector<Value *, 16> RealArgs;
1480 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1481 if (IfCondition) {
1482 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1483 RealArgs.push_back(Cond);
1484 }
1485 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1486
1487 // __kmpc_fork_call_if always expects a void ptr as the last argument
1488 // If there are no arguments, pass a null pointer.
1489 auto PtrTy = OMPIRBuilder->VoidPtr;
1490 if (IfCondition && NumCapturedVars == 0) {
1491 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1492 RealArgs.push_back(NullPtrValue);
1493 }
1494
1495 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1496
1497 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1498 << *Builder.GetInsertBlock()->getParent() << "\n");
1499
1500 // Initialize the local TID stack location with the argument value.
1501 Builder.SetInsertPoint(PrivTID);
1502 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1503 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1504 PrivTIDAddr);
1505
1506 // Remove redundant call to the outlined function.
1507 CI->eraseFromParent();
1508
1509 for (Instruction *I : ToBeDeleted) {
1510 I->eraseFromParent();
1511 }
1512}
1513
1514OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1515 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1516 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1517 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1518 omp::ProcBindKind ProcBind, bool IsCancellable) {
1519 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1520
1521 if (!updateToLocation(Loc))
1522 return Loc.IP;
1523
1524 uint32_t SrcLocStrSize;
1525 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1526 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1527 Value *ThreadID = getOrCreateThreadID(Ident);
1528 // If we generate code for the target device, we need to allocate
1529 // struct for aggregate params in the device default alloca address space.
1530 // OpenMP runtime requires that the params of the extracted functions are
1531 // passed as zero address space pointers. This flag ensures that extracted
1532 // function arguments are declared in zero address space
1533 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1534
1535 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1536 // only if we compile for host side.
1537 if (NumThreads && !Config.isTargetDevice()) {
1538 Value *Args[] = {
1539 Ident, ThreadID,
1540 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1541 createRuntimeFunctionCall(
1542 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1543 }
1544
1545 if (ProcBind != OMP_PROC_BIND_default) {
1546 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1547 Value *Args[] = {
1548 Ident, ThreadID,
1549 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1550 createRuntimeFunctionCall(
1551 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1552 }
1553
1554 BasicBlock *InsertBB = Builder.GetInsertBlock();
1555 Function *OuterFn = InsertBB->getParent();
1556
1557 // Save the outer alloca block because the insertion iterator may get
1558 // invalidated and we still need this later.
1559 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1560
1561 // Vector to remember instructions we used only during the modeling but which
1562 // we want to delete at the end.
1564
1565 // Change the location to the outer alloca insertion point to create and
1566 // initialize the allocas we pass into the parallel region.
1567 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1568 Builder.restoreIP(NewOuter);
1569 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1570 AllocaInst *ZeroAddrAlloca =
1571 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1572 Instruction *TIDAddr = TIDAddrAlloca;
1573 Instruction *ZeroAddr = ZeroAddrAlloca;
1574 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1575 // Add additional casts to enforce pointers in zero address space
1576 TIDAddr = new AddrSpaceCastInst(
1577 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1578 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1579 ToBeDeleted.push_back(TIDAddr);
1580 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1581 PointerType ::get(M.getContext(), 0),
1582 "zero.addr.ascast");
1583 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1584 ToBeDeleted.push_back(ZeroAddr);
1585 }
1586
1587 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1588 // associated arguments in the outlined function, so we delete them later.
1589 ToBeDeleted.push_back(TIDAddrAlloca);
1590 ToBeDeleted.push_back(ZeroAddrAlloca);
1591
1592 // Create an artificial insertion point that will also ensure the blocks we
1593 // are about to split are not degenerated.
1594 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1595
1596 BasicBlock *EntryBB = UI->getParent();
1597 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1598 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1599 BasicBlock *PRegPreFiniBB =
1600 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1601 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1602
1603 auto FiniCBWrapper = [&](InsertPointTy IP) {
1604 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1605 // target to the region exit block.
1606 if (IP.getBlock()->end() == IP.getPoint()) {
1607 IRBuilder<>::InsertPointGuard IPG(Builder);
1608 Builder.restoreIP(IP);
1609 Instruction *I = Builder.CreateBr(PRegExitBB);
1610 IP = InsertPointTy(I->getParent(), I->getIterator());
1611 }
1613 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1614 "Unexpected insertion point for finalization call!");
1615 return FiniCB(IP);
1616 };
1617
1618 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1619
1620 // Generate the privatization allocas in the block that will become the entry
1621 // of the outlined function.
1622 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1623 InsertPointTy InnerAllocaIP = Builder.saveIP();
1624
1625 AllocaInst *PrivTIDAddr =
1626 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1627 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1628
1629 // Add some fake uses for OpenMP provided arguments.
1630 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1631 Instruction *ZeroAddrUse =
1632 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1633 ToBeDeleted.push_back(ZeroAddrUse);
1634
1635 // EntryBB
1636 // |
1637 // V
1638 // PRegionEntryBB <- Privatization allocas are placed here.
1639 // |
1640 // V
1641 // PRegionBodyBB <- BodeGen is invoked here.
1642 // |
1643 // V
1644 // PRegPreFiniBB <- The block we will start finalization from.
1645 // |
1646 // V
1647 // PRegionExitBB <- A common exit to simplify block collection.
1648 //
1649
1650 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1651
1652 // Let the caller create the body.
1653 assert(BodyGenCB && "Expected body generation callback!");
1654 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1655 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1656 return Err;
1657
1658 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1659
1660 OutlineInfo OI;
1661 if (Config.isTargetDevice()) {
1662 // Generate OpenMP target specific runtime call
1663 OI.PostOutlineCB = [=, ToBeDeletedVec =
1664 std::move(ToBeDeleted)](Function &OutlinedFn) {
1665 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1666 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1667 ThreadID, ToBeDeletedVec);
1668 };
1669 } else {
1670 // Generate OpenMP host runtime call
1671 OI.PostOutlineCB = [=, ToBeDeletedVec =
1672 std::move(ToBeDeleted)](Function &OutlinedFn) {
1673 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1674 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1675 };
1676 }
1677
1678 OI.OuterAllocaBB = OuterAllocaBlock;
1679 OI.EntryBB = PRegEntryBB;
1680 OI.ExitBB = PRegExitBB;
1681
1682 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1684 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1685
1686 CodeExtractorAnalysisCache CEAC(*OuterFn);
1687 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1688 /* AggregateArgs */ false,
1689 /* BlockFrequencyInfo */ nullptr,
1690 /* BranchProbabilityInfo */ nullptr,
1691 /* AssumptionCache */ nullptr,
1692 /* AllowVarArgs */ true,
1693 /* AllowAlloca */ true,
1694 /* AllocationBlock */ OuterAllocaBlock,
1695 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1696
1697 // Find inputs to, outputs from the code region.
1698 BasicBlock *CommonExit = nullptr;
1699 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1700 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1701
1702 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1703 /*CollectGlobalInputs=*/true);
1704
1705 Inputs.remove_if([&](Value *I) {
1707 return GV->getValueType() == OpenMPIRBuilder::Ident;
1708
1709 return false;
1710 });
1711
1712 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1713
1714 FunctionCallee TIDRTLFn =
1715 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1716
1717 auto PrivHelper = [&](Value &V) -> Error {
1718 if (&V == TIDAddr || &V == ZeroAddr) {
1719 OI.ExcludeArgsFromAggregate.push_back(&V);
1720 return Error::success();
1721 }
1722
1724 for (Use &U : V.uses())
1725 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1726 if (ParallelRegionBlockSet.count(UserI->getParent()))
1727 Uses.insert(&U);
1728
1729 // __kmpc_fork_call expects extra arguments as pointers. If the input
1730 // already has a pointer type, everything is fine. Otherwise, store the
1731 // value onto stack and load it back inside the to-be-outlined region. This
1732 // will ensure only the pointer will be passed to the function.
1733 // FIXME: if there are more than 15 trailing arguments, they must be
1734 // additionally packed in a struct.
1735 Value *Inner = &V;
1736 if (!V.getType()->isPointerTy()) {
1737 IRBuilder<>::InsertPointGuard Guard(Builder);
1738 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1739
1740 Builder.restoreIP(OuterAllocaIP);
1741 Value *Ptr =
1742 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1743
1744 // Store to stack at end of the block that currently branches to the entry
1745 // block of the to-be-outlined region.
1746 Builder.SetInsertPoint(InsertBB,
1747 InsertBB->getTerminator()->getIterator());
1748 Builder.CreateStore(&V, Ptr);
1749
1750 // Load back next to allocations in the to-be-outlined region.
1751 Builder.restoreIP(InnerAllocaIP);
1752 Inner = Builder.CreateLoad(V.getType(), Ptr);
1753 }
1754
1755 Value *ReplacementValue = nullptr;
1756 CallInst *CI = dyn_cast<CallInst>(&V);
1757 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1758 ReplacementValue = PrivTID;
1759 } else {
1760 InsertPointOrErrorTy AfterIP =
1761 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1762 if (!AfterIP)
1763 return AfterIP.takeError();
1764 Builder.restoreIP(*AfterIP);
1765 InnerAllocaIP = {
1766 InnerAllocaIP.getBlock(),
1767 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1768
1769 assert(ReplacementValue &&
1770 "Expected copy/create callback to set replacement value!");
1771 if (ReplacementValue == &V)
1772 return Error::success();
1773 }
1774
1775 for (Use *UPtr : Uses)
1776 UPtr->set(ReplacementValue);
1777
1778 return Error::success();
1779 };
1780
1781 // Reset the inner alloca insertion as it will be used for loading the values
1782 // wrapped into pointers before passing them into the to-be-outlined region.
1783 // Configure it to insert immediately after the fake use of zero address so
1784 // that they are available in the generated body and so that the
1785 // OpenMP-related values (thread ID and zero address pointers) remain leading
1786 // in the argument list.
1787 InnerAllocaIP = IRBuilder<>::InsertPoint(
1788 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1789
1790 // Reset the outer alloca insertion point to the entry of the relevant block
1791 // in case it was invalidated.
1792 OuterAllocaIP = IRBuilder<>::InsertPoint(
1793 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1794
1795 for (Value *Input : Inputs) {
1796 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1797 if (Error Err = PrivHelper(*Input))
1798 return Err;
1799 }
1800 LLVM_DEBUG({
1801 for (Value *Output : Outputs)
1802 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1803 });
1804 assert(Outputs.empty() &&
1805 "OpenMP outlining should not produce live-out values!");
1806
1807 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1808 LLVM_DEBUG({
1809 for (auto *BB : Blocks)
1810 dbgs() << " PBR: " << BB->getName() << "\n";
1811 });
1812
1813 // Adjust the finalization stack, verify the adjustment, and call the
1814 // finalize function a last time to finalize values between the pre-fini
1815 // block and the exit block if we left the parallel "the normal way".
1816 auto FiniInfo = FinalizationStack.pop_back_val();
1817 (void)FiniInfo;
1818 assert(FiniInfo.DK == OMPD_parallel &&
1819 "Unexpected finalization stack state!");
1820
1821 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1822
1823 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1824 if (Error Err = FiniCB(PreFiniIP))
1825 return Err;
1826
1827 // Register the outlined info.
1828 addOutlineInfo(std::move(OI));
1829
1830 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1831 UI->eraseFromParent();
1832
1833 return AfterIP;
1834}
1835
1836void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1837 // Build call void __kmpc_flush(ident_t *loc)
1838 uint32_t SrcLocStrSize;
1839 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1840 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1841
1842 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
1843 Args);
1844}
1845
1846void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1847 if (!updateToLocation(Loc))
1848 return;
1849 emitFlush(Loc);
1850}
1851
1852void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1853 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1854 // global_tid);
1855 uint32_t SrcLocStrSize;
1856 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1857 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1858 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1859
1860 // Ignore return result until untied tasks are supported.
1861 createRuntimeFunctionCall(
1862 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1863}
1864
1865void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1866 if (!updateToLocation(Loc))
1867 return;
1868 emitTaskwaitImpl(Loc);
1869}
1870
1871void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1872 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1873 uint32_t SrcLocStrSize;
1874 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1875 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1876 Constant *I32Null = ConstantInt::getNullValue(Int32);
1877 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1878
1879 createRuntimeFunctionCall(
1880 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1881}
1882
1883void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1884 if (!updateToLocation(Loc))
1885 return;
1886 emitTaskyieldImpl(Loc);
1887}
1888
1889// Processes the dependencies in Dependencies and does the following
1890// - Allocates space on the stack of an array of DependInfo objects
1891// - Populates each DependInfo object with relevant information of
1892// the corresponding dependence.
1893// - All code is inserted in the entry block of the current function.
1895 OpenMPIRBuilder &OMPBuilder,
1897 // Early return if we have no dependencies to process
1898 if (Dependencies.empty())
1899 return nullptr;
1900
1901 // Given a vector of DependData objects, in this function we create an
1902 // array on the stack that holds kmp_dep_info objects corresponding
1903 // to each dependency. This is then passed to the OpenMP runtime.
1904 // For example, if there are 'n' dependencies then the following psedo
1905 // code is generated. Assume the first dependence is on a variable 'a'
1906 //
1907 // \code{c}
1908 // DepArray = alloc(n x sizeof(kmp_depend_info);
1909 // idx = 0;
1910 // DepArray[idx].base_addr = ptrtoint(&a);
1911 // DepArray[idx].len = 8;
1912 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1913 // ++idx;
1914 // DepArray[idx].base_addr = ...;
1915 // \endcode
1916
1917 IRBuilderBase &Builder = OMPBuilder.Builder;
1918 Type *DependInfo = OMPBuilder.DependInfo;
1919 Module &M = OMPBuilder.M;
1920
1921 Value *DepArray = nullptr;
1922 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1923 Builder.SetInsertPoint(
1924 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1925
1926 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1927 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1928
1929 Builder.restoreIP(OldIP);
1930
1931 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1932 Value *Base =
1933 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1934 // Store the pointer to the variable
1935 Value *Addr = Builder.CreateStructGEP(
1936 DependInfo, Base,
1937 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1938 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1939 Builder.CreateStore(DepValPtr, Addr);
1940 // Store the size of the variable
1941 Value *Size = Builder.CreateStructGEP(
1942 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1943 Builder.CreateStore(
1944 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1945 Size);
1946 // Store the dependency kind
1947 Value *Flags = Builder.CreateStructGEP(
1948 DependInfo, Base,
1949 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1950 Builder.CreateStore(
1951 ConstantInt::get(Builder.getInt8Ty(),
1952 static_cast<unsigned int>(Dep.DepKind)),
1953 Flags);
1954 }
1955 return DepArray;
1956}
1957
1958OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1959 const LocationDescription &Loc, InsertPointTy AllocaIP,
1960 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1961 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
1962 Value *Priority) {
1963
1964 if (!updateToLocation(Loc))
1965 return InsertPointTy();
1966
1967 uint32_t SrcLocStrSize;
1968 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1969 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1970 // The current basic block is split into four basic blocks. After outlining,
1971 // they will be mapped as follows:
1972 // ```
1973 // def current_fn() {
1974 // current_basic_block:
1975 // br label %task.exit
1976 // task.exit:
1977 // ; instructions after task
1978 // }
1979 // def outlined_fn() {
1980 // task.alloca:
1981 // br label %task.body
1982 // task.body:
1983 // ret void
1984 // }
1985 // ```
1986 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
1987 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
1988 BasicBlock *TaskAllocaBB =
1989 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
1990
1991 InsertPointTy TaskAllocaIP =
1992 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
1993 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
1994 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
1995 return Err;
1996
1997 OutlineInfo OI;
1998 OI.EntryBB = TaskAllocaBB;
1999 OI.OuterAllocaBB = AllocaIP.getBlock();
2000 OI.ExitBB = TaskExitBB;
2001
2002 // Add the thread ID argument.
2004 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2005 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2006
2007 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2008 Mergeable, Priority, EventHandle, TaskAllocaBB,
2009 ToBeDeleted](Function &OutlinedFn) mutable {
2010 // Replace the Stale CI by appropriate RTL function call.
2011 assert(OutlinedFn.hasOneUse() &&
2012 "there must be a single user for the outlined function");
2013 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2014
2015 // HasShareds is true if any variables are captured in the outlined region,
2016 // false otherwise.
2017 bool HasShareds = StaleCI->arg_size() > 1;
2018 Builder.SetInsertPoint(StaleCI);
2019
2020 // Gather the arguments for emitting the runtime call for
2021 // @__kmpc_omp_task_alloc
2022 Function *TaskAllocFn =
2023 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2024
2025 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2026 // call.
2027 Value *ThreadID = getOrCreateThreadID(Ident);
2028
2029 // Argument - `flags`
2030 // Task is tied iff (Flags & 1) == 1.
2031 // Task is untied iff (Flags & 1) == 0.
2032 // Task is final iff (Flags & 2) == 2.
2033 // Task is not final iff (Flags & 2) == 0.
2034 // Task is mergeable iff (Flags & 4) == 4.
2035 // Task is not mergeable iff (Flags & 4) == 0.
2036 // Task is priority iff (Flags & 32) == 32.
2037 // Task is not priority iff (Flags & 32) == 0.
2038 // TODO: Handle the other flags.
2039 Value *Flags = Builder.getInt32(Tied);
2040 if (Final) {
2041 Value *FinalFlag =
2042 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2043 Flags = Builder.CreateOr(FinalFlag, Flags);
2044 }
2045
2046 if (Mergeable)
2047 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2048 if (Priority)
2049 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2050
2051 // Argument - `sizeof_kmp_task_t` (TaskSize)
2052 // Tasksize refers to the size in bytes of kmp_task_t data structure
2053 // including private vars accessed in task.
2054 // TODO: add kmp_task_t_with_privates (privates)
2055 Value *TaskSize = Builder.getInt64(
2056 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2057
2058 // Argument - `sizeof_shareds` (SharedsSize)
2059 // SharedsSize refers to the shareds array size in the kmp_task_t data
2060 // structure.
2061 Value *SharedsSize = Builder.getInt64(0);
2062 if (HasShareds) {
2063 AllocaInst *ArgStructAlloca =
2065 assert(ArgStructAlloca &&
2066 "Unable to find the alloca instruction corresponding to arguments "
2067 "for extracted function");
2068 StructType *ArgStructType =
2069 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2070 assert(ArgStructType && "Unable to find struct type corresponding to "
2071 "arguments for extracted function");
2072 SharedsSize =
2073 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2074 }
2075 // Emit the @__kmpc_omp_task_alloc runtime call
2076 // The runtime call returns a pointer to an area where the task captured
2077 // variables must be copied before the task is run (TaskData)
2078 CallInst *TaskData = createRuntimeFunctionCall(
2079 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2080 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2081 /*task_func=*/&OutlinedFn});
2082
2083 // Emit detach clause initialization.
2084 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2085 // task_descriptor);
2086 if (EventHandle) {
2087 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2088 OMPRTL___kmpc_task_allow_completion_event);
2089 llvm::Value *EventVal =
2090 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2091 llvm::Value *EventHandleAddr =
2092 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2093 Builder.getPtrTy(0));
2094 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2095 Builder.CreateStore(EventVal, EventHandleAddr);
2096 }
2097 // Copy the arguments for outlined function
2098 if (HasShareds) {
2099 Value *Shareds = StaleCI->getArgOperand(1);
2100 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2101 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2102 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2103 SharedsSize);
2104 }
2105
2106 if (Priority) {
2107 //
2108 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2109 // we populate the priority information into the "kmp_task_t" here
2110 //
2111 // The struct "kmp_task_t" definition is available in kmp.h
2112 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2113 // data2 is used for priority
2114 //
2115 Type *Int32Ty = Builder.getInt32Ty();
2116 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2117 // kmp_task_t* => { ptr }
2118 Type *TaskPtr = StructType::get(VoidPtr);
2119 Value *TaskGEP =
2120 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2121 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2122 Type *TaskStructType = StructType::get(
2123 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2124 Value *PriorityData = Builder.CreateInBoundsGEP(
2125 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2126 // kmp_cmplrdata_t => { ptr, ptr }
2127 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2128 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2129 PriorityData, {Zero, Zero});
2130 Builder.CreateStore(Priority, CmplrData);
2131 }
2132
2133 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2134
2135 // In the presence of the `if` clause, the following IR is generated:
2136 // ...
2137 // %data = call @__kmpc_omp_task_alloc(...)
2138 // br i1 %if_condition, label %then, label %else
2139 // then:
2140 // call @__kmpc_omp_task(...)
2141 // br label %exit
2142 // else:
2143 // ;; Wait for resolution of dependencies, if any, before
2144 // ;; beginning the task
2145 // call @__kmpc_omp_wait_deps(...)
2146 // call @__kmpc_omp_task_begin_if0(...)
2147 // call @outlined_fn(...)
2148 // call @__kmpc_omp_task_complete_if0(...)
2149 // br label %exit
2150 // exit:
2151 // ...
2152 if (IfCondition) {
2153 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2154 // terminator.
2155 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2156 Instruction *IfTerminator =
2157 Builder.GetInsertPoint()->getParent()->getTerminator();
2158 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2159 Builder.SetInsertPoint(IfTerminator);
2160 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2161 &ElseTI);
2162 Builder.SetInsertPoint(ElseTI);
2163
2164 if (Dependencies.size()) {
2165 Function *TaskWaitFn =
2166 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2167 createRuntimeFunctionCall(
2168 TaskWaitFn,
2169 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2170 ConstantInt::get(Builder.getInt32Ty(), 0),
2172 }
2173 Function *TaskBeginFn =
2174 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2175 Function *TaskCompleteFn =
2176 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2177 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2178 CallInst *CI = nullptr;
2179 if (HasShareds)
2180 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2181 else
2182 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2183 CI->setDebugLoc(StaleCI->getDebugLoc());
2184 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2185 Builder.SetInsertPoint(ThenTI);
2186 }
2187
2188 if (Dependencies.size()) {
2189 Function *TaskFn =
2190 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2191 createRuntimeFunctionCall(
2192 TaskFn,
2193 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2194 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2196
2197 } else {
2198 // Emit the @__kmpc_omp_task runtime call to spawn the task
2199 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2200 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2201 }
2202
2203 StaleCI->eraseFromParent();
2204
2205 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2206 if (HasShareds) {
2207 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2208 OutlinedFn.getArg(1)->replaceUsesWithIf(
2209 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2210 }
2211
2212 for (Instruction *I : llvm::reverse(ToBeDeleted))
2213 I->eraseFromParent();
2214 };
2215
2216 addOutlineInfo(std::move(OI));
2217 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2218
2219 return Builder.saveIP();
2220}
2221
2222OpenMPIRBuilder::InsertPointOrErrorTy
2223OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2224 InsertPointTy AllocaIP,
2225 BodyGenCallbackTy BodyGenCB) {
2226 if (!updateToLocation(Loc))
2227 return InsertPointTy();
2228
2229 uint32_t SrcLocStrSize;
2230 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2231 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2232 Value *ThreadID = getOrCreateThreadID(Ident);
2233
2234 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2235 Function *TaskgroupFn =
2236 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2237 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2238
2239 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2240 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2241 return Err;
2242
2243 Builder.SetInsertPoint(TaskgroupExitBB);
2244 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2245 Function *EndTaskgroupFn =
2246 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2247 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2248
2249 return Builder.saveIP();
2250}
2251
2252OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2253 const LocationDescription &Loc, InsertPointTy AllocaIP,
2254 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2255 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2256 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2257
2258 if (!updateToLocation(Loc))
2259 return Loc.IP;
2260
2261 // FiniCBWrapper needs to create a branch to the loop finalization block, but
2262 // this has not been created yet at some times when this callback runs.
2263 SmallVector<BranchInst *> CancellationBranches;
2264 auto FiniCBWrapper = [&](InsertPointTy IP) {
2265 if (IP.getBlock()->end() != IP.getPoint())
2266 return FiniCB(IP);
2267 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2268 // will fail because that function requires the Finalization Basic Block to
2269 // have a terminator, which is already removed by EmitOMPRegionBody.
2270 // IP is currently at cancelation block.
2271 BranchInst *DummyBranch = Builder.CreateBr(IP.getBlock());
2272 IP = InsertPointTy(DummyBranch->getParent(), DummyBranch->getIterator());
2273 CancellationBranches.push_back(DummyBranch);
2274 return FiniCB(IP);
2275 };
2276
2277 FinalizationStack.push_back({FiniCBWrapper, OMPD_sections, IsCancellable});
2278
2279 // Each section is emitted as a switch case
2280 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2281 // -> OMP.createSection() which generates the IR for each section
2282 // Iterate through all sections and emit a switch construct:
2283 // switch (IV) {
2284 // case 0:
2285 // <SectionStmt[0]>;
2286 // break;
2287 // ...
2288 // case <NumSection> - 1:
2289 // <SectionStmt[<NumSection> - 1]>;
2290 // break;
2291 // }
2292 // ...
2293 // section_loop.after:
2294 // <FiniCB>;
2295 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2296 Builder.restoreIP(CodeGenIP);
2298 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2299 Function *CurFn = Continue->getParent();
2300 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2301
2302 unsigned CaseNumber = 0;
2303 for (auto SectionCB : SectionCBs) {
2305 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2306 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2307 Builder.SetInsertPoint(CaseBB);
2308 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2309 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2310 CaseEndBr->getIterator()}))
2311 return Err;
2312 CaseNumber++;
2313 }
2314 // remove the existing terminator from body BB since there can be no
2315 // terminators after switch/case
2316 return Error::success();
2317 };
2318 // Loop body ends here
2319 // LowerBound, UpperBound, and STride for createCanonicalLoop
2320 Type *I32Ty = Type::getInt32Ty(M.getContext());
2321 Value *LB = ConstantInt::get(I32Ty, 0);
2322 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2323 Value *ST = ConstantInt::get(I32Ty, 1);
2324 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2325 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2326 if (!LoopInfo)
2327 return LoopInfo.takeError();
2328
2329 InsertPointOrErrorTy WsloopIP =
2330 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2331 WorksharingLoopType::ForStaticLoop, !IsNowait);
2332 if (!WsloopIP)
2333 return WsloopIP.takeError();
2334 InsertPointTy AfterIP = *WsloopIP;
2335
2336 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2337 assert(LoopFini && "Bad structure of static workshare loop finalization");
2338
2339 // Apply the finalization callback in LoopAfterBB
2340 auto FiniInfo = FinalizationStack.pop_back_val();
2341 assert(FiniInfo.DK == OMPD_sections &&
2342 "Unexpected finalization stack state!");
2343 if (FinalizeCallbackTy &CB = FiniInfo.FiniCB) {
2344 Builder.restoreIP(AfterIP);
2345 BasicBlock *FiniBB =
2346 splitBBWithSuffix(Builder, /*CreateBranch=*/true, "sections.fini");
2347 if (Error Err = CB(Builder.saveIP()))
2348 return Err;
2349 AfterIP = {FiniBB, FiniBB->begin()};
2350 }
2351
2352 // Now we can fix the dummy branch to point to the right place
2353 for (BranchInst *DummyBranch : CancellationBranches) {
2354 assert(DummyBranch->getNumSuccessors() == 1);
2355 DummyBranch->setSuccessor(0, LoopFini);
2356 }
2357
2358 return AfterIP;
2359}
2360
2361OpenMPIRBuilder::InsertPointOrErrorTy
2362OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2363 BodyGenCallbackTy BodyGenCB,
2364 FinalizeCallbackTy FiniCB) {
2365 if (!updateToLocation(Loc))
2366 return Loc.IP;
2367
2368 auto FiniCBWrapper = [&](InsertPointTy IP) {
2369 if (IP.getBlock()->end() != IP.getPoint())
2370 return FiniCB(IP);
2371 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2372 // will fail because that function requires the Finalization Basic Block to
2373 // have a terminator, which is already removed by EmitOMPRegionBody.
2374 // IP is currently at cancelation block.
2375 // We need to backtrack to the condition block to fetch
2376 // the exit block and create a branch from cancelation
2377 // to exit block.
2378 IRBuilder<>::InsertPointGuard IPG(Builder);
2379 Builder.restoreIP(IP);
2380 auto *CaseBB = Loc.IP.getBlock();
2381 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2382 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2383 Instruction *I = Builder.CreateBr(ExitBB);
2384 IP = InsertPointTy(I->getParent(), I->getIterator());
2385 return FiniCB(IP);
2386 };
2387
2388 Directive OMPD = Directive::OMPD_sections;
2389 // Since we are using Finalization Callback here, HasFinalize
2390 // and IsCancellable have to be true
2391 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2392 /*Conditional*/ false, /*hasFinalize*/ true,
2393 /*IsCancellable*/ true);
2394}
2395
2396static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2398 IT++;
2399 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2400}
2401
2402Value *OpenMPIRBuilder::getGPUThreadID() {
2403 return createRuntimeFunctionCall(
2404 getOrCreateRuntimeFunction(M,
2405 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2406 {});
2407}
2408
2409Value *OpenMPIRBuilder::getGPUWarpSize() {
2410 return createRuntimeFunctionCall(
2411 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2412}
2413
2414Value *OpenMPIRBuilder::getNVPTXWarpID() {
2415 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2416 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2417}
2418
2419Value *OpenMPIRBuilder::getNVPTXLaneID() {
2420 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2421 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2422 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2423 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2424 "nvptx_lane_id");
2425}
2426
2427Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2428 Type *ToType) {
2429 Type *FromType = From->getType();
2430 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2431 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2432 assert(FromSize > 0 && "From size must be greater than zero");
2433 assert(ToSize > 0 && "To size must be greater than zero");
2434 if (FromType == ToType)
2435 return From;
2436 if (FromSize == ToSize)
2437 return Builder.CreateBitCast(From, ToType);
2438 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2439 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2440 InsertPointTy SaveIP = Builder.saveIP();
2441 Builder.restoreIP(AllocaIP);
2442 Value *CastItem = Builder.CreateAlloca(ToType);
2443 Builder.restoreIP(SaveIP);
2444
2445 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2446 CastItem, Builder.getPtrTy(0));
2447 Builder.CreateStore(From, ValCastItem);
2448 return Builder.CreateLoad(ToType, CastItem);
2449}
2450
2451Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2452 Value *Element,
2453 Type *ElementType,
2454 Value *Offset) {
2455 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2456 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2457
2458 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2459 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2460 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2461 Value *WarpSize =
2462 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2463 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2464 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2465 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2466 Value *WarpSizeCast =
2467 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2468 Value *ShuffleCall =
2469 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2470 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2471}
2472
2473void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2474 Value *DstAddr, Type *ElemType,
2475 Value *Offset, Type *ReductionArrayTy,
2476 bool IsByRefElem) {
2477 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2478 // Create the loop over the big sized data.
2479 // ptr = (void*)Elem;
2480 // ptrEnd = (void*) Elem + 1;
2481 // Step = 8;
2482 // while (ptr + Step < ptrEnd)
2483 // shuffle((int64_t)*ptr);
2484 // Step = 4;
2485 // while (ptr + Step < ptrEnd)
2486 // shuffle((int32_t)*ptr);
2487 // ...
2488 Type *IndexTy = Builder.getIndexTy(
2489 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2490 Value *ElemPtr = DstAddr;
2491 Value *Ptr = SrcAddr;
2492 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2493 if (Size < IntSize)
2494 continue;
2495 Type *IntType = Builder.getIntNTy(IntSize * 8);
2496 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2497 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2498 Value *SrcAddrGEP =
2499 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2500 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2501 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2502
2503 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2504 if ((Size / IntSize) > 1) {
2505 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2506 SrcAddrGEP, Builder.getPtrTy());
2507 BasicBlock *PreCondBB =
2508 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2509 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2510 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2511 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2512 emitBlock(PreCondBB, CurFunc);
2513 PHINode *PhiSrc =
2514 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2515 PhiSrc->addIncoming(Ptr, CurrentBB);
2516 PHINode *PhiDest =
2517 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2518 PhiDest->addIncoming(ElemPtr, CurrentBB);
2519 Ptr = PhiSrc;
2520 ElemPtr = PhiDest;
2521 Value *PtrDiff = Builder.CreatePtrDiff(
2522 Builder.getInt8Ty(), PtrEnd,
2523 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2524 Builder.CreateCondBr(
2525 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2526 ExitBB);
2527 emitBlock(ThenBB, CurFunc);
2528 Value *Res = createRuntimeShuffleFunction(
2529 AllocaIP,
2530 Builder.CreateAlignedLoad(
2531 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2532 IntType, Offset);
2533 Builder.CreateAlignedStore(Res, ElemPtr,
2534 M.getDataLayout().getPrefTypeAlign(ElemType));
2535 Value *LocalPtr =
2536 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2537 Value *LocalElemPtr =
2538 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2539 PhiSrc->addIncoming(LocalPtr, ThenBB);
2540 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2541 emitBranch(PreCondBB);
2542 emitBlock(ExitBB, CurFunc);
2543 } else {
2544 Value *Res = createRuntimeShuffleFunction(
2545 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2546 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2547 Res->getType()->getScalarSizeInBits())
2548 Res = Builder.CreateTrunc(Res, ElemType);
2549 Builder.CreateStore(Res, ElemPtr);
2550 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2551 ElemPtr =
2552 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2553 }
2554 Size = Size % IntSize;
2555 }
2556}
2557
/// Copies each element of the \p SrcBase "reduce list" (an array of pointers,
/// one per reduction variable) into \p DestBase.
///  - RemoteLaneToThread: allocate fresh private storage for each element and
///    fill it by shuffling the value in from the remote lane given by
///    CopyOptions.RemoteLaneOffset; the dest list entry is then redirected to
///    point at that new storage.
///  - ThreadCopy: plain element-wise copy between two thread-local lists.
/// Returns an Error if a by-ref element's DataPtrPtrGen callback fails.
Error OpenMPIRBuilder::emitReductionListCopy(
    InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
    ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
    ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *SrcElementAddr = nullptr;
    AllocaInst *DestAlloca = nullptr;
    Value *DestElementAddr = nullptr;
    Value *DestElementPtrAddr = nullptr;
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;

    // Step 1.1: Get the address for the src element in the Reduce list.
    Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, SrcBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);

    // Step 1.2: Create a temporary to store the element in the destination
    // Reduce list.
    DestElementPtrAddr = Builder.CreateInBoundsGEP(
        ReductionArrayTy, DestBase,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
    switch (Action) {
    case CopyAction::RemoteLaneToThread: {
      // Allocate the destination element in the entry block (AllocaIP), then
      // return to the current code-gen position.
      InsertPointTy CurIP = Builder.saveIP();
      Builder.restoreIP(AllocaIP);

      Type *DestAllocaType =
          IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
      DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
                                        ".omp.reduction.element");
      DestAlloca->setAlignment(
          M.getDataLayout().getPrefTypeAlign(DestAllocaType));
      DestElementAddr = DestAlloca;
      DestElementAddr =
          Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
                                      DestElementAddr->getName() + ".ascast");
      Builder.restoreIP(CurIP);
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case CopyAction::ThreadCopy: {
      // Destination storage already exists; the dest list entry points at it.
      DestElementAddr =
          Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
      break;
    }
    }

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Type *ShuffleType = RI.ElementType;
      Value *ShuffleSrcAddr = SrcElementAddr;
      Value *ShuffleDestAddr = DestElementAddr;
      AllocaInst *LocalStorage = nullptr;

      if (IsByRefElem) {
        assert(RI.ByRefElementType && "Expected by-ref element type to be set");
        assert(RI.ByRefAllocatedType &&
               "Expected by-ref allocated type to be set");
        // For by-ref reductions, we need to copy from the remote lane the
        // actual value of the partial reduction computed by that remote lane;
        // rather than, for example, a pointer to that data or, even worse, a
        // pointer to the descriptor of the by-ref reduction element.
        ShuffleType = RI.ByRefElementType;

        // NOTE(review): the third argument of DataPtrPtrGen appears to be an
        // in/out parameter that is redirected to the data-pointer slot of the
        // by-ref element — confirm against the callback's declaration.
        InsertPointOrErrorTy GenResult =
            RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);

        if (!GenResult)
          return GenResult.takeError();

        ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);

        {
          // Private scratch storage for the shuffled-in payload, created at
          // the function-entry alloca point.
          InsertPointTy OldIP = Builder.saveIP();
          Builder.restoreIP(AllocaIP);

          LocalStorage = Builder.CreateAlloca(ShuffleType);
          Builder.restoreIP(OldIP);
          ShuffleDestAddr = LocalStorage;
        }
      }

      // Transfer the (possibly indirected) value across lanes, chunk by chunk.
      shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
                      RemoteLaneOffset, ReductionArrayTy, IsByRefElem);

      if (IsByRefElem) {
        // Point the new by-ref element's data pointer at the shuffled payload.
        // GEP is passed uninitialized; DataPtrPtrGen is expected to set it
        // (out-parameter) — NOTE(review): verify against its declaration.
        Value *GEP;
        InsertPointOrErrorTy GenResult =
            RI.DataPtrPtrGen(Builder.saveIP(),
                             Builder.CreatePointerBitCastOrAddrSpaceCast(
                                 DestAlloca, Builder.getPtrTy(), ".ascast"),
                             GEP);

        if (!GenResult)
          return GenResult.takeError();

        Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
                                LocalStorage, Builder.getPtrTy(), ".ascast"),
                            GEP);
      }
    } else {
      // Plain thread-local copy, specialized by how the element is evaluated.
      switch (RI.EvaluationKind) {
      case EvalKind::Scalar: {
        Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
        // Store the source element value to the dest element address.
        Builder.CreateStore(Elem, DestElementAddr);
        break;
      }
      case EvalKind::Complex: {
        // Copy real and imaginary struct fields separately.
        Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 0, ".realp");
        Value *SrcReal = Builder.CreateLoad(
            RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
        Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
        Value *SrcImg = Builder.CreateLoad(
            RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");

        Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 0, ".realp");
        Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
            RI.ElementType, DestElementAddr, 0, 1, ".imagp");
        Builder.CreateStore(SrcReal, DestRealPtr);
        Builder.CreateStore(SrcImg, DestImgPtr);
        break;
      }
      case EvalKind::Aggregate: {
        // Aggregates are copied with a memcpy of the full store size.
        Value *SizeVal = Builder.getInt64(
            M.getDataLayout().getTypeStoreSize(RI.ElementType));
        Builder.CreateMemCpy(
            DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
            SizeVal, false);
        break;
      }
      };
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
          DestElementAddr, Builder.getPtrTy(),
          DestElementAddr->getName() + ".ascast");
      Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
    }
  }

  return Error::success();
}
2726
2727Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2728 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2729 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
2730 InsertPointTy SavedIP = Builder.saveIP();
2731 LLVMContext &Ctx = M.getContext();
2733 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2734 /* IsVarArg */ false);
2735 Function *WcFunc =
2737 "_omp_reduction_inter_warp_copy_func", &M);
2738 WcFunc->setAttributes(FuncAttrs);
2739 WcFunc->addParamAttr(0, Attribute::NoUndef);
2740 WcFunc->addParamAttr(1, Attribute::NoUndef);
2741 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2742 Builder.SetInsertPoint(EntryBB);
2743
2744 // ReduceList: thread local Reduce list.
2745 // At the stage of the computation when this function is called, partially
2746 // aggregated values reside in the first lane of every active warp.
2747 Argument *ReduceListArg = WcFunc->getArg(0);
2748 // NumWarps: number of warps active in the parallel region. This could
2749 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2750 Argument *NumWarpsArg = WcFunc->getArg(1);
2751
2752 // This array is used as a medium to transfer, one reduce element at a time,
2753 // the data from the first lane of every warp to lanes in the first warp
2754 // in order to perform the final step of a reduction in a parallel region
2755 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2756 // for reduced latency, as well as to have a distinct copy for concurrently
2757 // executing target regions. The array is declared with common linkage so
2758 // as to be shared across compilation units.
2759 StringRef TransferMediumName =
2760 "__openmp_nvptx_data_transfer_temporary_storage";
2761 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2762 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2763 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2764 if (!TransferMedium) {
2765 TransferMedium = new GlobalVariable(
2766 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2767 UndefValue::get(ArrayTy), TransferMediumName,
2768 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2769 /*AddressSpace=*/3);
2770 }
2771
2772 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2773 Value *GPUThreadID = getGPUThreadID();
2774 // nvptx_lane_id = nvptx_id % warpsize
2775 Value *LaneID = getNVPTXLaneID();
2776 // nvptx_warp_id = nvptx_id / warpsize
2777 Value *WarpID = getNVPTXWarpID();
2778
2779 InsertPointTy AllocaIP =
2780 InsertPointTy(Builder.GetInsertBlock(),
2781 Builder.GetInsertBlock()->getFirstInsertionPt());
2782 Type *Arg0Type = ReduceListArg->getType();
2783 Type *Arg1Type = NumWarpsArg->getType();
2784 Builder.restoreIP(AllocaIP);
2785 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2786 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2787 AllocaInst *NumWarpsAlloca =
2788 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2789 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2790 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2791 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2792 NumWarpsAlloca, Builder.getPtrTy(0),
2793 NumWarpsAlloca->getName() + ".ascast");
2794 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2795 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2796 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2797 InsertPointTy CodeGenIP =
2798 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2799 Builder.restoreIP(CodeGenIP);
2800
2801 Value *ReduceList =
2802 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2803
2804 for (auto En : enumerate(ReductionInfos)) {
2805 //
2806 // Warp master copies reduce element to transfer medium in __shared__
2807 // memory.
2808 //
2809 const ReductionInfo &RI = En.value();
2810 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
2811 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
2812 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
2813 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2814 Type *CType = Builder.getIntNTy(TySize * 8);
2815
2816 unsigned NumIters = RealTySize / TySize;
2817 if (NumIters == 0)
2818 continue;
2819 Value *Cnt = nullptr;
2820 Value *CntAddr = nullptr;
2821 BasicBlock *PrecondBB = nullptr;
2822 BasicBlock *ExitBB = nullptr;
2823 if (NumIters > 1) {
2824 CodeGenIP = Builder.saveIP();
2825 Builder.restoreIP(AllocaIP);
2826 CntAddr =
2827 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2828
2829 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2830 CntAddr->getName() + ".ascast");
2831 Builder.restoreIP(CodeGenIP);
2832 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2833 CntAddr,
2834 /*Volatile=*/false);
2835 PrecondBB = BasicBlock::Create(Ctx, "precond");
2836 ExitBB = BasicBlock::Create(Ctx, "exit");
2837 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2838 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2839 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2840 /*Volatile=*/false);
2841 Value *Cmp = Builder.CreateICmpULT(
2842 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2843 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2844 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2845 }
2846
2847 // kmpc_barrier.
2848 InsertPointOrErrorTy BarrierIP1 =
2849 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2850 omp::Directive::OMPD_unknown,
2851 /* ForceSimpleCall */ false,
2852 /* CheckCancelFlag */ true);
2853 if (!BarrierIP1)
2854 return BarrierIP1.takeError();
2855 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2856 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2857 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2858
2859 // if (lane_id == 0)
2860 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2861 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2862 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2863
2864 // Reduce element = LocalReduceList[i]
2865 auto *RedListArrayTy =
2866 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2867 Type *IndexTy = Builder.getIndexTy(
2868 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2869 Value *ElemPtrPtr =
2870 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2871 {ConstantInt::get(IndexTy, 0),
2872 ConstantInt::get(IndexTy, En.index())});
2873 // elemptr = ((CopyType*)(elemptrptr)) + I
2874 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2875
2876 if (IsByRefElem) {
2877 InsertPointOrErrorTy GenRes =
2878 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
2879
2880 if (!GenRes)
2881 return GenRes.takeError();
2882
2883 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
2884 }
2885
2886 if (NumIters > 1)
2887 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2888
2889 // Get pointer to location in transfer medium.
2890 // MediumPtr = &medium[warp_id]
2891 Value *MediumPtr = Builder.CreateInBoundsGEP(
2892 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2893 // elem = *elemptr
2894 //*MediumPtr = elem
2895 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2896 // Store the source element value to the dest element address.
2897 Builder.CreateStore(Elem, MediumPtr,
2898 /*IsVolatile*/ true);
2899 Builder.CreateBr(MergeBB);
2900
2901 // else
2902 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2903 Builder.CreateBr(MergeBB);
2904
2905 // endif
2906 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2907 InsertPointOrErrorTy BarrierIP2 =
2908 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2909 omp::Directive::OMPD_unknown,
2910 /* ForceSimpleCall */ false,
2911 /* CheckCancelFlag */ true);
2912 if (!BarrierIP2)
2913 return BarrierIP2.takeError();
2914
2915 // Warp 0 copies reduce element from transfer medium
2916 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2917 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2918 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2919
2920 Value *NumWarpsVal =
2921 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2922 // Up to 32 threads in warp 0 are active.
2923 Value *IsActiveThread =
2924 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2925 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2926
2927 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2928
2929 // SecMediumPtr = &medium[tid]
2930 // SrcMediumVal = *SrcMediumPtr
2931 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2932 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2933 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2934 Value *TargetElemPtrPtr =
2935 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2936 {ConstantInt::get(IndexTy, 0),
2937 ConstantInt::get(IndexTy, En.index())});
2938 Value *TargetElemPtrVal =
2939 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2940 Value *TargetElemPtr = TargetElemPtrVal;
2941
2942 if (IsByRefElem) {
2943 InsertPointOrErrorTy GenRes =
2944 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
2945
2946 if (!GenRes)
2947 return GenRes.takeError();
2948
2949 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
2950 }
2951
2952 if (NumIters > 1)
2953 TargetElemPtr =
2954 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2955
2956 // *TargetElemPtr = SrcMediumVal;
2957 Value *SrcMediumValue =
2958 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2959 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2960 Builder.CreateBr(W0MergeBB);
2961
2962 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2963 Builder.CreateBr(W0MergeBB);
2964
2965 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2966
2967 if (NumIters > 1) {
2968 Cnt = Builder.CreateNSWAdd(
2969 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2970 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2971
2972 auto *CurFn = Builder.GetInsertBlock()->getParent();
2973 emitBranch(PrecondBB);
2974 emitBlock(ExitBB, CurFn);
2975 }
2976 RealTySize %= TySize;
2977 }
2978 }
2979
2980 Builder.CreateRetVoid();
2981 Builder.restoreIP(SavedIP);
2982
2983 return WcFunc;
2984}
2985
2986Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
2987 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2988 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
2989 LLVMContext &Ctx = M.getContext();
2990 FunctionType *FuncTy =
2991 FunctionType::get(Builder.getVoidTy(),
2992 {Builder.getPtrTy(), Builder.getInt16Ty(),
2993 Builder.getInt16Ty(), Builder.getInt16Ty()},
2994 /* IsVarArg */ false);
2995 Function *SarFunc =
2997 "_omp_reduction_shuffle_and_reduce_func", &M);
2998 SarFunc->setAttributes(FuncAttrs);
2999 SarFunc->addParamAttr(0, Attribute::NoUndef);
3000 SarFunc->addParamAttr(1, Attribute::NoUndef);
3001 SarFunc->addParamAttr(2, Attribute::NoUndef);
3002 SarFunc->addParamAttr(3, Attribute::NoUndef);
3003 SarFunc->addParamAttr(1, Attribute::SExt);
3004 SarFunc->addParamAttr(2, Attribute::SExt);
3005 SarFunc->addParamAttr(3, Attribute::SExt);
3006 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3007 Builder.SetInsertPoint(EntryBB);
3008
3009 // Thread local Reduce list used to host the values of data to be reduced.
3010 Argument *ReduceListArg = SarFunc->getArg(0);
3011 // Current lane id; could be logical.
3012 Argument *LaneIDArg = SarFunc->getArg(1);
3013 // Offset of the remote source lane relative to the current lane.
3014 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3015 // Algorithm version. This is expected to be known at compile time.
3016 Argument *AlgoVerArg = SarFunc->getArg(3);
3017
3018 Type *ReduceListArgType = ReduceListArg->getType();
3019 Type *LaneIDArgType = LaneIDArg->getType();
3020 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3021 Value *ReduceListAlloca = Builder.CreateAlloca(
3022 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3023 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3024 LaneIDArg->getName() + ".addr");
3025 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3026 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3027 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3028 AlgoVerArg->getName() + ".addr");
3029 ArrayType *RedListArrayTy =
3030 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3031
3032 // Create a local thread-private variable to host the Reduce list
3033 // from a remote lane.
3034 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3035 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3036
3037 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3038 ReduceListAlloca, ReduceListArgType,
3039 ReduceListAlloca->getName() + ".ascast");
3040 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3041 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3042 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3043 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3044 RemoteLaneOffsetAlloca->getName() + ".ascast");
3045 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3046 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3047 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3048 RemoteReductionListAlloca, Builder.getPtrTy(),
3049 RemoteReductionListAlloca->getName() + ".ascast");
3050
3051 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3052 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3053 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3054 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3055
3056 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3057 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3058 Value *RemoteLaneOffset =
3059 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3060 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3061
3062 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3063
3064 // This loop iterates through the list of reduce elements and copies,
3065 // element by element, from a remote lane in the warp to RemoteReduceList,
3066 // hosted on the thread's stack.
3067 Error EmitRedLsCpRes = emitReductionListCopy(
3068 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3069 ReduceList, RemoteListAddrCast, IsByRef,
3070 {RemoteLaneOffset, nullptr, nullptr});
3071
3072 if (EmitRedLsCpRes)
3073 return EmitRedLsCpRes;
3074
3075 // The actions to be performed on the Remote Reduce list is dependent
3076 // on the algorithm version.
3077 //
3078 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3079 // LaneId % 2 == 0 && Offset > 0):
3080 // do the reduction value aggregation
3081 //
3082 // The thread local variable Reduce list is mutated in place to host the
3083 // reduced data, which is the aggregated value produced from local and
3084 // remote lanes.
3085 //
3086 // Note that AlgoVer is expected to be a constant integer known at compile
3087 // time.
3088 // When AlgoVer==0, the first conjunction evaluates to true, making
3089 // the entire predicate true during compile time.
3090 // When AlgoVer==1, the second conjunction has only the second part to be
3091 // evaluated during runtime. Other conjunctions evaluates to false
3092 // during compile time.
3093 // When AlgoVer==2, the third conjunction has only the second part to be
3094 // evaluated during runtime. Other conjunctions evaluates to false
3095 // during compile time.
3096 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3097 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3098 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3099 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3100 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3101 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3102 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3103 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3104 Value *RemoteOffsetComp =
3105 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3106 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3107 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3108 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3109
3110 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3111 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3112 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3113
3114 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3115 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3116 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3117 ReduceList, Builder.getPtrTy());
3118 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3119 RemoteListAddrCast, Builder.getPtrTy());
3120 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3121 ->addFnAttr(Attribute::NoUnwind);
3122 Builder.CreateBr(MergeBB);
3123
3124 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3125 Builder.CreateBr(MergeBB);
3126
3127 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3128
3129 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3130 // Reduce list.
3131 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3132 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3133 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3134
3135 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3136 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3137 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3138 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3139
3140 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3141
3142 EmitRedLsCpRes = emitReductionListCopy(
3143 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3144 RemoteListAddrCast, ReduceList, IsByRef);
3145
3146 if (EmitRedLsCpRes)
3147 return EmitRedLsCpRes;
3148
3149 Builder.CreateBr(CpyMergeBB);
3150
3151 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3152 Builder.CreateBr(CpyMergeBB);
3153
3154 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3155
3156 Builder.CreateRetVoid();
3157
3158 return SarFunc;
3159}
3160
3161Function *OpenMPIRBuilder::emitListToGlobalCopyFunction(
3162 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3163 AttributeList FuncAttrs) {
3164 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3165 LLVMContext &Ctx = M.getContext();
3167 Builder.getVoidTy(),
3168 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3169 /* IsVarArg */ false);
3170 Function *LtGCFunc =
3172 "_omp_reduction_list_to_global_copy_func", &M);
3173 LtGCFunc->setAttributes(FuncAttrs);
3174 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3175 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3176 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3177
3178 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3179 Builder.SetInsertPoint(EntryBlock);
3180
3181 // Buffer: global reduction buffer.
3182 Argument *BufferArg = LtGCFunc->getArg(0);
3183 // Idx: index of the buffer.
3184 Argument *IdxArg = LtGCFunc->getArg(1);
3185 // ReduceList: thread local Reduce list.
3186 Argument *ReduceListArg = LtGCFunc->getArg(2);
3187
3188 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3189 BufferArg->getName() + ".addr");
3190 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3191 IdxArg->getName() + ".addr");
3192 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3193 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3194 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3195 BufferArgAlloca, Builder.getPtrTy(),
3196 BufferArgAlloca->getName() + ".ascast");
3197 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3198 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3199 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3200 ReduceListArgAlloca, Builder.getPtrTy(),
3201 ReduceListArgAlloca->getName() + ".ascast");
3202
3203 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3204 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3205 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3206
3207 Value *LocalReduceList =
3208 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3209 Value *BufferArgVal =
3210 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3211 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3212 Type *IndexTy = Builder.getIndexTy(
3213 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3214 for (auto En : enumerate(ReductionInfos)) {
3215 const ReductionInfo &RI = En.value();
3216 auto *RedListArrayTy =
3217 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3218 // Reduce element = LocalReduceList[i]
3219 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3220 RedListArrayTy, LocalReduceList,
3221 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3222 // elemptr = ((CopyType*)(elemptrptr)) + I
3223 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3224
3225 // Global = Buffer.VD[Idx];
3226 Value *BufferVD =
3227 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3228 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3229 ReductionsBufferTy, BufferVD, 0, En.index());
3230
3231 switch (RI.EvaluationKind) {
3232 case EvalKind::Scalar: {
3233 Value *TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3234 Builder.CreateStore(TargetElement, GlobVal);
3235 break;
3236 }
3237 case EvalKind::Complex: {
3238 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3239 RI.ElementType, ElemPtr, 0, 0, ".realp");
3240 Value *SrcReal = Builder.CreateLoad(
3241 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3242 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3243 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3244 Value *SrcImg = Builder.CreateLoad(
3245 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3246
3247 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3248 RI.ElementType, GlobVal, 0, 0, ".realp");
3249 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3250 RI.ElementType, GlobVal, 0, 1, ".imagp");
3251 Builder.CreateStore(SrcReal, DestRealPtr);
3252 Builder.CreateStore(SrcImg, DestImgPtr);
3253 break;
3254 }
3255 case EvalKind::Aggregate: {
3256 Value *SizeVal =
3257 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3258 Builder.CreateMemCpy(
3259 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3260 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3261 break;
3262 }
3263 }
3264 }
3265
3266 Builder.CreateRetVoid();
3267 Builder.restoreIP(OldIP);
3268 return LtGCFunc;
3269}
3270
3271Function *OpenMPIRBuilder::emitListToGlobalReduceFunction(
3272 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3273 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3274 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3275 LLVMContext &Ctx = M.getContext();
3277 Builder.getVoidTy(),
3278 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3279 /* IsVarArg */ false);
3280 Function *LtGRFunc =
3282 "_omp_reduction_list_to_global_reduce_func", &M);
3283 LtGRFunc->setAttributes(FuncAttrs);
3284 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3285 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3286 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3287
3288 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3289 Builder.SetInsertPoint(EntryBlock);
3290
3291 // Buffer: global reduction buffer.
3292 Argument *BufferArg = LtGRFunc->getArg(0);
3293 // Idx: index of the buffer.
3294 Argument *IdxArg = LtGRFunc->getArg(1);
3295 // ReduceList: thread local Reduce list.
3296 Argument *ReduceListArg = LtGRFunc->getArg(2);
3297
3298 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3299 BufferArg->getName() + ".addr");
3300 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3301 IdxArg->getName() + ".addr");
3302 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3303 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3304 auto *RedListArrayTy =
3305 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3306
3307 // 1. Build a list of reduction variables.
3308 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3309 Value *LocalReduceList =
3310 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3311
3312 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3313 BufferArgAlloca, Builder.getPtrTy(),
3314 BufferArgAlloca->getName() + ".ascast");
3315 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3316 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3317 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3318 ReduceListArgAlloca, Builder.getPtrTy(),
3319 ReduceListArgAlloca->getName() + ".ascast");
3320 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3321 LocalReduceList, Builder.getPtrTy(),
3322 LocalReduceList->getName() + ".ascast");
3323
3324 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3325 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3326 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3327
3328 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3329 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3330 Type *IndexTy = Builder.getIndexTy(
3331 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3332 for (auto En : enumerate(ReductionInfos)) {
3333 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3334 RedListArrayTy, LocalReduceListAddrCast,
3335 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3336 Value *BufferVD =
3337 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3338 // Global = Buffer.VD[Idx];
3339 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3340 ReductionsBufferTy, BufferVD, 0, En.index());
3341 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3342 }
3343
3344 // Call reduce_function(GlobalReduceList, ReduceList)
3345 Value *ReduceList =
3346 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3347 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3348 ->addFnAttr(Attribute::NoUnwind);
3349 Builder.CreateRetVoid();
3350 Builder.restoreIP(OldIP);
3351 return LtGRFunc;
3352}
3353
3354Function *OpenMPIRBuilder::emitGlobalToListCopyFunction(
3355 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3356 AttributeList FuncAttrs) {
3357 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3358 LLVMContext &Ctx = M.getContext();
3360 Builder.getVoidTy(),
3361 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3362 /* IsVarArg */ false);
3363 Function *LtGCFunc =
3365 "_omp_reduction_global_to_list_copy_func", &M);
3366 LtGCFunc->setAttributes(FuncAttrs);
3367 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3368 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3369 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3370
3371 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3372 Builder.SetInsertPoint(EntryBlock);
3373
3374 // Buffer: global reduction buffer.
3375 Argument *BufferArg = LtGCFunc->getArg(0);
3376 // Idx: index of the buffer.
3377 Argument *IdxArg = LtGCFunc->getArg(1);
3378 // ReduceList: thread local Reduce list.
3379 Argument *ReduceListArg = LtGCFunc->getArg(2);
3380
3381 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3382 BufferArg->getName() + ".addr");
3383 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3384 IdxArg->getName() + ".addr");
3385 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3386 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3387 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3388 BufferArgAlloca, Builder.getPtrTy(),
3389 BufferArgAlloca->getName() + ".ascast");
3390 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3391 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3392 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3393 ReduceListArgAlloca, Builder.getPtrTy(),
3394 ReduceListArgAlloca->getName() + ".ascast");
3395 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3396 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3397 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3398
3399 Value *LocalReduceList =
3400 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3401 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3402 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3403 Type *IndexTy = Builder.getIndexTy(
3404 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3405 for (auto En : enumerate(ReductionInfos)) {
3406 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3407 auto *RedListArrayTy =
3408 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3409 // Reduce element = LocalReduceList[i]
3410 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3411 RedListArrayTy, LocalReduceList,
3412 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3413 // elemptr = ((CopyType*)(elemptrptr)) + I
3414 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3415 // Global = Buffer.VD[Idx];
3416 Value *BufferVD =
3417 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3418 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3419 ReductionsBufferTy, BufferVD, 0, En.index());
3420
3421 switch (RI.EvaluationKind) {
3422 case EvalKind::Scalar: {
3423 Value *TargetElement = Builder.CreateLoad(RI.ElementType, GlobValPtr);
3424 Builder.CreateStore(TargetElement, ElemPtr);
3425 break;
3426 }
3427 case EvalKind::Complex: {
3428 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3429 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3430 Value *SrcReal = Builder.CreateLoad(
3431 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3432 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3433 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3434 Value *SrcImg = Builder.CreateLoad(
3435 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3436
3437 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3438 RI.ElementType, ElemPtr, 0, 0, ".realp");
3439 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3440 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3441 Builder.CreateStore(SrcReal, DestRealPtr);
3442 Builder.CreateStore(SrcImg, DestImgPtr);
3443 break;
3444 }
3445 case EvalKind::Aggregate: {
3446 Value *SizeVal =
3447 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3448 Builder.CreateMemCpy(
3449 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3450 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3451 SizeVal, false);
3452 break;
3453 }
3454 }
3455 }
3456
3457 Builder.CreateRetVoid();
3458 Builder.restoreIP(OldIP);
3459 return LtGCFunc;
3460}
3461
3462Function *OpenMPIRBuilder::emitGlobalToListReduceFunction(
3463 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3464 Type *ReductionsBufferTy, AttributeList FuncAttrs) {
3465 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3466 LLVMContext &Ctx = M.getContext();
3467 auto *FuncTy = FunctionType::get(
3468 Builder.getVoidTy(),
3469 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3470 /* IsVarArg */ false);
3471 Function *LtGRFunc =
3473 "_omp_reduction_global_to_list_reduce_func", &M);
3474 LtGRFunc->setAttributes(FuncAttrs);
3475 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3476 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3477 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3478
3479 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3480 Builder.SetInsertPoint(EntryBlock);
3481
3482 // Buffer: global reduction buffer.
3483 Argument *BufferArg = LtGRFunc->getArg(0);
3484 // Idx: index of the buffer.
3485 Argument *IdxArg = LtGRFunc->getArg(1);
3486 // ReduceList: thread local Reduce list.
3487 Argument *ReduceListArg = LtGRFunc->getArg(2);
3488
3489 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3490 BufferArg->getName() + ".addr");
3491 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3492 IdxArg->getName() + ".addr");
3493 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3494 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3495 ArrayType *RedListArrayTy =
3496 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3497
3498 // 1. Build a list of reduction variables.
3499 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3500 Value *LocalReduceList =
3501 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3502
3503 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3504 BufferArgAlloca, Builder.getPtrTy(),
3505 BufferArgAlloca->getName() + ".ascast");
3506 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3507 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3508 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3509 ReduceListArgAlloca, Builder.getPtrTy(),
3510 ReduceListArgAlloca->getName() + ".ascast");
3511 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3512 LocalReduceList, Builder.getPtrTy(),
3513 LocalReduceList->getName() + ".ascast");
3514
3515 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3516 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3517 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3518
3519 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3520 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3521 Type *IndexTy = Builder.getIndexTy(
3522 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3523 for (auto En : enumerate(ReductionInfos)) {
3524 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3525 RedListArrayTy, ReductionList,
3526 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3527 // Global = Buffer.VD[Idx];
3528 Value *BufferVD =
3529 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3530 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3531 ReductionsBufferTy, BufferVD, 0, En.index());
3532 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3533 }
3534
3535 // Call reduce_function(ReduceList, GlobalReduceList)
3536 Value *ReduceList =
3537 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3538 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
3539 ->addFnAttr(Attribute::NoUnwind);
3540 Builder.CreateRetVoid();
3541 Builder.restoreIP(OldIP);
3542 return LtGRFunc;
3543}
3544
3545std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3546 std::string Suffix =
3547 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3548 return (Name + Suffix).str();
3549}
3550
3551Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3552 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3553 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
3554 AttributeList FuncAttrs) {
3555 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3556 {Builder.getPtrTy(), Builder.getPtrTy()},
3557 /* IsVarArg */ false);
3558 std::string Name = getReductionFuncName(ReducerName);
3559 Function *ReductionFunc =
3561 ReductionFunc->setAttributes(FuncAttrs);
3562 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3563 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3564 BasicBlock *EntryBB =
3565 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3566 Builder.SetInsertPoint(EntryBB);
3567
3568 // Need to alloca memory here and deal with the pointers before getting
3569 // LHS/RHS pointers out
3570 Value *LHSArrayPtr = nullptr;
3571 Value *RHSArrayPtr = nullptr;
3572 Argument *Arg0 = ReductionFunc->getArg(0);
3573 Argument *Arg1 = ReductionFunc->getArg(1);
3574 Type *Arg0Type = Arg0->getType();
3575 Type *Arg1Type = Arg1->getType();
3576
3577 Value *LHSAlloca =
3578 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3579 Value *RHSAlloca =
3580 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3581 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3582 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3583 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3584 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3585 Builder.CreateStore(Arg0, LHSAddrCast);
3586 Builder.CreateStore(Arg1, RHSAddrCast);
3587 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3588 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3589
3590 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3591 Type *IndexTy = Builder.getIndexTy(
3592 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3593 SmallVector<Value *> LHSPtrs, RHSPtrs;
3594 for (auto En : enumerate(ReductionInfos)) {
3595 const ReductionInfo &RI = En.value();
3596 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3597 RedArrayTy, RHSArrayPtr,
3598 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3599 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3600 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3601 RHSI8Ptr, RI.PrivateVariable->getType(),
3602 RHSI8Ptr->getName() + ".ascast");
3603
3604 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3605 RedArrayTy, LHSArrayPtr,
3606 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3607 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3608 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3609 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3610
3611 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3612 LHSPtrs.emplace_back(LHSPtr);
3613 RHSPtrs.emplace_back(RHSPtr);
3614 } else {
3615 Value *LHS = LHSPtr;
3616 Value *RHS = RHSPtr;
3617
3618 if (!IsByRef.empty() && !IsByRef[En.index()]) {
3619 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3620 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3621 }
3622
3623 Value *Reduced;
3624 InsertPointOrErrorTy AfterIP =
3625 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3626 if (!AfterIP)
3627 return AfterIP.takeError();
3628 if (!Builder.GetInsertBlock())
3629 return ReductionFunc;
3630
3631 Builder.restoreIP(*AfterIP);
3632
3633 if (!IsByRef.empty() && !IsByRef[En.index()])
3634 Builder.CreateStore(Reduced, LHSPtr);
3635 }
3636 }
3637
3638 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3639 for (auto En : enumerate(ReductionInfos)) {
3640 unsigned Index = En.index();
3641 const ReductionInfo &RI = En.value();
3642 Value *LHSFixupPtr, *RHSFixupPtr;
3643 Builder.restoreIP(RI.ReductionGenClang(
3644 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3645
3646 // Fix the CallBack code genereated to use the correct Values for the LHS
3647 // and RHS
3648 LHSFixupPtr->replaceUsesWithIf(
3649 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3650 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3651 ReductionFunc;
3652 });
3653 RHSFixupPtr->replaceUsesWithIf(
3654 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3655 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3656 ReductionFunc;
3657 });
3658 }
3659
3660 Builder.CreateRetVoid();
3661 return ReductionFunc;
3662}
3663
3664static void
3666 bool IsGPU) {
3667 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3668 (void)RI;
3669 assert(RI.Variable && "expected non-null variable");
3670 assert(RI.PrivateVariable && "expected non-null private variable");
3671 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3672 "expected non-null reduction generator callback");
3673 if (!IsGPU) {
3674 assert(
3675 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3676 "expected variables and their private equivalents to have the same "
3677 "type");
3678 }
3679 assert(RI.Variable->getType()->isPointerTy() &&
3680 "expected variables to be pointers");
3681 }
3682}
3683
/// Lower OpenMP reductions for GPU targets. Collects the private reduction
/// values into a type-erased list, emits the shuffle-and-reduce / inter-warp
/// copy helpers (plus the four list<->global-buffer helpers for teams
/// reductions), calls the matching __kmpc_nvptx_*_reduce_nowait_v2 runtime
/// entry, and finally emits the "res == 1" then-branch that folds the winning
/// values back into the original variables.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
    ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
    unsigned ReductionBufNum, Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();
  Builder.restoreIP(CodeGenIP);
  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
  LLVMContext &Ctx = M.getContext();

  // Source location for the ident struct
  if (!SrcLocInfo) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }

  // Nothing to reduce: keep the insertion point unchanged.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  BasicBlock *ContinuationBlock = nullptr;
  if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
    // Copied code from createReductions
    BasicBlock *InsertBlock = Loc.IP.getBlock();
    ContinuationBlock =
        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
    InsertBlock->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
  }

  // Propagate the current function's attributes to the emitted helpers, but
  // drop OptimizeNone so the helpers themselves can be optimized.
  Function *CurFunc = Builder.GetInsertBlock()->getParent();
  AttributeList FuncAttrs;
  AttrBuilder AttrBldr(Ctx);
  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
    AttrBldr.addAttribute(Attr);
  AttrBldr.removeAttribute(Attribute::OptimizeNone);
  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);

  CodeGenIP = Builder.saveIP();
  Expected<Function *> ReductionResult = createReductionFunction(
      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
      ReductionGenCBKind, FuncAttrs);
  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;
  Builder.restoreIP(CodeGenIP);

  // Set the grid value in the config needed for lowering later on
  if (GridValue.has_value())
    Config.setGridValue(GridValue.value());
  else
    Config.setGridValue(getGridValue(T, ReductionFunc));

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  Value *Res;

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = ReductionInfos.size();
  Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
  Type *FuncPtrTy =
      Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  // The list itself must live in the alloca block, not at the codegen point.
  CodeGenIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast")
  Builder.restoreIP(CodeGenIP);
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});

    Value *PrivateVar = RI.PrivateVariable;
    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
    if (IsByRefElem)
      PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);

    Value *CastElem =
        Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
    Builder.CreateStore(CastElem, ElemPtr);
  }
  CodeGenIP = Builder.saveIP();
  Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
      ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);

  if (!SarFunc)
    return SarFunc.takeError();

  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
  if (!CopyResult)
    return CopyResult.takeError();
  Function *WcFunc = *CopyResult;
  Builder.restoreIP(CodeGenIP);

  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  // The runtime sizes the transfer by the largest element times the count.
  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    ReductionTypeArgs.emplace_back(En.value().ElementType);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    // Parallel reduction: shuffle/copy helpers only.
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
    Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(Pv2Ptr, Args);
  } else {
    // Teams reduction: additionally needs the four helpers that move data
    // between the thread-local list and the fixed global buffer.
    CodeGenIP = Builder.saveIP();
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBuferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
    Function *LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Function *GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs);
    Function *GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs);
    Builder.restoreIP(CodeGenIP);

    Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
        RedFixedBuferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,
                      Builder.getInt32(ReductionBufNum),
                      ReductionDataSize,
                      RL,
                      *SarFunc,
                      WcFunc,
                      LtGCFunc,
                      LtGRFunc,
                      GtLCFunc,
                      GtLRFunc};

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
  }

  // 5. Build if (res == 1)
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  emitBlock(ThenBB, CurFunc);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Type *ValueType = RI.ElementType;
    Value *RedValue = RI.Variable;
    Value *RHS =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

    if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      // Fix the CallBack code genereated to use the correct Values for the LHS
      // and RHS
      LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
    } else {
      if (IsByRef.empty() || !IsByRef[En.index()]) {
        RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                      "red.value." + Twine(En.index()));
      }
      Value *PrivateRedValue = Builder.CreateLoad(
          ValueType, RHS, "red.private.value" + Twine(En.index()));
      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);

      // By-value: write the combined result back; by-ref: the generator
      // already stored through the pointer.
      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, RI.Variable);
    }
  }
  emitBlock(ExitBB, CurFunc);
  if (ContinuationBlock) {
    Builder.CreateBr(ContinuationBlock);
    Builder.SetInsertPoint(ContinuationBlock);
  }
  Config.setEmitLLVMUsed();

  return Builder.saveIP();
}
3910
3912 Type *VoidTy = Type::getVoidTy(M.getContext());
3913 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
3914 auto *FuncTy =
3915 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
3917 ".omp.reduction.func", &M);
3918}
3919
3921 Function *ReductionFunc,
3923 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
3924 Module *Module = ReductionFunc->getParent();
3925 BasicBlock *ReductionFuncBlock =
3926 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
3927 Builder.SetInsertPoint(ReductionFuncBlock);
3928 Value *LHSArrayPtr = nullptr;
3929 Value *RHSArrayPtr = nullptr;
3930 if (IsGPU) {
3931 // Need to alloca memory here and deal with the pointers before getting
3932 // LHS/RHS pointers out
3933 //
3934 Argument *Arg0 = ReductionFunc->getArg(0);
3935 Argument *Arg1 = ReductionFunc->getArg(1);
3936 Type *Arg0Type = Arg0->getType();
3937 Type *Arg1Type = Arg1->getType();
3938
3939 Value *LHSAlloca =
3940 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3941 Value *RHSAlloca =
3942 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3943 Value *LHSAddrCast =
3944 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
3945 Value *RHSAddrCast =
3946 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
3947 Builder.CreateStore(Arg0, LHSAddrCast);
3948 Builder.CreateStore(Arg1, RHSAddrCast);
3949 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3950 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3951 } else {
3952 LHSArrayPtr = ReductionFunc->getArg(0);
3953 RHSArrayPtr = ReductionFunc->getArg(1);
3954 }
3955
3956 unsigned NumReductions = ReductionInfos.size();
3957 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
3958
3959 for (auto En : enumerate(ReductionInfos)) {
3960 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3961 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3962 RedArrayTy, LHSArrayPtr, 0, En.index());
3963 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3964 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3965 LHSI8Ptr, RI.Variable->getType());
3966 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3967 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
3968 RedArrayTy, RHSArrayPtr, 0, En.index());
3969 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3970 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3971 RHSI8Ptr, RI.PrivateVariable->getType());
3972 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3973 Value *Reduced;
3974 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
3975 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3976 if (!AfterIP)
3977 return AfterIP.takeError();
3978
3979 Builder.restoreIP(*AfterIP);
3980 // TODO: Consider flagging an error.
3981 if (!Builder.GetInsertBlock())
3982 return Error::success();
3983
3984 // store is inside of the reduction region when using by-ref
3985 if (!IsByRef[En.index()])
3986 Builder.CreateStore(Reduced, LHSPtr);
3987 }
3988 Builder.CreateRetVoid();
3989 return Error::success();
3990}
3991
3992OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
3993 const LocationDescription &Loc, InsertPointTy AllocaIP,
3994 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
3995 bool IsNoWait, bool IsTeamsReduction) {
3996 assert(ReductionInfos.size() == IsByRef.size());
3997 if (Config.isGPU())
3998 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
3999 IsByRef, IsNoWait, IsTeamsReduction);
4000
4001 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4002
4003 if (!updateToLocation(Loc))
4004 return InsertPointTy();
4005
4006 if (ReductionInfos.size() == 0)
4007 return Builder.saveIP();
4008
4009 BasicBlock *InsertBlock = Loc.IP.getBlock();
4010 BasicBlock *ContinuationBlock =
4011 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4012 InsertBlock->getTerminator()->eraseFromParent();
4013
4014 // Create and populate array of type-erased pointers to private reduction
4015 // values.
4016 unsigned NumReductions = ReductionInfos.size();
4017 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4018 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4019 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4020
4021 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4022
4023 for (auto En : enumerate(ReductionInfos)) {
4024 unsigned Index = En.index();
4025 const ReductionInfo &RI = En.value();
4026 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4027 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4028 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4029 }
4030
4031 // Emit a call to the runtime function that orchestrates the reduction.
4032 // Declare the reduction function in the process.
4033 Type *IndexTy = Builder.getIndexTy(
4034 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4035 Function *Func = Builder.GetInsertBlock()->getParent();
4036 Module *Module = Func->getParent();
4037 uint32_t SrcLocStrSize;
4038 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4039 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4040 return RI.AtomicReductionGen;
4041 });
4042 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4043 CanGenerateAtomic
4044 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4045 : IdentFlag(0));
4046 Value *ThreadId = getOrCreateThreadID(Ident);
4047 Constant *NumVariables = Builder.getInt32(NumReductions);
4048 const DataLayout &DL = Module->getDataLayout();
4049 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4050 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4051 Function *ReductionFunc = getFreshReductionFunc(*Module);
4052 Value *Lock = getOMPCriticalRegionLock(".reduction");
4053 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4054 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4055 : RuntimeFunction::OMPRTL___kmpc_reduce);
4056 CallInst *ReduceCall =
4057 createRuntimeFunctionCall(ReduceFunc,
4058 {Ident, ThreadId, NumVariables, RedArraySize,
4059 RedArray, ReductionFunc, Lock},
4060 "reduce");
4061
4062 // Create final reduction entry blocks for the atomic and non-atomic case.
4063 // Emit IR that dispatches control flow to one of the blocks based on the
4064 // reduction supporting the atomic mode.
4065 BasicBlock *NonAtomicRedBlock =
4066 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4067 BasicBlock *AtomicRedBlock =
4068 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4070 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4071 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4072 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4073
4074 // Populate the non-atomic reduction using the elementwise reduction function.
4075 // This loads the elements from the global and private variables and reduces
4076 // them before storing back the result to the global variable.
4077 Builder.SetInsertPoint(NonAtomicRedBlock);
4078 for (auto En : enumerate(ReductionInfos)) {
4079 const ReductionInfo &RI = En.value();
4080 Type *ValueType = RI.ElementType;
4081 // We have one less load for by-ref case because that load is now inside of
4082 // the reduction region
4083 Value *RedValue = RI.Variable;
4084 if (!IsByRef[En.index()]) {
4085 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4086 "red.value." + Twine(En.index()));
4087 }
4088 Value *PrivateRedValue =
4089 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4090 "red.private.value." + Twine(En.index()));
4091 Value *Reduced;
4092 InsertPointOrErrorTy AfterIP =
4093 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4094 if (!AfterIP)
4095 return AfterIP.takeError();
4096 Builder.restoreIP(*AfterIP);
4097
4098 if (!Builder.GetInsertBlock())
4099 return InsertPointTy();
4100 // for by-ref case, the load is inside of the reduction region
4101 if (!IsByRef[En.index()])
4102 Builder.CreateStore(Reduced, RI.Variable);
4103 }
4104 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4105 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4106 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4107 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4108 Builder.CreateBr(ContinuationBlock);
4109
4110 // Populate the atomic reduction using the atomic elementwise reduction
4111 // function. There are no loads/stores here because they will be happening
4112 // inside the atomic elementwise reduction.
4113 Builder.SetInsertPoint(AtomicRedBlock);
4114 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4115 for (const ReductionInfo &RI : ReductionInfos) {
4116 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4117 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4118 if (!AfterIP)
4119 return AfterIP.takeError();
4120 Builder.restoreIP(*AfterIP);
4121 if (!Builder.GetInsertBlock())
4122 return InsertPointTy();
4123 }
4124 Builder.CreateBr(ContinuationBlock);
4125 } else {
4126 Builder.CreateUnreachable();
4127 }
4128
4129 // Populate the outlined reduction function using the elementwise reduction
4130 // function. Partial values are extracted from the type-erased array of
4131 // pointers to private variables.
4132 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4133 IsByRef, /*isGPU=*/false);
4134 if (Err)
4135 return Err;
4136
4137 if (!Builder.GetInsertBlock())
4138 return InsertPointTy();
4139
4140 Builder.SetInsertPoint(ContinuationBlock);
4141 return Builder.saveIP();
4142}
4143
4144OpenMPIRBuilder::InsertPointOrErrorTy
4145OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4146 BodyGenCallbackTy BodyGenCB,
4147 FinalizeCallbackTy FiniCB) {
4148 if (!updateToLocation(Loc))
4149 return Loc.IP;
4150
4151 Directive OMPD = Directive::OMPD_master;
4152 uint32_t SrcLocStrSize;
4153 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4154 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4155 Value *ThreadId = getOrCreateThreadID(Ident);
4156 Value *Args[] = {Ident, ThreadId};
4157
4158 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4159 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4160
4161 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4162 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4163
4164 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4165 /*Conditional*/ true, /*hasFinalize*/ true);
4166}
4167
4168OpenMPIRBuilder::InsertPointOrErrorTy
4169OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4170 BodyGenCallbackTy BodyGenCB,
4171 FinalizeCallbackTy FiniCB, Value *Filter) {
4172 if (!updateToLocation(Loc))
4173 return Loc.IP;
4174
4175 Directive OMPD = Directive::OMPD_masked;
4176 uint32_t SrcLocStrSize;
4177 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4178 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4179 Value *ThreadId = getOrCreateThreadID(Ident);
4180 Value *Args[] = {Ident, ThreadId, Filter};
4181 Value *ArgsEnd[] = {Ident, ThreadId};
4182
4183 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4184 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4185
4186 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4187 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4188
4189 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4190 /*Conditional*/ true, /*hasFinalize*/ true);
4191}
4192
4194 llvm::FunctionCallee Callee,
4196 const llvm::Twine &Name) {
4197 llvm::CallInst *Call = Builder.CreateCall(
4198 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4199 Call->setDoesNotThrow();
4200 return Call;
4201}
4202
// Expects the input basic block to be dominated by BeforeScanBB.
// Once the scan directive is encountered, the code after it should be
// dominated by AfterScanBB. The scan directive splits the code sequence into
// an input phase and a scan phase. Based on whether an inclusive or exclusive
// clause is used in the scan directive, and whether the input loop or the
// scan loop is currently being lowered, it adds jumps to the input and scan
// phases. The first scan loop is the input loop, the second is the scan loop.
// The generated code currently handles only inclusive scans.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
    bool IsInclusive, ScanInfo *ScanRedInfo) {
  // On the first (input) loop, also emit the buffer allocations; this only
  // needs to happen once for the whole directive.
  if (ScanRedInfo->OMPFirstScanLoop) {
    llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
                                                    ScanVarsType, ScanRedInfo);
    if (Err)
      return Err;
  }
  if (!updateToLocation(Loc))
    return Loc.IP;

  llvm::Value *IV = ScanRedInfo->IV;

  if (ScanRedInfo->OMPFirstScanLoop) {
    // Emit buffer[i] = red; at the end of the input phase.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);

      Builder.CreateStore(Src, Val);
    }
  }
  Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
  emitBlock(ScanRedInfo->OMPScanDispatch,
            Builder.GetInsertBlock()->getParent());

  if (!ScanRedInfo->OMPFirstScanLoop) {
    IV = ScanRedInfo->IV;
    // Emit red = buffer[i]; at the entrance to the scan phase.
    // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *SrcPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
      Builder.CreateStore(Src, ScanVars[i]);
    }
  }

  // A conditional branch with a constant-true condition is used instead of an
  // unconditional branch so both phase blocks stay wired into the CFG; the
  // untaken edge produces dead blocks that should eventually be cleaned up.
  // TODO: Update it to CreateBr and remove dead blocks
  llvm::Value *CmpI = Builder.getInt1(true);
  if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
                         ScanRedInfo->OMPAfterScanBlock);
  } else {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
                         ScanRedInfo->OMPBeforeScanBlock);
  }
  emitBlock(ScanRedInfo->OMPAfterScanBlock,
            Builder.GetInsertBlock()->getParent());
  Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
  return Builder.saveIP();
}
4271
// Emits the allocations needed by a scan-based directive: a pointer slot per
// scan variable (at AllocaIP), plus — executed by the primary thread only —
// a heap buffer of Span+1 elements per variable that carries partial
// reduction values between the input and scan phases. A barrier makes the
// buffers visible to all threads before they are used.
Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
    InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
    ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {

  Builder.restoreIP(AllocaIP);
  // Create the shared pointer at alloca IP.
  for (size_t i = 0; i < ScanVars.size(); i++) {
    llvm::Value *BuffPtr =
        Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
    (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
  }

  // Allocate temporary buffer by master thread.
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    // One extra slot (Span + 1) so the final scan value has a home.
    Value *AllocSpan =
        Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Type *IntPtrTy = Builder.getInt32Ty();
      Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
      Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
      Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
                                         AllocSpan, nullptr, "arr");
      Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Run the allocation on the primary thread only (masked, filter == 0).
  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
  llvm::Value *FilterVal = Builder.getInt32(0);
  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  // If the block is already terminated, emit the barrier before the
  // terminator rather than after it.
  if (InputBB->getTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);

  return Error::success();
}
4322
4323Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4324 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4325 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4326 InsertPointTy CodeGenIP) -> Error {
4327 Builder.restoreIP(CodeGenIP);
4328 for (ReductionInfo RedInfo : ReductionInfos) {
4329 Value *PrivateVar = RedInfo.PrivateVariable;
4330 Value *OrigVar = RedInfo.Variable;
4331 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4332 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4333
4334 Type *SrcTy = RedInfo.ElementType;
4335 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4336 "arrayOffset");
4337 Value *Src = Builder.CreateLoad(SrcTy, Val);
4338
4339 Builder.CreateStore(Src, OrigVar);
4340 Builder.CreateFree(Buff);
4341 }
4342 return Error::success();
4343 };
4344 // TODO: Perform finalization actions for variables. This has to be
4345 // called for variables which have destructors/finalizers.
4346 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4347
4348 if (ScanRedInfo->OMPScanFinish->getTerminator())
4349 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4350 else
4351 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4352
4353 llvm::Value *FilterVal = Builder.getInt32(0);
4354 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4355 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4356
4357 if (!AfterIP)
4358 return AfterIP.takeError();
4359 Builder.restoreIP(*AfterIP);
4360 BasicBlock *InputBB = Builder.GetInsertBlock();
4361 if (InputBB->getTerminator())
4362 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4363 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4364 if (!AfterIP)
4365 return AfterIP.takeError();
4366 Builder.restoreIP(*AfterIP);
4367 return Error::success();
4368}
4369
4370OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4371 const LocationDescription &Loc,
4373 ScanInfo *ScanRedInfo) {
4374
4375 if (!updateToLocation(Loc))
4376 return Loc.IP;
4377 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4378 InsertPointTy CodeGenIP) -> Error {
4379 Builder.restoreIP(CodeGenIP);
4380 Function *CurFn = Builder.GetInsertBlock()->getParent();
4381 // for (int k = 0; k <= ceil(log2(n)); ++k)
4382 llvm::BasicBlock *LoopBB =
4383 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4384 llvm::BasicBlock *ExitBB =
4385 splitBB(Builder, false, "omp.outer.log.scan.exit");
4387 Builder.GetInsertBlock()->getModule(),
4388 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4389 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4390 llvm::Value *Arg =
4391 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4392 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4394 Builder.GetInsertBlock()->getModule(),
4395 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4396 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4397 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4398 llvm::Value *NMin1 = Builder.CreateNUWSub(
4399 ScanRedInfo->Span,
4400 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4401 Builder.SetInsertPoint(InputBB);
4402 Builder.CreateBr(LoopBB);
4403 emitBlock(LoopBB, CurFn);
4404 Builder.SetInsertPoint(LoopBB);
4405
4406 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4407 // size pow2k = 1;
4408 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4409 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4410 InputBB);
4411 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4412 InputBB);
4413 // for (size i = n - 1; i >= 2 ^ k; --i)
4414 // tmp[i] op= tmp[i-pow2k];
4415 llvm::BasicBlock *InnerLoopBB =
4416 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4417 llvm::BasicBlock *InnerExitBB =
4418 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4419 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4420 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4421 emitBlock(InnerLoopBB, CurFn);
4422 Builder.SetInsertPoint(InnerLoopBB);
4423 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4424 IVal->addIncoming(NMin1, LoopBB);
4425 for (ReductionInfo RedInfo : ReductionInfos) {
4426 Value *ReductionVal = RedInfo.PrivateVariable;
4427 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4428 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4429 Type *DestTy = RedInfo.ElementType;
4430 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4431 Value *LHSPtr =
4432 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4433 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4434 Value *RHSPtr =
4435 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4436 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4437 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4439 InsertPointOrErrorTy AfterIP =
4440 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4441 if (!AfterIP)
4442 return AfterIP.takeError();
4443 Builder.CreateStore(Result, LHSPtr);
4444 }
4445 llvm::Value *NextIVal = Builder.CreateNUWSub(
4446 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4447 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4448 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4449 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4450 emitBlock(InnerExitBB, CurFn);
4451 llvm::Value *Next = Builder.CreateNUWAdd(
4452 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4453 Counter->addIncoming(Next, Builder.GetInsertBlock());
4454 // pow2k <<= 1;
4455 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4456 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4457 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4458 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4459 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4460 return Error::success();
4461 };
4462
4463 // TODO: Perform finalization actions for variables. This has to be
4464 // called for variables which have destructors/finalizers.
4465 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4466
4467 llvm::Value *FilterVal = Builder.getInt32(0);
4468 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4469 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4470
4471 if (!AfterIP)
4472 return AfterIP.takeError();
4473 Builder.restoreIP(*AfterIP);
4474 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4475
4476 if (!AfterIP)
4477 return AfterIP.takeError();
4478 Builder.restoreIP(*AfterIP);
4479 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4480 if (Err)
4481 return Err;
4482
4483 return AfterIP;
4484}
4485
4486Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4487 llvm::function_ref<Error()> InputLoopGen,
4488 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4489 ScanInfo *ScanRedInfo) {
4490
4491 {
4492 // Emit loop with input phase:
4493 // for (i: 0..<num_iters>) {
4494 // <input phase>;
4495 // buffer[i] = red;
4496 // }
4497 ScanRedInfo->OMPFirstScanLoop = true;
4498 Error Err = InputLoopGen();
4499 if (Err)
4500 return Err;
4501 }
4502 {
4503 // Emit loop with scan phase:
4504 // for (i: 0..<num_iters>) {
4505 // red = buffer[i];
4506 // <scan phase>;
4507 // }
4508 ScanRedInfo->OMPFirstScanLoop = false;
4509 Error Err = ScanLoopGen(Builder.saveIP());
4510 if (Err)
4511 return Err;
4512 }
4513 return Error::success();
4514}
4515
4516void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4517 Function *Fun = Builder.GetInsertBlock()->getParent();
4518 ScanRedInfo->OMPScanDispatch =
4519 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4520 ScanRedInfo->OMPAfterScanBlock =
4521 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4522 ScanRedInfo->OMPBeforeScanBlock =
4523 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4524 ScanRedInfo->OMPScanLoopExit =
4525 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4526}
// Builds the CFG skeleton of a canonical loop with the given trip count:
//   preheader -> header -> cond -> body -> inc (latch) -> header
//                            \-> exit -> after
// The body block is left empty (a single branch to the latch) for the caller
// to fill in. The preheader..body blocks are inserted before PreInsertBefore
// and the inc..after blocks before PostInsertBefore, so a caller can nest
// skeletons. Returns a CanonicalLoopInfo owned by this builder.
CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  // The induction variable counts 0..TripCount-1 in the trip count's type.
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure.
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);

  // Use specified DebugLoc for new instructions.
  Builder.SetCurrentDebugLocation(DL);

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  // Header carries the induction-variable PHI, starting at 0.
  Builder.SetInsertPoint(Header);
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
  Builder.CreateBr(Cond);

  // Canonical loops always compare unsigned: iv < tripcount.
  Builder.SetInsertPoint(Cond);
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  // The increment cannot wrap (iv < tripcount holds), hence NUW.
  Builder.SetInsertPoint(Latch);
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);
  Builder.CreateBr(After);

  // Remember and return the canonical control flow. Only Header/Cond/Latch/
  // Exit are stored; the remaining blocks are reachable from these.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
4592
4594OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4595 LoopBodyGenCallbackTy BodyGenCB,
4596 Value *TripCount, const Twine &Name) {
4597 BasicBlock *BB = Loc.IP.getBlock();
4598 BasicBlock *NextBB = BB->getNextNode();
4599
4600 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4601 NextBB, NextBB, Name);
4602 BasicBlock *After = CL->getAfter();
4603
4604 // If location is not set, don't connect the loop.
4605 if (updateToLocation(Loc)) {
4606 // Split the loop at the insertion point: Branch to the preheader and move
4607 // every following instruction to after the loop (the After BB). Also, the
4608 // new successor is the loop's after block.
4609 spliceBB(Builder, After, /*CreateBranch=*/false);
4610 Builder.CreateBr(CL->getPreheader());
4611 }
4612
4613 // Emit the body content. We do it after connecting the loop to the CFG to
4614 // avoid that the callback encounters degenerate BBs.
4615 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4616 return Err;
4617
4618#ifndef NDEBUG
4619 CL->assertOK();
4620#endif
4621 return CL;
4622}
4623
4624Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4625 ScanInfos.emplace_front();
4626 ScanInfo *Result = &ScanInfos.front();
4627 return Result;
4628}
4629
4631OpenMPIRBuilder::createCanonicalScanLoops(
4632 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4633 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4634 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4635 LocationDescription ComputeLoc =
4636 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4637 updateToLocation(ComputeLoc);
4638
4640
4641 Value *TripCount = calculateCanonicalLoopTripCount(
4642 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4643 ScanRedInfo->Span = TripCount;
4644 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4645 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4646
4647 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4648 Builder.restoreIP(CodeGenIP);
4649 ScanRedInfo->IV = IV;
4650 createScanBBs(ScanRedInfo);
4651 BasicBlock *InputBlock = Builder.GetInsertBlock();
4652 Instruction *Terminator = InputBlock->getTerminator();
4653 assert(Terminator->getNumSuccessors() == 1);
4654 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4655 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4656 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4657 Builder.GetInsertBlock()->getParent());
4658 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4659 emitBlock(ScanRedInfo->OMPScanLoopExit,
4660 Builder.GetInsertBlock()->getParent());
4661 Builder.CreateBr(ContinueBlock);
4662 Builder.SetInsertPoint(
4663 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4664 return BodyGenCB(Builder.saveIP(), IV);
4665 };
4666
4667 const auto &&InputLoopGen = [&]() -> Error {
4668 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4669 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4670 ComputeIP, Name, true, ScanRedInfo);
4671 if (!LoopInfo)
4672 return LoopInfo.takeError();
4673 Result.push_back(*LoopInfo);
4674 Builder.restoreIP((*LoopInfo)->getAfterIP());
4675 return Error::success();
4676 };
4677 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4679 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4680 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4681 if (!LoopInfo)
4682 return LoopInfo.takeError();
4683 Result.push_back(*LoopInfo);
4684 Builder.restoreIP((*LoopInfo)->getAfterIP());
4685 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4686 return Error::success();
4687 };
4688 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4689 if (Err)
4690 return Err;
4691 return Result;
4692}
4693
// Computes the trip count of a loop running from Start to Stop (exclusive,
// or inclusive if InclusiveStop) with the given Step, as an unsigned value
// of the induction variable's type. The computation is careful not to
// overflow for any combination of inputs.
Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
    const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
    bool IsSigned, bool InclusiveStop, const Twine &Name) {

  // Consider the following difficulties (assuming 8-bit signed integers):
  // * Adding \p Step to the loop counter which passes \p Stop may overflow:
  //   DO I = 1, 100, 50
  // * A \p Step of INT_MIN cannot be normalized to a positive direction:
  //   DO I = 100, 0, -128

  // Start, Stop and Step must be of the same integer type.
  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  updateToLocation(Loc);

  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  ConstantInt *One = ConstantInt::get(IndVarTy, 1);

  // Like Step, but always positive.
  Value *Incr = Step;

  // Distance between Start and Stop; always positive.
  Value *Span;

  // Condition whether there are no iterations are executed at all, e.g.
  // because UB < LB.
  Value *ZeroCmp;

  if (IsSigned) {
    // Ensure that increment is positive. If not, negate and invert LB and UB.
    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    // UB >= LB here, so the subtraction cannot wrap in the signed sense
    // (HasNUW=false, HasNSW=true).
    Span = Builder.CreateSub(UB, LB, "", false, true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
  } else {
    // Unsigned: Stop >= Start whenever the loop runs, so NUW is safe.
    Span = Builder.CreateSub(Stop, Start, "", true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
  }

  Value *CountIfLooping;
  if (InclusiveStop) {
    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
  } else {
    // Avoid incrementing past stop since it could overflow.
    Value *CountIfTwo = Builder.CreateAdd(
        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
  }

  // Zero iterations if the range is empty, otherwise the computed count.
  return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
                              "omp_" + Name + ".tripcount");
}
4753
4754Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4755 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4756 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4757 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4758 ScanInfo *ScanRedInfo) {
4759 LocationDescription ComputeLoc =
4760 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4761
4762 Value *TripCount = calculateCanonicalLoopTripCount(
4763 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4764
4765 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4766 Builder.restoreIP(CodeGenIP);
4767 Value *Span = Builder.CreateMul(IV, Step);
4768 Value *IndVar = Builder.CreateAdd(Span, Start);
4769 if (InScan)
4770 ScanRedInfo->IV = IndVar;
4771 return BodyGenCB(Builder.saveIP(), IndVar);
4772 };
4773 LocationDescription LoopLoc =
4774 ComputeIP.isSet()
4775 ? Loc
4776 : LocationDescription(Builder.saveIP(),
4777 Builder.getCurrentDebugLocation());
4778 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4779}
4780
4781// Returns an LLVM function to call for initializing loop bounds using OpenMP
4782// static scheduling for composite `distribute parallel for` depending on
4783// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4784// integers as unsigned similarly to CanonicalLoopInfo.
4785static FunctionCallee
4787 OpenMPIRBuilder &OMPBuilder) {
4788 unsigned Bitwidth = Ty->getIntegerBitWidth();
4789 if (Bitwidth == 32)
4790 return OMPBuilder.getOrCreateRuntimeFunction(
4791 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4792 if (Bitwidth == 64)
4793 return OMPBuilder.getOrCreateRuntimeFunction(
4794 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4795 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4796}
4797
4798// Returns an LLVM function to call for initializing loop bounds using OpenMP
4799// static scheduling depending on `type`. Only i32 and i64 are supported by the
4800// runtime. Always interpret integers as unsigned similarly to
4801// CanonicalLoopInfo.
4803 OpenMPIRBuilder &OMPBuilder) {
4804 unsigned Bitwidth = Ty->getIntegerBitWidth();
4805 if (Bitwidth == 32)
4806 return OMPBuilder.getOrCreateRuntimeFunction(
4807 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4808 if (Bitwidth == 64)
4809 return OMPBuilder.getOrCreateRuntimeFunction(
4810 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4811 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4812}
4813
4814OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4815 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4816 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
4817 OMPScheduleType DistScheduleSchedType) {
4818 assert(CLI->isValid() && "Requires a valid canonical loop");
4819 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4820 "Require dedicated allocate IP");
4821
4822 // Set up the source location value for OpenMP runtime.
4823 Builder.restoreIP(CLI->getPreheaderIP());
4824 Builder.SetCurrentDebugLocation(DL);
4825
4826 uint32_t SrcLocStrSize;
4827 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4828 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4829
4830 // Declare useful OpenMP runtime functions.
4831 Value *IV = CLI->getIndVar();
4832 Type *IVTy = IV->getType();
4833 FunctionCallee StaticInit =
4834 LoopType == WorksharingLoopType::DistributeForStaticLoop
4835 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4836 : getKmpcForStaticInitForType(IVTy, M, *this);
4837 FunctionCallee StaticFini =
4838 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4839
4840 // Allocate space for computed loop bounds as expected by the "init" function.
4841 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4842
4843 Type *I32Type = Type::getInt32Ty(M.getContext());
4844 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4845 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4846 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4847 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4848 CLI->setLastIter(PLastIter);
4849
4850 // At the end of the preheader, prepare for calling the "init" function by
4851 // storing the current loop bounds into the allocated space. A canonical loop
4852 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4853 // and produces an inclusive upper bound.
4854 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4855 Constant *Zero = ConstantInt::get(IVTy, 0);
4856 Constant *One = ConstantInt::get(IVTy, 1);
4857 Builder.CreateStore(Zero, PLowerBound);
4858 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4859 Builder.CreateStore(UpperBound, PUpperBound);
4860 Builder.CreateStore(One, PStride);
4861
4862 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4863
4864 OMPScheduleType SchedType =
4865 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4866 ? OMPScheduleType::OrderedDistribute
4868 Constant *SchedulingType =
4869 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4870
4871 // Call the "init" function and update the trip count of the loop with the
4872 // value it produced.
4873 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
4874 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
4875 this](Value *SchedulingType, auto &Builder) {
4876 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
4877 PLowerBound, PUpperBound});
4878 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4879 Value *PDistUpperBound =
4880 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4881 Args.push_back(PDistUpperBound);
4882 }
4883 Args.append({PStride, One, Zero});
4884 createRuntimeFunctionCall(StaticInit, Args);
4885 };
4886 BuildInitCall(SchedulingType, Builder);
4887 if (HasDistSchedule &&
4888 LoopType != WorksharingLoopType::DistributeStaticLoop) {
4889 Constant *DistScheduleSchedType = ConstantInt::get(
4890 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
4891 // We want to emit a second init function call for the dist_schedule clause
4892 // to the Distribute construct. This should only be done however if a
4893 // Workshare Loop is nested within a Distribute Construct
4894 BuildInitCall(DistScheduleSchedType, Builder);
4895 }
4896 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
4897 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
4898 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
4899 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
4900 CLI->setTripCount(TripCount);
4901
4902 // Update all uses of the induction variable except the one in the condition
4903 // block that compares it with the actual upper bound, and the increment in
4904 // the latch block.
4905
4906 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
4907 Builder.SetInsertPoint(CLI->getBody(),
4908 CLI->getBody()->getFirstInsertionPt());
4909 Builder.SetCurrentDebugLocation(DL);
4910 return Builder.CreateAdd(OldIV, LowerBound);
4911 });
4912
4913 // In the "exit" block, call the "fini" function.
4914 Builder.SetInsertPoint(CLI->getExit(),
4915 CLI->getExit()->getTerminator()->getIterator());
4916 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
4917
4918 // Add the barrier if requested.
4919 if (NeedsBarrier) {
4920 InsertPointOrErrorTy BarrierIP =
4921 createBarrier(LocationDescription(Builder.saveIP(), DL),
4922 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
4923 /* CheckCancelFlag */ false);
4924 if (!BarrierIP)
4925 return BarrierIP.takeError();
4926 }
4927
4928 InsertPointTy AfterIP = CLI->getAfterIP();
4929 CLI->invalidate();
4930
4931 return AfterIP;
4932}
4933
// Forward declarations of metadata helpers defined later in this file.
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
                                   LoopInfo &LI);
static void addLoopMetadata(CanonicalLoopInfo *Loop,
                            ArrayRef<Metadata *> Properties);
4938
4939static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
4940 LLVMContext &Ctx, Loop *Loop,
4942 SmallVector<Metadata *> &LoopMDList) {
4943 SmallSet<BasicBlock *, 8> Reachable;
4944
4945 // Get the basic blocks from the loop in which memref instructions
4946 // can be found.
4947 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
4948 // preferably without running any passes.
4949 for (BasicBlock *Block : Loop->getBlocks()) {
4950 if (Block == CLI->getCond() || Block == CLI->getHeader())
4951 continue;
4952 Reachable.insert(Block);
4953 }
4954
4955 // Add access group metadata to memory-access instructions.
4956 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
4957 for (BasicBlock *BB : Reachable)
4958 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
4959 // TODO: If the loop has existing parallel access metadata, have
4960 // to combine two lists.
4961 LoopMDList.push_back(MDNode::get(
4962 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
4963}
4964
4965OpenMPIRBuilder::InsertPointOrErrorTy
4966OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
4967 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4968 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
4969 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
4970 assert(CLI->isValid() && "Requires a valid canonical loop");
4971 assert(ChunkSize || DistScheduleChunkSize && "Chunk size is required");
4972
4973 LLVMContext &Ctx = CLI->getFunction()->getContext();
4974 Value *IV = CLI->getIndVar();
4975 Value *OrigTripCount = CLI->getTripCount();
4976 Type *IVTy = IV->getType();
4977 assert(IVTy->getIntegerBitWidth() <= 64 &&
4978 "Max supported tripcount bitwidth is 64 bits");
4979 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
4980 : Type::getInt64Ty(Ctx);
4981 Type *I32Type = Type::getInt32Ty(M.getContext());
4982 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
4983 Constant *One = ConstantInt::get(InternalIVTy, 1);
4984
4985 Function *F = CLI->getFunction();
4987 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
4988 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
4989 LoopAnalysis LIA;
4990 LoopInfo &&LI = LIA.run(*F, FAM);
4991 Loop *L = LI.getLoopFor(CLI->getHeader());
4992 SmallVector<Metadata *> LoopMDList;
4993 if (ChunkSize || DistScheduleChunkSize)
4994 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
4995 addLoopMetadata(CLI, LoopMDList);
4996
4997 // Declare useful OpenMP runtime functions.
4998 FunctionCallee StaticInit =
4999 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5000 FunctionCallee StaticFini =
5001 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5002
5003 // Allocate space for computed loop bounds as expected by the "init" function.
5004 Builder.restoreIP(AllocaIP);
5005 Builder.SetCurrentDebugLocation(DL);
5006 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5007 Value *PLowerBound =
5008 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5009 Value *PUpperBound =
5010 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5011 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5012 CLI->setLastIter(PLastIter);
5013
5014 // Set up the source location value for the OpenMP runtime.
5015 Builder.restoreIP(CLI->getPreheaderIP());
5016 Builder.SetCurrentDebugLocation(DL);
5017
5018 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5019 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5020 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5021 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5022 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5023 "distschedulechunksize");
5024 Value *CastedTripCount =
5025 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5026
5027 Constant *SchedulingType =
5028 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5029 Constant *DistSchedulingType =
5030 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5031 Builder.CreateStore(Zero, PLowerBound);
5032 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5033 Builder.CreateStore(OrigUpperBound, PUpperBound);
5034 Builder.CreateStore(One, PStride);
5035
5036 // Call the "init" function and update the trip count of the loop with the
5037 // value it produced.
5038 uint32_t SrcLocStrSize;
5039 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5040 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5041 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5042 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5043 PUpperBound, PStride, One,
5044 this](Value *SchedulingType, Value *ChunkSize,
5045 auto &Builder) {
5046 createRuntimeFunctionCall(
5047 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5048 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5049 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5050 /*pstride=*/PStride, /*incr=*/One,
5051 /*chunk=*/ChunkSize});
5052 };
5053 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5054 if (DistScheduleSchedType != OMPScheduleType::None &&
5055 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5056 SchedType != OMPScheduleType::OrderedDistribute) {
5057 // We want to emit a second init function call for the dist_schedule clause
5058 // to the Distribute construct. This should only be done however if a
5059 // Workshare Loop is nested within a Distribute Construct
5060 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5061 }
5062
5063 // Load values written by the "init" function.
5064 Value *FirstChunkStart =
5065 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5066 Value *FirstChunkStop =
5067 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5068 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5069 Value *ChunkRange =
5070 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5071 Value *NextChunkStride =
5072 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5073
5074 // Create outer "dispatch" loop for enumerating the chunks.
5075 BasicBlock *DispatchEnter = splitBB(Builder, true);
5076 Value *DispatchCounter;
5077
5078 // It is safe to assume this didn't return an error because the callback
5079 // passed into createCanonicalLoop is the only possible error source, and it
5080 // always returns success.
5081 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5082 {Builder.saveIP(), DL},
5083 [&](InsertPointTy BodyIP, Value *Counter) {
5084 DispatchCounter = Counter;
5085 return Error::success();
5086 },
5087 FirstChunkStart, CastedTripCount, NextChunkStride,
5088 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5089 "dispatch"));
5090
5091 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5092 // not have to preserve the canonical invariant.
5093 BasicBlock *DispatchBody = DispatchCLI->getBody();
5094 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5095 BasicBlock *DispatchExit = DispatchCLI->getExit();
5096 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5097 DispatchCLI->invalidate();
5098
5099 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5100 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5101 redirectTo(CLI->getExit(), DispatchLatch, DL);
5102 redirectTo(DispatchBody, DispatchEnter, DL);
5103
5104 // Prepare the prolog of the chunk loop.
5105 Builder.restoreIP(CLI->getPreheaderIP());
5106 Builder.SetCurrentDebugLocation(DL);
5107
5108 // Compute the number of iterations of the chunk loop.
5109 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5110 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5111 Value *IsLastChunk =
5112 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5113 Value *CountUntilOrigTripCount =
5114 Builder.CreateSub(CastedTripCount, DispatchCounter);
5115 Value *ChunkTripCount = Builder.CreateSelect(
5116 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5117 Value *BackcastedChunkTC =
5118 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5119 CLI->setTripCount(BackcastedChunkTC);
5120
5121 // Update all uses of the induction variable except the one in the condition
5122 // block that compares it with the actual upper bound, and the increment in
5123 // the latch block.
5124 Value *BackcastedDispatchCounter =
5125 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5126 CLI->mapIndVar([&](Instruction *) -> Value * {
5127 Builder.restoreIP(CLI->getBodyIP());
5128 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5129 });
5130
5131 // In the "exit" block, call the "fini" function.
5132 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5133 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5134
5135 // Add the barrier if requested.
5136 if (NeedsBarrier) {
5137 InsertPointOrErrorTy AfterIP =
5138 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5139 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5140 if (!AfterIP)
5141 return AfterIP.takeError();
5142 }
5143
5144#ifndef NDEBUG
5145 // Even though we currently do not support applying additional methods to it,
5146 // the chunk loop should remain a canonical loop.
5147 CLI->assertOK();
5148#endif
5149
5150 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5151}
5152
5153// Returns an LLVM function to call for executing an OpenMP static worksharing
5154// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5155// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5156static FunctionCallee
5157getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
5158 WorksharingLoopType LoopType) {
5159 unsigned Bitwidth = Ty->getIntegerBitWidth();
5160 Module &M = OMPBuilder->M;
5161 switch (LoopType) {
5162 case WorksharingLoopType::ForStaticLoop:
5163 if (Bitwidth == 32)
5164 return OMPBuilder->getOrCreateRuntimeFunction(
5165 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5166 if (Bitwidth == 64)
5167 return OMPBuilder->getOrCreateRuntimeFunction(
5168 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5169 break;
5170 case WorksharingLoopType::DistributeStaticLoop:
5171 if (Bitwidth == 32)
5172 return OMPBuilder->getOrCreateRuntimeFunction(
5173 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5174 if (Bitwidth == 64)
5175 return OMPBuilder->getOrCreateRuntimeFunction(
5176 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5177 break;
5178 case WorksharingLoopType::DistributeForStaticLoop:
5179 if (Bitwidth == 32)
5180 return OMPBuilder->getOrCreateRuntimeFunction(
5181 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5182 if (Bitwidth == 64)
5183 return OMPBuilder->getOrCreateRuntimeFunction(
5184 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5185 break;
5186 }
5187 if (Bitwidth != 32 && Bitwidth != 64) {
5188 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5189 }
5190 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5191}
5192
// Inserts a call to proper OpenMP Device RTL function which handles
// loop worksharing.
//
// Emits the call at the end of \p InsertBlock (before its terminator). The
// argument list depends on \p LoopType: distribute-only loops take a shorter
// signature and return early; the other kinds additionally pass the thread
// count obtained from omp_get_num_threads.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
                                          WorksharingLoopType LoopType,
                                          BasicBlock *InsertBlock, Value *Ident,
                                          Value *LoopBodyArg, Value *TripCount,
                                          Function &LoopBodyFn, bool NoLoop) {
  Type *TripCountTy = TripCount->getType();
  Module &M = OMPBuilder->M;
  IRBuilder<> &Builder = OMPBuilder->Builder;
  // Select the runtime entry point matching the loop kind and the trip-count
  // bitwidth (i32 vs. i64).
  FunctionCallee RTLFn =
      getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
  SmallVector<Value *, 8> RealArgs;
  RealArgs.push_back(Ident);
  RealArgs.push_back(&LoopBodyFn);
  RealArgs.push_back(LoopBodyArg);
  RealArgs.push_back(TripCount);
  if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
    // Distribute-only variant: append zero trailing arguments and emit the
    // call without querying the number of threads.
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
    Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
    OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
    return;
  }
  FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
      M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
  Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
  Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});

  // The runtime expects the thread count in the trip-count's type.
  RealArgs.push_back(
      Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
    // Composite distribute-for additionally forwards the NoLoop flag.
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
  } else {
    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
  }

  OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
}
5234
5236 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5237 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5238 WorksharingLoopType LoopType, bool NoLoop) {
5239 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5240 BasicBlock *Preheader = CLI->getPreheader();
5241 Value *TripCount = CLI->getTripCount();
5242
5243 // After loop body outling, the loop body contains only set up
5244 // of loop body argument structure and the call to the outlined
5245 // loop body function. Firstly, we need to move setup of loop body args
5246 // into loop preheader.
5247 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5248 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5249
5250 // The next step is to remove the whole loop. We do not it need anymore.
5251 // That's why make an unconditional branch from loop preheader to loop
5252 // exit block
5253 Builder.restoreIP({Preheader, Preheader->end()});
5254 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5255 Preheader->getTerminator()->eraseFromParent();
5256 Builder.CreateBr(CLI->getExit());
5257
5258 // Delete dead loop blocks
5259 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5260 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5261 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5262 CleanUpInfo.EntryBB = CLI->getHeader();
5263 CleanUpInfo.ExitBB = CLI->getExit();
5264 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5265 DeleteDeadBlocks(BlocksToBeRemoved);
5266
5267 // Find the instruction which corresponds to loop body argument structure
5268 // and remove the call to loop body function instruction.
5269 Value *LoopBodyArg;
5270 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5271 assert(OutlinedFnUser &&
5272 "Expected unique undroppable user of outlined function");
5273 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5274 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5275 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5276 "Expected outlined function call to be located in loop preheader");
5277 // Check in case no argument structure has been passed.
5278 if (OutlinedFnCallInstruction->arg_size() > 1)
5279 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5280 else
5281 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5282 OutlinedFnCallInstruction->eraseFromParent();
5283
5284 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5285 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5286
5287 for (auto &ToBeDeletedItem : ToBeDeleted)
5288 ToBeDeletedItem->eraseFromParent();
5289 CLI->invalidate();
5290}
5291
5292OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5293 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5294 WorksharingLoopType LoopType, bool NoLoop) {
5295 uint32_t SrcLocStrSize;
5296 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5297 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5298
5299 OutlineInfo OI;
5300 OI.OuterAllocaBB = CLI->getPreheader();
5301 Function *OuterFn = CLI->getPreheader()->getParent();
5302
5303 // Instructions which need to be deleted at the end of code generation
5305
5306 OI.OuterAllocaBB = AllocaIP.getBlock();
5307
5308 // Mark the body loop as region which needs to be extracted
5309 OI.EntryBB = CLI->getBody();
5310 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5311 "omp.prelatch", true);
5312
5313 // Prepare loop body for extraction
5314 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5315
5316 // Insert new loop counter variable which will be used only in loop
5317 // body.
5318 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5319 Instruction *NewLoopCntLoad =
5320 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5321 // New loop counter instructions are redundant in the loop preheader when
5322 // code generation for workshare loop is finshed. That's why mark them as
5323 // ready for deletion.
5324 ToBeDeleted.push_back(NewLoopCntLoad);
5325 ToBeDeleted.push_back(NewLoopCnt);
5326
5327 // Analyse loop body region. Find all input variables which are used inside
5328 // loop body region.
5329 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5331 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5332
5333 CodeExtractorAnalysisCache CEAC(*OuterFn);
5334 CodeExtractor Extractor(Blocks,
5335 /* DominatorTree */ nullptr,
5336 /* AggregateArgs */ true,
5337 /* BlockFrequencyInfo */ nullptr,
5338 /* BranchProbabilityInfo */ nullptr,
5339 /* AssumptionCache */ nullptr,
5340 /* AllowVarArgs */ true,
5341 /* AllowAlloca */ true,
5342 /* AllocationBlock */ CLI->getPreheader(),
5343 /* Suffix */ ".omp_wsloop",
5344 /* AggrArgsIn0AddrSpace */ true);
5345
5346 BasicBlock *CommonExit = nullptr;
5347 SetVector<Value *> SinkingCands, HoistingCands;
5348
5349 // Find allocas outside the loop body region which are used inside loop
5350 // body
5351 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5352
5353 // We need to model loop body region as the function f(cnt, loop_arg).
5354 // That's why we replace loop induction variable by the new counter
5355 // which will be one of loop body function argument
5356 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5357 CLI->getIndVar()->user_end());
5358 for (auto Use : Users) {
5359 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5360 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5361 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5362 }
5363 }
5364 }
5365 // Make sure that loop counter variable is not merged into loop body
5366 // function argument structure and it is passed as separate variable
5367 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5368
5369 // PostOutline CB is invoked when loop body function is outlined and
5370 // loop body is replaced by call to outlined function. We need to add
5371 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
5372 // function will handle loop control logic.
5373 //
5374 OI.PostOutlineCB = [=, ToBeDeletedVec =
5375 std::move(ToBeDeleted)](Function &OutlinedFn) {
5376 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5377 LoopType, NoLoop);
5378 };
5379 addOutlineInfo(std::move(OI));
5380 return CLI->getAfterIP();
5381}
5382
5383OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5384 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5385 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5386 bool HasSimdModifier, bool HasMonotonicModifier,
5387 bool HasNonmonotonicModifier, bool HasOrderedClause,
5388 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
5389 Value *DistScheduleChunkSize) {
5390 if (Config.isTargetDevice())
5391 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5392 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5393 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5394 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
5395
5396 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5397 OMPScheduleType::ModifierOrdered;
5398 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
5399 if (HasDistSchedule) {
5400 DistScheduleSchedType = DistScheduleChunkSize
5401 ? OMPScheduleType::OrderedDistributeChunked
5402 : OMPScheduleType::OrderedDistribute;
5403 }
5404 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5405 case OMPScheduleType::BaseStatic:
5406 case OMPScheduleType::BaseDistribute:
5407 assert(!ChunkSize || !DistScheduleChunkSize &&
5408 "No chunk size with static-chunked schedule");
5409 if (IsOrdered && !HasDistSchedule)
5410 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5411 NeedsBarrier, ChunkSize);
5412 // FIXME: Monotonicity ignored?
5413 if (DistScheduleChunkSize)
5414 return applyStaticChunkedWorkshareLoop(
5415 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5416 DistScheduleChunkSize, DistScheduleSchedType);
5417 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
5418 HasDistSchedule);
5419
5420 case OMPScheduleType::BaseStaticChunked:
5421 case OMPScheduleType::BaseDistributeChunked:
5422 if (IsOrdered && !HasDistSchedule)
5423 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5424 NeedsBarrier, ChunkSize);
5425 // FIXME: Monotonicity ignored?
5426 return applyStaticChunkedWorkshareLoop(
5427 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5428 DistScheduleChunkSize, DistScheduleSchedType);
5429
5430 case OMPScheduleType::BaseRuntime:
5431 case OMPScheduleType::BaseAuto:
5432 case OMPScheduleType::BaseGreedy:
5433 case OMPScheduleType::BaseBalanced:
5434 case OMPScheduleType::BaseSteal:
5435 case OMPScheduleType::BaseGuidedSimd:
5436 case OMPScheduleType::BaseRuntimeSimd:
5437 assert(!ChunkSize &&
5438 "schedule type does not support user-defined chunk sizes");
5439 [[fallthrough]];
5440 case OMPScheduleType::BaseDynamicChunked:
5441 case OMPScheduleType::BaseGuidedChunked:
5442 case OMPScheduleType::BaseGuidedIterativeChunked:
5443 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5444 case OMPScheduleType::BaseStaticBalancedChunked:
5445 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5446 NeedsBarrier, ChunkSize);
5447
5448 default:
5449 llvm_unreachable("Unknown/unimplemented schedule kind");
5450 }
5451}
5452
5453/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5454/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5455/// the runtime. Always interpret integers as unsigned similarly to
5456/// CanonicalLoopInfo.
5457static FunctionCallee
5458getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5459 unsigned Bitwidth = Ty->getIntegerBitWidth();
5460 if (Bitwidth == 32)
5461 return OMPBuilder.getOrCreateRuntimeFunction(
5462 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5463 if (Bitwidth == 64)
5464 return OMPBuilder.getOrCreateRuntimeFunction(
5465 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5466 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5467}
5468
5469/// Returns an LLVM function to call for updating the next loop using OpenMP
5470/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5471/// the runtime. Always interpret integers as unsigned similarly to
5472/// CanonicalLoopInfo.
5473static FunctionCallee
5474getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5475 unsigned Bitwidth = Ty->getIntegerBitWidth();
5476 if (Bitwidth == 32)
5477 return OMPBuilder.getOrCreateRuntimeFunction(
5478 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5479 if (Bitwidth == 64)
5480 return OMPBuilder.getOrCreateRuntimeFunction(
5481 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5482 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5483}
5484
5485/// Returns an LLVM function to call for finalizing the dynamic loop using
5486/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5487/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5488static FunctionCallee
5489getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5490 unsigned Bitwidth = Ty->getIntegerBitWidth();
5491 if (Bitwidth == 32)
5492 return OMPBuilder.getOrCreateRuntimeFunction(
5493 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5494 if (Bitwidth == 64)
5495 return OMPBuilder.getOrCreateRuntimeFunction(
5496 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5497 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5498}
5499
OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
                                           InsertPointTy AllocaIP,
                                           OMPScheduleType SchedType,
                                           bool NeedsBarrier, Value *Chunk) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");
  // NOTE(review): the condition of the following assert was lost in
  // extraction; reconstructed from upstream — confirm against the repository.
  assert(isValidWorkshareLoopScheduleType(SchedType) &&
         "Require valid schedule type");

  // Ordered schedules additionally require a dispatch_fini call per chunk.
  bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
                 OMPScheduleType::ModifierOrdered;

  // Set up the source location value for OpenMP runtime.
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
  CLI->setLastIter(PLastIter);

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  BasicBlock *Latch = CLI->getLatch();
  InsertPointTy AfterIP = CLI->getAfterIP();

  // The CLI will be "broken" in the code below, as the loop is no longer
  // a valid canonical loop.

  // Dynamic dispatch requires a chunk size; default to one iteration per
  // dispatch when the caller did not provide one.
  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function.
  createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
                                          /* LowerBound */ One, UpperBound,
                                          /* step */ One, Chunk});

  // An outer loop around the existing one: each iteration fetches the next
  // chunk of work from the runtime and re-enters the original loop body.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  // This needs to be 32-bit always, so can't use the IVTy Zero above.
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  // "next" fills the bound allocas with the next chunk and returns non-zero
  // while work remains.
  Value *Res = createRuntimeFunctionCall(
      DynamicNext,
      {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  // The runtime operates on the 1-based inclusive range stored above; subtract
  // one to rebase onto the 0-based canonical induction variable.
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change PHI-node in loop header to use outer cond rather than preheader,
  // and set IV to the LowerBound.
  Instruction *Phi = &Header->front();
  auto *PI = cast<PHINode>(Phi);
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the pre-header to jump to the OuterCond
  Instruction *Term = PreHeader->getTerminator();
  auto *Br = cast<BranchInst>(Term);
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * jump to the loop outer loop when done with one of the inner loops.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  Instruction *Comp = &*Builder.GetInsertPoint();
  auto *CI = cast<CmpInst>(Comp);
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to outer condition.
  Instruction *Branch = &Cond->back();
  auto *BI = cast<BranchInst>(Branch);
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Call the "fini" function if "ordered" is present in wsloop directive.
  if (Ordered) {
    Builder.SetInsertPoint(&Latch->back());
    FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
    createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
  }

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    InsertPointOrErrorTy BarrierIP =
        createBarrier(LocationDescription(Builder.saveIP(), DL),
                      omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                      /* CheckCancelFlag */ false);
    if (!BarrierIP)
      return BarrierIP.takeError();
  }

  // The CFG has been rewired above, so the CanonicalLoopInfo no longer
  // describes a canonical loop.
  CLI->invalidate();
  return AfterIP;
}
5632
/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
/// after this \p OldTarget will be orphaned.
// NOTE(review): the signature line was lost in extraction; reconstructed from
// the visible continuation and the call sites below — confirm against upstream.
static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
                                      BasicBlock *NewTarget, DebugLoc DL) {
  // Redirecting an edge mutates OldTarget's predecessor list, so iterate with
  // the early-increment adaptor to stay safe while modifying.
  for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
    redirectTo(Pred, NewTarget, DL);
}
5640
/// Determine which blocks in \p BBs are reachable from outside and remove the
/// ones that are not reachable from the function.
// NOTE(review): the signature and the declaration of BBsToErase were lost in
// extraction; reconstructed here — confirm the exact set type against upstream.
static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
  SetVector<BasicBlock *> BBsToErase(BBs.begin(), BBs.end());
  // A candidate block must be kept if any instruction *outside* the candidate
  // set still references it (branch target, blockaddress, ...).
  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    for (Use &U : BB->uses()) {
      auto *UseInst = dyn_cast<Instruction>(U.getUser());
      if (!UseInst)
        continue;
      if (BBsToErase.count(UseInst->getParent()))
        continue;
      return true;
    }
    return false;
  };

  // Iterate to a fixed point: keeping one block alive can expose outside uses
  // for other candidates.
  while (BBsToErase.remove_if(HasRemainingUses)) {
    // Try again if anything was removed.
  }

  SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
  DeleteDeadBlocks(BBVec);
}
5664
CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop trip count as the product of all input loop
  // trip counts.
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount =
        Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop use the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following in direction of
  // the control flow, from the leading in-between code, the loop nest body, the
  // trailing in-between code, and rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
  // the ContinueBlock is set, continue with that block. If ContinuePred, use
  // its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // that the original loop. More sophisticated schemes could keep track of what
  // the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  // The original CanonicalLoopInfos no longer describe valid loops.
  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}
5792
std::vector<CanonicalLoopInfo *>
OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                           ArrayRef<Value *> TileSizes) {
  assert(TileSizes.size() == Loops.size() &&
         "Must pass as many tile sizes as there are loops");
  int NumLoops = Loops.size();
  assert(NumLoops >= 1 && "At least one loop to tile required");

  CanonicalLoopInfo *OutermostLoop = Loops.front();
  CanonicalLoopInfo *InnermostLoop = Loops.back();
  Function *F = OutermostLoop->getBody()->getParent();
  BasicBlock *InnerEnter = InnermostLoop->getBody();
  BasicBlock *InnerLatch = InnermostLoop->getLatch();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Collect original trip counts and induction variable to be accessible by
  // index. Also, the structure of the original loops is not preserved during
  // the construction of the tiled loops, so do it before we scavenge the BBs of
  // any original CanonicalLoopInfo.
  SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() && "All input loops must be valid canonical loops");
    OrigTripCounts.push_back(L->getTripCount());
    OrigIndVars.push_back(L->getIndVar());
  }

  // Collect the code between loop headers. These may contain SSA definitions
  // that are used in the loop nest body. To be usable with in the innermost
  // body, these BasicBlocks will be sunk into the loop nest body. That is,
  // these instructions may be executed more often than before the tiling.
  // TODO: It would be sufficient to only sink them into body of the
  // corresponding tile loop.
  // NOTE(review): this declaration line was lost in extraction; reconstructed
  // from its uses below — confirm against upstream.
  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
  for (int i = 0; i < NumLoops - 1; ++i) {
    CanonicalLoopInfo *Surrounding = Loops[i];
    CanonicalLoopInfo *Nested = Loops[i + 1];

    BasicBlock *EnterBB = Surrounding->getBody();
    BasicBlock *ExitBB = Nested->getHeader();
    InbetweenCode.emplace_back(EnterBB, ExitBB);
  }

  // Compute the trip counts of the floor loops.
  Builder.SetCurrentDebugLocation(DL);
  Builder.restoreIP(OutermostLoop->getPreheaderIP());
  SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
  for (int i = 0; i < NumLoops; ++i) {
    Value *TileSize = TileSizes[i];
    Value *OrigTripCount = OrigTripCounts[i];
    Type *IVType = OrigTripCount->getType();

    Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
    Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);

    // 0 if tripcount divides the tilesize, 1 otherwise.
    // 1 means we need an additional iteration for a partial tile.
    //
    // Unfortunately we cannot just use the roundup-formula
    //   (tripcount + tilesize - 1)/tilesize
    // because the summation might overflow. We do not want introduce undefined
    // behavior when the untiled loop nest did not.
    Value *FloorTripOverflow =
        Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));

    FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
    Value *FloorTripCount =
        Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
                          "omp_floor" + Twine(i) + ".tripcount", true);

    // Remember some values for later use.
    FloorCompleteCount.push_back(FloorCompleteTripCount);
    FloorCount.push_back(FloorTripCount);
    FloorRems.push_back(FloorTripRem);
  }

  // Generate the new loop nest, from the outermost to the innermost.
  std::vector<CanonicalLoopInfo *> Result;
  Result.reserve(NumLoops * 2);

  // The basic block of the surrounding loop that enters the nest generated
  // loop.
  BasicBlock *Enter = OutermostLoop->getPreheader();

  // The basic block of the surrounding loop where the inner code should
  // continue.
  BasicBlock *Continue = OutermostLoop->getAfter();

  // Where the next loop basic block should be inserted.
  BasicBlock *OutroInsertBefore = InnermostLoop->getExit();

  auto EmbeddNewLoop =
      [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
          Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
    CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
        DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
    redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
    redirectTo(EmbeddedLoop->getAfter(), Continue, DL);

    // Setup the position where the next embedded loop connects to this loop.
    Enter = EmbeddedLoop->getBody();
    Continue = EmbeddedLoop->getLatch();
    OutroInsertBefore = EmbeddedLoop->getLatch();
    return EmbeddedLoop;
  };

  auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
                                                  const Twine &NameBase) {
    for (auto P : enumerate(TripCounts)) {
      CanonicalLoopInfo *EmbeddedLoop =
          EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
      Result.push_back(EmbeddedLoop);
    }
  };

  EmbeddNewLoops(FloorCount, "floor");

  // Within the innermost floor loop, emit the code that computes the tile
  // sizes.
  Builder.SetInsertPoint(Enter->getTerminator());
  SmallVector<Value *, 4> TileCounts;
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    Value *TileSize = TileSizes[i];

    // The last floor iteration (if any remainder exists) runs the partial
    // tile; all others run a full tile.
    Value *FloorIsEpilogue =
        Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
    Value *TileTripCount =
        Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);

    TileCounts.push_back(TileTripCount);
  }

  // Create the tile loops.
  EmbeddNewLoops(TileCounts, "tile");

  // Insert the inbetween code into the body.
  BasicBlock *BodyEnter = Enter;
  BasicBlock *BodyEntered = nullptr;
  for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
    BasicBlock *EnterBB = P.first;
    BasicBlock *ExitBB = P.second;

    if (BodyEnter)
      redirectTo(BodyEnter, EnterBB, DL);
    else
      redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);

    BodyEnter = nullptr;
    BodyEntered = ExitBB;
  }

  // Append the original loop nest body into the generated loop nest body.
  if (BodyEnter)
    redirectTo(BodyEnter, InnerEnter, DL);
  else
    redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
  // NOTE(review): the following line was lost in extraction; reconstructed
  // from upstream (InnerLatch is otherwise unused) — confirm.
  redirectAllPredecessorsTo(InnerLatch, Result.back()->getLatch(), DL);

  // Replace the original induction variable with an induction variable computed
  // from the tile and floor induction variables.
  Builder.restoreIP(Result.back()->getBodyIP());
  for (int i = 0; i < NumLoops; ++i) {
    CanonicalLoopInfo *FloorLoop = Result[i];
    CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
    Value *OrigIndVar = OrigIndVars[i];
    Value *Size = TileSizes[i];

    // orig_iv = floor_iv * tilesize + tile_iv
    Value *Scale =
        Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
    Value *Shift =
        Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
    OrigIndVar->replaceAllUsesWith(Shift);
  }

  // Remove unused parts of the original loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  // The original CanonicalLoopInfos no longer describe valid loops.
  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  for (CanonicalLoopInfo *GenL : Result)
    GenL->assertOK();
#endif
  return Result;
}
5984
5985/// Attach metadata \p Properties to the basic block described by \p BB. If the
5986/// basic block already has metadata, the basic block properties are appended.
5988 ArrayRef<Metadata *> Properties) {
5989 // Nothing to do if no property to attach.
5990 if (Properties.empty())
5991 return;
5992
5993 LLVMContext &Ctx = BB->getContext();
5994 SmallVector<Metadata *> NewProperties;
5995 NewProperties.push_back(nullptr);
5996
5997 // If the basic block already has metadata, prepend it to the new metadata.
5998 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
5999 if (Existing)
6000 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6001
6002 append_range(NewProperties, Properties);
6003 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6004 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6005
6006 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6007}
6008
6009/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6010/// loop already has metadata, the loop properties are appended.
6011static void addLoopMetadata(CanonicalLoopInfo *Loop,
6012 ArrayRef<Metadata *> Properties) {
6013 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6014
6015 // Attach metadata to the loop's latch
6016 BasicBlock *Latch = Loop->getLatch();
6017 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6018 addBasicBlockMetadata(Latch, Properties);
6019}
6020
/// Attach llvm.access.group metadata to the memref instructions of \p Block
// NOTE(review): the signature line was lost in extraction; reconstructed from
// the visible continuation ("LoopInfo &LI) {") and the body's use of a block
// and access-group node — confirm name and parameters against upstream.
static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
                            LoopInfo &LI) {
  for (Instruction &I : *Block) {
    // Only instructions that may touch memory need access-group membership.
    if (I.mayReadOrWriteMemory()) {
      // TODO: This instruction may already have access group from
      // other pragmas e.g. #pragma clang loop vectorize. Append
      // so that the existing metadata is not overwritten.
      I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
    }
  }
}
6033
6034void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
6035 LLVMContext &Ctx = Builder.getContext();
6037 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6038 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6039}
6040
6041void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
6042 LLVMContext &Ctx = Builder.getContext();
6044 Loop, {
6045 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6046 });
6047}
6048
/// Emit an if-versioned copy of the loop body: one branch runs the original
/// (SIMD) body, the other a clone, selected by \p IfCond on every iteration.
void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                      Value *IfCond, ValueToValueMapTy &VMap,
                                      LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
                                      const Twine &NamePrefix) {
  Function *F = CanonicalLoop->getFunction();

  // We can't do
  // if (cond) {
  //   simd_loop;
  // } else {
  //   non_simd_loop;
  // }
  // because then the CanonicalLoopInfo would only point to one of the loops:
  // leading to other constructs operating on the same loop to malfunction.
  // Instead generate
  // while (...) {
  //   if (cond) {
  //     simd_body;
  //   } else {
  //     not_simd_body;
  //   }
  // }
  // At least for simple loops, LLVM seems able to hoist the if out of the loop
  // body at -O3

  // Define where if branch should be inserted
  auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();

  // Create additional blocks for the if statement
  BasicBlock *Cond = SplitBeforeIt->getParent();
  llvm::LLVMContext &C = Cond->getContext();
  // NOTE(review): the two BasicBlock::Create declaration lines were lost in
  // extraction; reconstructed from their visible argument continuations.
  BasicBlock *ThenBlock = BasicBlock::Create(
      C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
  BasicBlock *ElseBlock = BasicBlock::Create(
      C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());

  // Create if condition branch.
  Builder.SetInsertPoint(SplitBeforeIt);
  Instruction *BrInstr =
      Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
  InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
  // Then block contains branch to omp loop body which needs to be vectorized
  spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
  ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);

  Builder.SetInsertPoint(ElseBlock);

  // Clone loop for the else branch
  SmallVector<BasicBlock *, 8> NewBlocks;

  SmallVector<BasicBlock *, 8> ExistingBlocks;
  ExistingBlocks.reserve(L->getNumBlocks() + 1);
  ExistingBlocks.push_back(ThenBlock);
  ExistingBlocks.append(L->block_begin(), L->block_end());
  // Cond is the block that has the if clause condition
  // LoopCond is omp_loop.cond
  // LoopHeader is omp_loop.header
  BasicBlock *LoopCond = Cond->getUniquePredecessor();
  BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
  // NOTE(review): this assert fires only after LoopCond has already been
  // dereferenced on the previous line; consider asserting LoopCond first.
  assert(LoopCond && LoopHeader && "Invalid loop structure");
  for (BasicBlock *Block : ExistingBlocks) {
    // Skip the loop's control skeleton; only body blocks are duplicated.
    if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
        Block == LoopHeader || Block == LoopCond || Block == Cond) {
      continue;
    }
    BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);

    // fix name not to be omp.if.then
    if (Block == ThenBlock)
      NewBB->setName(NamePrefix + ".if.else");

    NewBB->moveBefore(CanonicalLoop->getExit());
    VMap[Block] = NewBB;
    NewBlocks.push_back(NewBB);
  }
  // Rewrite cloned instructions to reference cloned values/blocks.
  remapInstructionsInBlocks(NewBlocks, VMap);
  Builder.CreateBr(NewBlocks.front());

  // The loop latch must have only one predecessor. Currently it is branched to
  // from both the 'then' and 'else' branches.
  L->getLoopLatch()->splitBasicBlock(
      L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);

  // Ensure that the then block is added to the loop so we add the attributes in
  // the next step
  L->addBasicBlockToLoop(ThenBlock, LI);
}
6136
6137unsigned
6138OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
6139 const StringMap<bool> &Features) {
6140 if (TargetTriple.isX86()) {
6141 if (Features.lookup("avx512f"))
6142 return 512;
6143 else if (Features.lookup("avx"))
6144 return 256;
6145 return 128;
6146 }
6147 if (TargetTriple.isPPC())
6148 return 128;
6149 if (TargetTriple.isWasm())
6150 return 128;
6151 return 0;
6152}
6153
void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
                                MapVector<Value *, Value *> AlignedVars,
                                Value *IfCond, OrderKind Order,
                                ConstantInt *Simdlen, ConstantInt *Safelen) {
  LLVMContext &Ctx = Builder.getContext();

  Function *F = CanonicalLoop->getFunction();

  // TODO: We should not rely on pass manager. Currently we use pass manager
  // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
  // object. We should have a method which returns all blocks between
  // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
  // NOTE(review): this declaration line was lost in extraction; reconstructed
  // from the registerPass/run calls below — confirm against upstream.
  FunctionAnalysisManager FAM;
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return LoopAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });

  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);

  Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
  if (AlignedVars.size()) {
    InsertPointTy IP = Builder.saveIP();
    for (auto &AlignedItem : AlignedVars) {
      Value *AlignedPtr = AlignedItem.first;
      Value *Alignment = AlignedItem.second;
      // NOTE(review): the dyn_cast result is dereferenced without a null
      // check; this assumes every aligned pointer is an Instruction — confirm.
      Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
      Builder.SetInsertPoint(loadInst->getNextNode());
      Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
                                        Alignment);
    }
    Builder.restoreIP(IP);
  }

  if (IfCond) {
    ValueToValueMapTy VMap;
    createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
  }

  // NOTE(review): this declaration line was lost in extraction; reconstructed
  // from the insert below (SmallSet.h is included at the top of the file).
  SmallSet<BasicBlock *, 8> Reachable;

  // Get the basic blocks from the loop in which memref instructions
  // can be found.
  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : L->getBlocks()) {
    if (Block == CanonicalLoop->getCond() ||
        Block == CanonicalLoop->getHeader())
      continue;
    Reachable.insert(Block);
  }

  SmallVector<Metadata *> LoopMDList;

  // In presence of finite 'safelen', it may be unsafe to mark all
  // the memory instructions parallel, because loop-carried
  // dependences of 'safelen' iterations are possible.
  // If clause order(concurrent) is specified then the memory instructions
  // are marked parallel even if 'safelen' is finite.
  if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
    applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);

  // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
  // versions so we can't add the loop attributes in that case.
  if (IfCond) {
    // we can still add llvm.loop.parallel_access
    addLoopMetadata(CanonicalLoop, LoopMDList);
    return;
  }

  // Use the above access group metadata to create loop level
  // metadata, which should be distinct for each loop.
  // NOTE(review): the initializer line was lost in extraction; reconstructed
  // as a true i1 constant — confirm against upstream.
  ConstantAsMetadata *BoolConst =
      ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));

  if (Simdlen || Safelen) {
    // If both simdlen and safelen clauses are specified, the value of the
    // simdlen parameter must be less than or equal to the value of the safelen
    // parameter. Therefore, use safelen only in the absence of simdlen.
    ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
    LoopMDList.push_back(
        MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
                          ConstantAsMetadata::get(VectorizeWidth)}));
  }

  addLoopMetadata(CanonicalLoop, LoopMDList);
}
6243
/// Create the TargetMachine object to query the backend for optimization
/// preferences.
///
/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
/// needed for the LLVM pass pipline. We use some default options to avoid
/// having to pass too many settings from the frontend that probably do not
/// matter.
///
/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
/// method. If we are going to use TargetMachine for more purposes, especially
/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
/// might become be worth requiring front-ends to pass on their TargetMachine,
/// or at least cache it between methods. Note that while fontends such as Clang
/// have just a single main TargetMachine per translation unit, "target-cpu" and
/// "target-features" that determine the TargetMachine are per-function and can
/// be overrided using __attribute__((target("OPTIONS"))).
// NOTE(review): the function-name line, the lookupTarget call, and the
// TargetOptions declaration were lost in extraction; reconstructed from the
// surrounding code — confirm against upstream.
static std::unique_ptr<TargetMachine>
createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
  Module *M = F->getParent();

  // Per-function attributes may override the module-level target selection.
  StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
  StringRef Features = F->getFnAttribute("target-features").getValueAsString();
  const llvm::Triple &Triple = M->getTargetTriple();

  std::string Error;
  const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
  // Unknown/unregistered target: callers treat a null TargetMachine as "no
  // target information available".
  if (!TheTarget)
    return {};

  llvm::TargetOptions Options;
  return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
      Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
      /*CodeModel=*/std::nullopt, OptLevel));
}
6279
6280/// Heuristically determine the best-performant unroll factor for \p CLI. This
6281/// depends on the target processor. We are re-using the same heuristics as the
6282/// LoopUnrollPass.
6283static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6284 Function *F = CLI->getFunction();
6285
6286 // Assume the user requests the most aggressive unrolling, even if the rest of
6287 // the code is optimized using a lower setting.
6289 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6290
6292 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6293 FAM.registerPass([]() { return AssumptionAnalysis(); });
6294 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6295 FAM.registerPass([]() { return LoopAnalysis(); });
6296 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6297 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6298 TargetIRAnalysis TIRA;
6299 if (TM)
6300 TIRA = TargetIRAnalysis(
6301 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6302 FAM.registerPass([&]() { return TIRA; });
6303
6304 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6306 ScalarEvolution &&SE = SEA.run(*F, FAM);
6308 DominatorTree &&DT = DTA.run(*F, FAM);
6309 LoopAnalysis LIA;
6310 LoopInfo &&LI = LIA.run(*F, FAM);
6312 AssumptionCache &&AC = ACT.run(*F, FAM);
6314
6315 Loop *L = LI.getLoopFor(CLI->getHeader());
6316 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6317
6319 L, SE, TTI,
6320 /*BlockFrequencyInfo=*/nullptr,
6321 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6322 /*UserThreshold=*/std::nullopt,
6323 /*UserCount=*/std::nullopt,
6324 /*UserAllowPartial=*/true,
6325 /*UserAllowRuntime=*/true,
6326 /*UserUpperBound=*/std::nullopt,
6327 /*UserFullUnrollMaxCount=*/std::nullopt);
6328
6329 UP.Force = true;
6330
6331 // Account for additional optimizations taking place before the LoopUnrollPass
6332 // would unroll the loop.
6335
6336 // Use normal unroll factors even if the rest of the code is optimized for
6337 // size.
6340
6341 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6342 << " Threshold=" << UP.Threshold << "\n"
6343 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6344 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6345 << " PartialOptSizeThreshold="
6346 << UP.PartialOptSizeThreshold << "\n");
6347
6348 // Disable peeling.
6351 /*UserAllowPeeling=*/false,
6352 /*UserAllowProfileBasedPeeling=*/false,
6353 /*UnrollingSpecficValues=*/false);
6354
6356 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6357
6358 // Assume that reads and writes to stack variables can be eliminated by
6359 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6360 // size.
6361 for (BasicBlock *BB : L->blocks()) {
6362 for (Instruction &I : *BB) {
6363 Value *Ptr;
6364 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6365 Ptr = Load->getPointerOperand();
6366 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6367 Ptr = Store->getPointerOperand();
6368 } else
6369 continue;
6370
6371 Ptr = Ptr->stripPointerCasts();
6372
6373 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6374 if (Alloca->getParent() == &F->getEntryBlock())
6375 EphValues.insert(&I);
6376 }
6377 }
6378 }
6379
6380 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6381
6382 // Loop is not unrollable if the loop contains certain instructions.
6383 if (!UCE.canUnroll()) {
6384 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6385 return 1;
6386 }
6387
6388 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6389 << "\n");
6390
6391 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6392 // be able to use it.
6393 int TripCount = 0;
6394 int MaxTripCount = 0;
6395 bool MaxOrZero = false;
6396 unsigned TripMultiple = 0;
6397
6398 bool UseUpperBound = false;
6399 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6400 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6401 UseUpperBound);
6402 unsigned Factor = UP.Count;
6403 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6404
6405 // This function returns 1 to signal to not unroll a loop.
6406 if (Factor == 0)
6407 return 1;
6408 return Factor;
6409}
6410
6411void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6412 int32_t Factor,
6413 CanonicalLoopInfo **UnrolledCLI) {
6414 assert(Factor >= 0 && "Unroll factor must not be negative");
6415
6416 Function *F = Loop->getFunction();
6417 LLVMContext &Ctx = F->getContext();
6418
6419 // If the unrolled loop is not used for another loop-associated directive, it
6420 // is sufficient to add metadata for the LoopUnrollPass.
6421 if (!UnrolledCLI) {
6422 SmallVector<Metadata *, 2> LoopMetadata;
6423 LoopMetadata.push_back(
6424 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6425
6426 if (Factor >= 1) {
6428 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6429 LoopMetadata.push_back(MDNode::get(
6430 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6431 }
6432
6433 addLoopMetadata(Loop, LoopMetadata);
6434 return;
6435 }
6436
6437 // Heuristically determine the unroll factor.
6438 if (Factor == 0)
6440
6441 // No change required with unroll factor 1.
6442 if (Factor == 1) {
6443 *UnrolledCLI = Loop;
6444 return;
6445 }
6446
6447 assert(Factor >= 2 &&
6448 "unrolling only makes sense with a factor of 2 or larger");
6449
6450 Type *IndVarTy = Loop->getIndVarType();
6451
6452 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6453 // unroll the inner loop.
6454 Value *FactorVal =
6455 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6456 /*isSigned=*/false));
6457 std::vector<CanonicalLoopInfo *> LoopNest =
6458 tileLoops(DL, {Loop}, {FactorVal});
6459 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6460 *UnrolledCLI = LoopNest[0];
6461 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6462
6463 // LoopUnrollPass can only fully unroll loops with constant trip count.
6464 // Unroll by the unroll factor with a fallback epilog for the remainder
6465 // iterations if necessary.
6467 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6469 InnerLoop,
6470 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6472 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6473
6474#ifndef NDEBUG
6475 (*UnrolledCLI)->assertOK();
6476#endif
6477}
6478
6479OpenMPIRBuilder::InsertPointTy
6480OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6481 llvm::Value *BufSize, llvm::Value *CpyBuf,
6482 llvm::Value *CpyFn, llvm::Value *DidIt) {
6483 if (!updateToLocation(Loc))
6484 return Loc.IP;
6485
6486 uint32_t SrcLocStrSize;
6487 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6488 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6489 Value *ThreadId = getOrCreateThreadID(Ident);
6490
6491 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6492
6493 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6494
6495 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6496 createRuntimeFunctionCall(Fn, Args);
6497
6498 return Builder.saveIP();
6499}
6500
6501OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6502 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6503 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6505
6506 if (!updateToLocation(Loc))
6507 return Loc.IP;
6508
6509 // If needed allocate and initialize `DidIt` with 0.
6510 // DidIt: flag variable: 1=single thread; 0=not single thread.
6511 llvm::Value *DidIt = nullptr;
6512 if (!CPVars.empty()) {
6513 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6514 Builder.CreateStore(Builder.getInt32(0), DidIt);
6515 }
6516
6517 Directive OMPD = Directive::OMPD_single;
6518 uint32_t SrcLocStrSize;
6519 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6520 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6521 Value *ThreadId = getOrCreateThreadID(Ident);
6522 Value *Args[] = {Ident, ThreadId};
6523
6524 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6525 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
6526
6527 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6528 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6529
6530 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6531 if (Error Err = FiniCB(IP))
6532 return Err;
6533
6534 // The thread that executes the single region must set `DidIt` to 1.
6535 // This is used by __kmpc_copyprivate, to know if the caller is the
6536 // single thread or not.
6537 if (DidIt)
6538 Builder.CreateStore(Builder.getInt32(1), DidIt);
6539
6540 return Error::success();
6541 };
6542
6543 // generates the following:
6544 // if (__kmpc_single()) {
6545 // .... single region ...
6546 // __kmpc_end_single
6547 // }
6548 // __kmpc_copyprivate
6549 // __kmpc_barrier
6550
6551 InsertPointOrErrorTy AfterIP =
6552 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6553 /*Conditional*/ true,
6554 /*hasFinalize*/ true);
6555 if (!AfterIP)
6556 return AfterIP.takeError();
6557
6558 if (DidIt) {
6559 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6560 // NOTE BufSize is currently unused, so just pass 0.
6561 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6562 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6563 CPFuncs[I], DidIt);
6564 // NOTE __kmpc_copyprivate already inserts a barrier
6565 } else if (!IsNowait) {
6566 InsertPointOrErrorTy AfterIP =
6567 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6568 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6569 /* CheckCancelFlag */ false);
6570 if (!AfterIP)
6571 return AfterIP.takeError();
6572 }
6573 return Builder.saveIP();
6574}
6575
6576OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6577 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6578 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6579
6580 if (!updateToLocation(Loc))
6581 return Loc.IP;
6582
6583 Directive OMPD = Directive::OMPD_critical;
6584 uint32_t SrcLocStrSize;
6585 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6586 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6587 Value *ThreadId = getOrCreateThreadID(Ident);
6588 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6589 Value *Args[] = {Ident, ThreadId, LockVar};
6590
6591 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6592 Function *RTFn = nullptr;
6593 if (HintInst) {
6594 // Add Hint to entry Args and create call
6595 EnterArgs.push_back(HintInst);
6596 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6597 } else {
6598 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6599 }
6600 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
6601
6602 Function *ExitRTLFn =
6603 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6604 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6605
6606 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6607 /*Conditional*/ false, /*hasFinalize*/ true);
6608}
6609
6610OpenMPIRBuilder::InsertPointTy
6611OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6612 InsertPointTy AllocaIP, unsigned NumLoops,
6613 ArrayRef<llvm::Value *> StoreValues,
6614 const Twine &Name, bool IsDependSource) {
6615 assert(
6616 llvm::all_of(StoreValues,
6617 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6618 "OpenMP runtime requires depend vec with i64 type");
6619
6620 if (!updateToLocation(Loc))
6621 return Loc.IP;
6622
6623 // Allocate space for vector and generate alloc instruction.
6624 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6625 Builder.restoreIP(AllocaIP);
6626 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6627 ArgsBase->setAlignment(Align(8));
6628 updateToLocation(Loc);
6629
6630 // Store the index value with offset in depend vector.
6631 for (unsigned I = 0; I < NumLoops; ++I) {
6632 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6633 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6634 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6635 STInst->setAlignment(Align(8));
6636 }
6637
6638 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6639 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6640
6641 uint32_t SrcLocStrSize;
6642 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6643 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6644 Value *ThreadId = getOrCreateThreadID(Ident);
6645 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6646
6647 Function *RTLFn = nullptr;
6648 if (IsDependSource)
6649 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6650 else
6651 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6652 createRuntimeFunctionCall(RTLFn, Args);
6653
6654 return Builder.saveIP();
6655}
6656
6657OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6658 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6659 FinalizeCallbackTy FiniCB, bool IsThreads) {
6660 if (!updateToLocation(Loc))
6661 return Loc.IP;
6662
6663 Directive OMPD = Directive::OMPD_ordered;
6664 Instruction *EntryCall = nullptr;
6665 Instruction *ExitCall = nullptr;
6666
6667 if (IsThreads) {
6668 uint32_t SrcLocStrSize;
6669 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6670 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6671 Value *ThreadId = getOrCreateThreadID(Ident);
6672 Value *Args[] = {Ident, ThreadId};
6673
6674 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6675 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
6676
6677 Function *ExitRTLFn =
6678 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6679 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6680 }
6681
6682 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6683 /*Conditional*/ false, /*hasFinalize*/ true);
6684}
6685
6686OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6687 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6688 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6689 bool HasFinalize, bool IsCancellable) {
6690
6691 if (HasFinalize)
6692 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6693
6694 // Create inlined region's entry and body blocks, in preparation
6695 // for conditional creation
6696 BasicBlock *EntryBB = Builder.GetInsertBlock();
6697 Instruction *SplitPos = EntryBB->getTerminator();
6698 if (!isa_and_nonnull<BranchInst>(SplitPos))
6699 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6700 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6701 BasicBlock *FiniBB =
6702 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6703
6704 Builder.SetInsertPoint(EntryBB->getTerminator());
6705 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6706
6707 // generate body
6708 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6709 /* CodeGenIP */ Builder.saveIP()))
6710 return Err;
6711
6712 // emit exit call and do any needed finalization.
6713 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6714 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6715 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6716 "Unexpected control flow graph state!!");
6717 InsertPointOrErrorTy AfterIP =
6718 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6719 if (!AfterIP)
6720 return AfterIP.takeError();
6721 assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
6722 "Unexpected Control Flow State!");
6724
6725 // If we are skipping the region of a non conditional, remove the exit
6726 // block, and clear the builder's insertion point.
6727 assert(SplitPos->getParent() == ExitBB &&
6728 "Unexpected Insertion point location!");
6729 auto merged = MergeBlockIntoPredecessor(ExitBB);
6730 BasicBlock *ExitPredBB = SplitPos->getParent();
6731 auto InsertBB = merged ? ExitPredBB : ExitBB;
6732 if (!isa_and_nonnull<BranchInst>(SplitPos))
6733 SplitPos->eraseFromParent();
6734 Builder.SetInsertPoint(InsertBB);
6735
6736 return Builder.saveIP();
6737}
6738
6739OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6740 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6741 // if nothing to do, Return current insertion point.
6742 if (!Conditional || !EntryCall)
6743 return Builder.saveIP();
6744
6745 BasicBlock *EntryBB = Builder.GetInsertBlock();
6746 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6747 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6748 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6749
6750 // Emit thenBB and set the Builder's insertion point there for
6751 // body generation next. Place the block after the current block.
6752 Function *CurFn = EntryBB->getParent();
6753 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6754
6755 // Move Entry branch to end of ThenBB, and replace with conditional
6756 // branch (If-stmt)
6757 Instruction *EntryBBTI = EntryBB->getTerminator();
6758 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6759 EntryBBTI->removeFromParent();
6760 Builder.SetInsertPoint(UI);
6761 Builder.Insert(EntryBBTI);
6762 UI->eraseFromParent();
6763 Builder.SetInsertPoint(ThenBB->getTerminator());
6764
6765 // return an insertion point to ExitBB.
6766 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6767}
6768
6769OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6770 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6771 bool HasFinalize) {
6772
6773 Builder.restoreIP(FinIP);
6774
6775 // If there is finalization to do, emit it before the exit call
6776 if (HasFinalize) {
6777 assert(!FinalizationStack.empty() &&
6778 "Unexpected finalization stack state!");
6779
6780 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6781 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6782
6783 if (Error Err = Fi.FiniCB(FinIP))
6784 return Err;
6785
6786 BasicBlock *FiniBB = FinIP.getBlock();
6787 Instruction *FiniBBTI = FiniBB->getTerminator();
6788
6789 // set Builder IP for call creation
6790 Builder.SetInsertPoint(FiniBBTI);
6791 }
6792
6793 if (!ExitCall)
6794 return Builder.saveIP();
6795
6796 // place the Exitcall as last instruction before Finalization block terminator
6797 ExitCall->removeFromParent();
6798 Builder.Insert(ExitCall);
6799
6800 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6801 ExitCall->getIterator());
6802}
6803
6804OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6805 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6806 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6807 if (!IP.isSet())
6808 return IP;
6809
6810 IRBuilder<>::InsertPointGuard IPG(Builder);
6811
6812 // creates the following CFG structure
6813 // OMP_Entry : (MasterAddr != PrivateAddr)?
6814 // F T
6815 // | \
6816 // | copin.not.master
6817 // | /
6818 // v /
6819 // copyin.not.master.end
6820 // |
6821 // v
6822 // OMP.Entry.Next
6823
6824 BasicBlock *OMP_Entry = IP.getBlock();
6825 Function *CurFn = OMP_Entry->getParent();
6826 BasicBlock *CopyBegin =
6827 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6828 BasicBlock *CopyEnd = nullptr;
6829
6830 // If entry block is terminated, split to preserve the branch to following
6831 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
6832 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6833 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6834 "copyin.not.master.end");
6835 OMP_Entry->getTerminator()->eraseFromParent();
6836 } else {
6837 CopyEnd =
6838 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6839 }
6840
6841 Builder.SetInsertPoint(OMP_Entry);
6842 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6843 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6844 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6845 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6846
6847 Builder.SetInsertPoint(CopyBegin);
6848 if (BranchtoEnd)
6849 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6850
6851 return Builder.saveIP();
6852}
6853
6854CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6856 std::string Name) {
6857 IRBuilder<>::InsertPointGuard IPG(Builder);
6858 updateToLocation(Loc);
6859
6860 uint32_t SrcLocStrSize;
6861 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6862 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6863 Value *ThreadId = getOrCreateThreadID(Ident);
6864 Value *Args[] = {ThreadId, Size, Allocator};
6865
6866 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6867
6868 return createRuntimeFunctionCall(Fn, Args, Name);
6869}
6870
6871CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6872 Value *Addr, Value *Allocator,
6873 std::string Name) {
6874 IRBuilder<>::InsertPointGuard IPG(Builder);
6875 updateToLocation(Loc);
6876
6877 uint32_t SrcLocStrSize;
6878 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6879 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6880 Value *ThreadId = getOrCreateThreadID(Ident);
6881 Value *Args[] = {ThreadId, Addr, Allocator};
6882 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6883 return createRuntimeFunctionCall(Fn, Args, Name);
6884}
6885
6886CallInst *OpenMPIRBuilder::createOMPInteropInit(
6887 const LocationDescription &Loc, Value *InteropVar,
6888 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6889 Value *DependenceAddress, bool HaveNowaitClause) {
6890 IRBuilder<>::InsertPointGuard IPG(Builder);
6891 updateToLocation(Loc);
6892
6893 uint32_t SrcLocStrSize;
6894 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6895 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6896 Value *ThreadId = getOrCreateThreadID(Ident);
6897 if (Device == nullptr)
6899 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
6900 if (NumDependences == nullptr) {
6901 NumDependences = ConstantInt::get(Int32, 0);
6902 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6903 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6904 }
6905 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6906 Value *Args[] = {
6907 Ident, ThreadId, InteropVar, InteropTypeVal,
6908 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
6909
6910 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
6911
6912 return createRuntimeFunctionCall(Fn, Args);
6913}
6914
6915CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
6916 const LocationDescription &Loc, Value *InteropVar, Value *Device,
6917 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
6918 IRBuilder<>::InsertPointGuard IPG(Builder);
6919 updateToLocation(Loc);
6920
6921 uint32_t SrcLocStrSize;
6922 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6923 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6924 Value *ThreadId = getOrCreateThreadID(Ident);
6925 if (Device == nullptr)
6927 if (NumDependences == nullptr) {
6928 NumDependences = ConstantInt::get(Int32, 0);
6929 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6930 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6931 }
6932 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6933 Value *Args[] = {
6934 Ident, ThreadId, InteropVar, Device,
6935 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6936
6937 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
6938
6939 return createRuntimeFunctionCall(Fn, Args);
6940}
6941
6942CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
6943 Value *InteropVar, Value *Device,
6944 Value *NumDependences,
6945 Value *DependenceAddress,
6946 bool HaveNowaitClause) {
6947 IRBuilder<>::InsertPointGuard IPG(Builder);
6948 updateToLocation(Loc);
6949 uint32_t SrcLocStrSize;
6950 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6951 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6952 Value *ThreadId = getOrCreateThreadID(Ident);
6953 if (Device == nullptr)
6955 if (NumDependences == nullptr) {
6956 NumDependences = ConstantInt::get(Int32, 0);
6957 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
6958 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
6959 }
6960 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
6961 Value *Args[] = {
6962 Ident, ThreadId, InteropVar, Device,
6963 NumDependences, DependenceAddress, HaveNowaitClauseVal};
6964
6965 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
6966
6967 return createRuntimeFunctionCall(Fn, Args);
6968}
6969
6970CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
6971 const LocationDescription &Loc, llvm::Value *Pointer,
6972 llvm::ConstantInt *Size, const llvm::Twine &Name) {
6973 IRBuilder<>::InsertPointGuard IPG(Builder);
6974 updateToLocation(Loc);
6975
6976 uint32_t SrcLocStrSize;
6977 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6978 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6979 Value *ThreadId = getOrCreateThreadID(Ident);
6980 Constant *ThreadPrivateCache =
6981 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
6982 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
6983
6984 Function *Fn =
6985 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
6986
6987 return createRuntimeFunctionCall(Fn, Args);
6988}
6989
6990OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
6991 const LocationDescription &Loc,
6992 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
6993 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
6994 "expected num_threads and num_teams to be specified");
6995
6996 if (!updateToLocation(Loc))
6997 return Loc.IP;
6998
6999 uint32_t SrcLocStrSize;
7000 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7001 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7002 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
7003 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7004 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7005 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7006 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7007
7008 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7009 Function *Kernel = DebugKernelWrapper;
7010
7011 // We need to strip the debug prefix to get the correct kernel name.
7012 StringRef KernelName = Kernel->getName();
7013 const std::string DebugPrefix = "_debug__";
7014 if (KernelName.ends_with(DebugPrefix)) {
7015 KernelName = KernelName.drop_back(DebugPrefix.length());
7016 Kernel = M.getFunction(KernelName);
7017 assert(Kernel && "Expected the real kernel to exist");
7018 }
7019
7020 // Manifest the launch configuration in the metadata matching the kernel
7021 // environment.
7022 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7023 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7024
7025 // If MaxThreads not set, select the maximum between the default workgroup
7026 // size and the MinThreads value.
7027 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7028 if (MaxThreadsVal < 0)
7029 MaxThreadsVal = std::max(
7030 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7031
7032 if (MaxThreadsVal > 0)
7033 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7034
7035 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7036 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7037 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7038 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7039 Constant *ReductionDataSize =
7040 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7041 Constant *ReductionBufferLength =
7042 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7043
7044 Function *Fn = getOrCreateRuntimeFunctionPtr(
7045 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7046 const DataLayout &DL = Fn->getDataLayout();
7047
7048 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7049 Constant *DynamicEnvironmentInitializer =
7050 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7051 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7052 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7053 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7054 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7055 DL.getDefaultGlobalsAddressSpace());
7056 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7057
7058 Constant *DynamicEnvironment =
7059 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7060 ? DynamicEnvironmentGV
7061 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7062 DynamicEnvironmentPtr);
7063
7064 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7065 ConfigurationEnvironment, {
7066 UseGenericStateMachineVal,
7067 MayUseNestedParallelismVal,
7068 IsSPMDVal,
7069 MinThreads,
7070 MaxThreads,
7071 MinTeams,
7072 MaxTeams,
7073 ReductionDataSize,
7074 ReductionBufferLength,
7075 });
7076 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7077 KernelEnvironment, {
7078 ConfigurationEnvironmentInitializer,
7079 Ident,
7080 DynamicEnvironment,
7081 });
7082 std::string KernelEnvironmentName =
7083 (KernelName + "_kernel_environment").str();
7084 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7085 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7086 KernelEnvironmentInitializer, KernelEnvironmentName,
7087 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7088 DL.getDefaultGlobalsAddressSpace());
7089 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7090
7091 Constant *KernelEnvironment =
7092 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7093 ? KernelEnvironmentGV
7094 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7095 KernelEnvironmentPtr);
7096 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
7097 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7098 KernelLaunchEnvironment =
7099 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7100 ? KernelLaunchEnvironment
7101 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7102 KernelLaunchEnvParamTy);
7103 CallInst *ThreadKind = createRuntimeFunctionCall(
7104 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7105
7106 Value *ExecUserCode = Builder.CreateICmpEQ(
7107 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7108 "exec_user_code");
7109
7110 // ThreadKind = __kmpc_target_init(...)
7111 // if (ThreadKind == -1)
7112 // user_code
7113 // else
7114 // return;
7115
7116 auto *UI = Builder.CreateUnreachable();
7117 BasicBlock *CheckBB = UI->getParent();
7118 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7119
7120 BasicBlock *WorkerExitBB = BasicBlock::Create(
7121 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7122 Builder.SetInsertPoint(WorkerExitBB);
7123 Builder.CreateRetVoid();
7124
7125 auto *CheckBBTI = CheckBB->getTerminator();
7126 Builder.SetInsertPoint(CheckBBTI);
7127 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7128
7129 CheckBBTI->eraseFromParent();
7130 UI->eraseFromParent();
7131
7132 // Continue in the "user_code" block, see diagram above and in
7133 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7134 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7135}
7136
7137void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
7138 int32_t TeamsReductionDataSize,
7139 int32_t TeamsReductionBufferLength) {
7140 if (!updateToLocation(Loc))
7141 return;
7142
7143 Function *Fn = getOrCreateRuntimeFunctionPtr(
7144 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7145
7146 createRuntimeFunctionCall(Fn, {});
7147
7148 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7149 return;
7150
7151 Function *Kernel = Builder.GetInsertBlock()->getParent();
7152 // We need to strip the debug prefix to get the correct kernel name.
7153 StringRef KernelName = Kernel->getName();
7154 const std::string DebugPrefix = "_debug__";
7155 if (KernelName.ends_with(DebugPrefix))
7156 KernelName = KernelName.drop_back(DebugPrefix.length());
7157 auto *KernelEnvironmentGV =
7158 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7159 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7160 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
7161 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7162 KernelEnvironmentInitializer,
7163 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7164 NewInitializer = ConstantFoldInsertValueInstruction(
7165 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7166 {0, 8});
7167 KernelEnvironmentGV->setInitializer(NewInitializer);
7168}
7169
7170static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7171 bool Min) {
7172 if (Kernel.hasFnAttribute(Name)) {
7173 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7174 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7175 }
7176 Kernel.addFnAttr(Name, llvm::utostr(Value));
7177}
7178
7179std::pair<int32_t, int32_t>
7180OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
7181 int32_t ThreadLimit =
7182 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7183
7184 if (T.isAMDGPU()) {
7185 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7186 if (!Attr.isValid() || !Attr.isStringAttribute())
7187 return {0, ThreadLimit};
7188 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7189 int32_t LB, UB;
7190 if (!llvm::to_integer(UBStr, UB, 10))
7191 return {0, ThreadLimit};
7192 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7193 if (!llvm::to_integer(LBStr, LB, 10))
7194 return {0, UB};
7195 return {LB, UB};
7196 }
7197
7198 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7199 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7200 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7201 }
7202 return {0, ThreadLimit};
7203}
7204
7205void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
7206 Function &Kernel, int32_t LB,
7207 int32_t UB) {
7208 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7209
7210 if (T.isAMDGPU()) {
7211 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7212 llvm::utostr(LB) + "," + llvm::utostr(UB));
7213 return;
7214 }
7215
7216 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7217}
7218
/// Return the {lower, upper} team-count bounds recorded for \p Kernel. Only
/// the upper bound is currently tracked, via the "omp_target_num_teams"
/// function attribute; the lower bound is always reported as 0.
std::pair<int32_t, int32_t>
OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
  // TODO: Read from backend annotations if available.
  return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
}
7224
7225void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7226 int32_t LB, int32_t UB) {
7227 if (T.isNVPTX())
7228 if (UB > 0)
7229 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7230 if (T.isAMDGPU())
7231 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7232
7233 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7234}
7235
7236void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7237 Function *OutlinedFn) {
7238 if (Config.isTargetDevice()) {
7240 // TODO: Determine if DSO local can be set to true.
7241 OutlinedFn->setDSOLocal(false);
7243 if (T.isAMDGCN())
7245 else if (T.isNVPTX())
7247 else if (T.isSPIRV())
7249 }
7250}
7251
7252Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7253 StringRef EntryFnIDName) {
7254 if (Config.isTargetDevice()) {
7255 assert(OutlinedFn && "The outlined function must exist if embedded");
7256 return OutlinedFn;
7257 }
7258
7259 return new GlobalVariable(
7260 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7261 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7262}
7263
7264Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7265 StringRef EntryFnName) {
7266 if (OutlinedFn)
7267 return OutlinedFn;
7268
7269 assert(!M.getGlobalVariable(EntryFnName, true) &&
7270 "Named kernel already exists?");
7271 return new GlobalVariable(
7272 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7273 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7274}
7275
7276Error OpenMPIRBuilder::emitTargetRegionFunction(
7277 TargetRegionEntryInfo &EntryInfo,
7278 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7279 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7280
7281 SmallString<64> EntryFnName;
7282 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7283
7284 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7285 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7286 if (!CBResult)
7287 return CBResult.takeError();
7288 OutlinedFn = *CBResult;
7289 } else {
7290 OutlinedFn = nullptr;
7291 }
7292
7293 // If this target outline function is not an offload entry, we don't need to
7294 // register it. This may be in the case of a false if clause, or if there are
7295 // no OpenMP targets.
7296 if (!IsOffloadEntry)
7297 return Error::success();
7298
7299 std::string EntryFnIDName =
7300 Config.isTargetDevice()
7301 ? std::string(EntryFnName)
7302 : createPlatformSpecificName({EntryFnName, "region_id"});
7303
7304 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7305 EntryFnName, EntryFnIDName);
7306 return Error::success();
7307}
7308
7309Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7310 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7311 StringRef EntryFnName, StringRef EntryFnIDName) {
7312 if (OutlinedFn)
7313 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7314 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7315 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7316 OffloadInfoManager.registerTargetRegionEntryInfo(
7317 EntryInfo, EntryAddr, OutlinedFnID,
7318 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7319 return OutlinedFnID;
7320}
7321
7322OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7323 const LocationDescription &Loc, InsertPointTy AllocaIP,
7324 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7325 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7326 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7327 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7328 BodyGenTy BodyGenType)>
7329 BodyGenCB,
7330 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7331 if (!updateToLocation(Loc))
7332 return InsertPointTy();
7333
7334 Builder.restoreIP(CodeGenIP);
7335 // Disable TargetData CodeGen on Device pass.
7336 if (Config.IsTargetDevice.value_or(false)) {
7337 if (BodyGenCB) {
7338 InsertPointOrErrorTy AfterIP =
7339 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7340 if (!AfterIP)
7341 return AfterIP.takeError();
7342 Builder.restoreIP(*AfterIP);
7343 }
7344 return Builder.saveIP();
7345 }
7346
7347 bool IsStandAlone = !BodyGenCB;
7348 MapInfosTy *MapInfo;
7349 // Generate the code for the opening of the data environment. Capture all the
7350 // arguments of the runtime call by reference because they are used in the
7351 // closing of the region.
7352 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7353 InsertPointTy CodeGenIP) -> Error {
7354 MapInfo = &GenMapInfoCB(Builder.saveIP());
7355 if (Error Err = emitOffloadingArrays(
7356 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7357 /*IsNonContiguous=*/true, DeviceAddrCB))
7358 return Err;
7359
7360 TargetDataRTArgs RTArgs;
7361 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7362
7363 // Emit the number of elements in the offloading arrays.
7364 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7365
7366 // Source location for the ident struct
7367 if (!SrcLocInfo) {
7368 uint32_t SrcLocStrSize;
7369 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7370 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7371 }
7372
7373 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7374 SrcLocInfo, DeviceID,
7375 PointerNum, RTArgs.BasePointersArray,
7376 RTArgs.PointersArray, RTArgs.SizesArray,
7377 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7378 RTArgs.MappersArray};
7379
7380 if (IsStandAlone) {
7381 assert(MapperFunc && "MapperFunc missing for standalone target data");
7382
7383 auto TaskBodyCB = [&](Value *, Value *,
7385 if (Info.HasNoWait) {
7386 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7390 }
7391
7392 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7393 OffloadingArgs);
7394
7395 if (Info.HasNoWait) {
7396 BasicBlock *OffloadContBlock =
7397 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7398 Function *CurFn = Builder.GetInsertBlock()->getParent();
7399 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7400 Builder.restoreIP(Builder.saveIP());
7401 }
7402 return Error::success();
7403 };
7404
7405 bool RequiresOuterTargetTask = Info.HasNoWait;
7406 if (!RequiresOuterTargetTask)
7407 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7408 /*TargetTaskAllocaIP=*/{}));
7409 else
7410 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7411 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7412 } else {
7413 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7414 omp::OMPRTL___tgt_target_data_begin_mapper);
7415
7416 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
7417
7418 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7419 if (isa<AllocaInst>(DeviceMap.second.second)) {
7420 auto *LI =
7421 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7422 Builder.CreateStore(LI, DeviceMap.second.second);
7423 }
7424 }
7425
7426 // If device pointer privatization is required, emit the body of the
7427 // region here. It will have to be duplicated: with and without
7428 // privatization.
7429 InsertPointOrErrorTy AfterIP =
7430 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7431 if (!AfterIP)
7432 return AfterIP.takeError();
7433 Builder.restoreIP(*AfterIP);
7434 }
7435 return Error::success();
7436 };
7437
7438 // If we need device pointer privatization, we need to emit the body of the
7439 // region with no privatization in the 'else' branch of the conditional.
7440 // Otherwise, we don't have to do anything.
7441 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7442 InsertPointTy CodeGenIP) -> Error {
7443 InsertPointOrErrorTy AfterIP =
7444 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7445 if (!AfterIP)
7446 return AfterIP.takeError();
7447 Builder.restoreIP(*AfterIP);
7448 return Error::success();
7449 };
7450
7451 // Generate code for the closing of the data region.
7452 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7453 TargetDataRTArgs RTArgs;
7454 Info.EmitDebug = !MapInfo->Names.empty();
7455 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7456
7457 // Emit the number of elements in the offloading arrays.
7458 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7459
7460 // Source location for the ident struct
7461 if (!SrcLocInfo) {
7462 uint32_t SrcLocStrSize;
7463 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7464 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7465 }
7466
7467 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7468 PointerNum, RTArgs.BasePointersArray,
7469 RTArgs.PointersArray, RTArgs.SizesArray,
7470 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7471 RTArgs.MappersArray};
7472 Function *EndMapperFunc =
7473 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7474
7475 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
7476 return Error::success();
7477 };
7478
7479 // We don't have to do anything to close the region if the if clause evaluates
7480 // to false.
7481 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7482 return Error::success();
7483 };
7484
7485 Error Err = [&]() -> Error {
7486 if (BodyGenCB) {
7487 Error Err = [&]() {
7488 if (IfCond)
7489 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7490 return BeginThenGen(AllocaIP, Builder.saveIP());
7491 }();
7492
7493 if (Err)
7494 return Err;
7495
7496 // If we don't require privatization of device pointers, we emit the body
7497 // in between the runtime calls. This avoids duplicating the body code.
7498 InsertPointOrErrorTy AfterIP =
7499 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7500 if (!AfterIP)
7501 return AfterIP.takeError();
7502 restoreIPandDebugLoc(Builder, *AfterIP);
7503
7504 if (IfCond)
7505 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7506 return EndThenGen(AllocaIP, Builder.saveIP());
7507 }
7508 if (IfCond)
7509 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7510 return BeginThenGen(AllocaIP, Builder.saveIP());
7511 }();
7512
7513 if (Err)
7514 return Err;
7515
7516 return Builder.saveIP();
7517}
7518
7520OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7521 bool IsGPUDistribute) {
7522 assert((IVSize == 32 || IVSize == 64) &&
7523 "IV size is not compatible with the omp runtime");
7525 if (IsGPUDistribute)
7526 Name = IVSize == 32
7527 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7528 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7529 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7530 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7531 else
7532 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7533 : omp::OMPRTL___kmpc_for_static_init_4u)
7534 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7535 : omp::OMPRTL___kmpc_for_static_init_8u);
7536
7537 return getOrCreateRuntimeFunction(M, Name);
7538}
7539
7540FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7541 bool IVSigned) {
7542 assert((IVSize == 32 || IVSize == 64) &&
7543 "IV size is not compatible with the omp runtime");
7544 RuntimeFunction Name = IVSize == 32
7545 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7546 : omp::OMPRTL___kmpc_dispatch_init_4u)
7547 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7548 : omp::OMPRTL___kmpc_dispatch_init_8u);
7549
7550 return getOrCreateRuntimeFunction(M, Name);
7551}
7552
7553FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7554 bool IVSigned) {
7555 assert((IVSize == 32 || IVSize == 64) &&
7556 "IV size is not compatible with the omp runtime");
7557 RuntimeFunction Name = IVSize == 32
7558 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7559 : omp::OMPRTL___kmpc_dispatch_next_4u)
7560 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7561 : omp::OMPRTL___kmpc_dispatch_next_8u);
7562
7563 return getOrCreateRuntimeFunction(M, Name);
7564}
7565
7566FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7567 bool IVSigned) {
7568 assert((IVSize == 32 || IVSize == 64) &&
7569 "IV size is not compatible with the omp runtime");
7570 RuntimeFunction Name = IVSize == 32
7571 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7572 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7573 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7574 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7575
7576 return getOrCreateRuntimeFunction(M, Name);
7577}
7578
/// Return (creating on first use) the __kmpc_dispatch_deinit runtime
/// function. It has no induction-variable-size variants, so no parameters
/// are needed.
FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
  return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
}
7582
7584 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7585 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7586
7587 DISubprogram *NewSP = Func->getSubprogram();
7588 if (!NewSP)
7589 return;
7590
7592
7593 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7594 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7595 // Only use cached variable if the arg number matches. This is important
7596 // so that DIVariable created for privatized variables are not discarded.
7597 if (NewVar && (arg == NewVar->getArg()))
7598 return NewVar;
7599
7601 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7602 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7603 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7604 return NewVar;
7605 };
7606
7607 auto UpdateDebugRecord = [&](auto *DR) {
7608 DILocalVariable *OldVar = DR->getVariable();
7609 unsigned ArgNo = 0;
7610 for (auto Loc : DR->location_ops()) {
7611 auto Iter = ValueReplacementMap.find(Loc);
7612 if (Iter != ValueReplacementMap.end()) {
7613 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7614 ArgNo = std::get<1>(Iter->second) + 1;
7615 }
7616 }
7617 if (ArgNo != 0)
7618 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7619 };
7620
7621 // The location and scope of variable intrinsics and records still point to
7622 // the parent function of the target region. Update them.
7623 for (Instruction &I : instructions(Func)) {
7625 "Unexpected debug intrinsic");
7626 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7627 UpdateDebugRecord(&DVR);
7628 }
7629 // An extra argument is passed to the device. Create the debug data for it.
7630 if (OMPBuilder.Config.isTargetDevice()) {
7631 DICompileUnit *CU = NewSP->getUnit();
7632 Module *M = Func->getParent();
7633 DIBuilder DB(*M, true, CU);
7634 DIType *VoidPtrTy =
7635 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7636 DILocalVariable *Var = DB.createParameterVariable(
7637 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7638 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7639 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7640 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7641 &(*Func->begin()));
7642 }
7643}
7644
7646 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7647 return cast<Operator>(V)->getOperand(0);
7648 return V;
7649}
7650
7652 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7653 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7654 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7655 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7656 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7657 SmallVector<Type *> ParameterTypes;
7658 if (OMPBuilder.Config.isTargetDevice()) {
7659 // Add the "implicit" runtime argument we use to provide launch specific
7660 // information for target devices.
7661 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7662 ParameterTypes.push_back(Int8PtrTy);
7663
7664 // All parameters to target devices are passed as pointers
7665 // or i64. This assumes 64-bit address spaces/pointers.
7666 for (auto &Arg : Inputs)
7667 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7668 ? Arg->getType()
7669 : Type::getInt64Ty(Builder.getContext()));
7670 } else {
7671 for (auto &Arg : Inputs)
7672 ParameterTypes.push_back(Arg->getType());
7673 }
7674
7675 auto BB = Builder.GetInsertBlock();
7676 auto M = BB->getModule();
7677 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7678 /*isVarArg*/ false);
7679 auto Func =
7680 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7681
7682 // Forward target-cpu and target-features function attributes from the
7683 // original function to the new outlined function.
7684 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7685
7686 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7687 if (TargetCpuAttr.isStringAttribute())
7688 Func->addFnAttr(TargetCpuAttr);
7689
7690 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7691 if (TargetFeaturesAttr.isStringAttribute())
7692 Func->addFnAttr(TargetFeaturesAttr);
7693
7694 if (OMPBuilder.Config.isTargetDevice()) {
7695 Value *ExecMode =
7696 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7697 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7698 }
7699
7700 // Save insert point.
7701 IRBuilder<>::InsertPointGuard IPG(Builder);
7702 // We will generate the entries in the outlined function but the debug
7703 // location may still be pointing to the parent function. Reset it now.
7704 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7705
7706 // Generate the region into the function.
7707 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7708 Builder.SetInsertPoint(EntryBB);
7709
7710 // Insert target init call in the device compilation pass.
7711 if (OMPBuilder.Config.isTargetDevice())
7712 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7713
7714 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7715
7716 // As we embed the user code in the middle of our target region after we
7717 // generate entry code, we must move what allocas we can into the entry
7718 // block to avoid possible breaking optimisations for device
7719 if (OMPBuilder.Config.isTargetDevice())
7720 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7721
7722 // Insert target deinit call in the device compilation pass.
7723 BasicBlock *OutlinedBodyBB =
7724 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7725 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7726 Builder.saveIP(),
7727 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7728 if (!AfterIP)
7729 return AfterIP.takeError();
7730 Builder.restoreIP(*AfterIP);
7731 if (OMPBuilder.Config.isTargetDevice())
7732 OMPBuilder.createTargetDeinit(Builder);
7733
7734 // Insert return instruction.
7735 Builder.CreateRetVoid();
7736
7737 // New Alloca IP at entry point of created device function.
7738 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7739 auto AllocaIP = Builder.saveIP();
7740
7741 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7742
7743 // Skip the artificial dyn_ptr on the device.
7744 const auto &ArgRange =
7745 OMPBuilder.Config.isTargetDevice()
7746 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7747 : Func->args();
7748
7750
7751 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7752 // Things like GEP's can come in the form of Constants. Constants and
7753 // ConstantExpr's do not have access to the knowledge of what they're
7754 // contained in, so we must dig a little to find an instruction so we
7755 // can tell if they're used inside of the function we're outlining. We
7756 // also replace the original constant expression with a new instruction
7757 // equivalent; an instruction as it allows easy modification in the
7758 // following loop, as we can now know the constant (instruction) is
7759 // owned by our target function and replaceUsesOfWith can now be invoked
7760 // on it (cannot do this with constants it seems). A brand new one also
7761 // allows us to be cautious as it is perhaps possible the old expression
7762 // was used inside of the function but exists and is used externally
7763 // (unlikely by the nature of a Constant, but still).
7764 // NOTE: We cannot remove dead constants that have been rewritten to
7765 // instructions at this stage, we run the risk of breaking later lowering
7766 // by doing so as we could still be in the process of lowering the module
7767 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7768 // constants we have created rewritten versions of.
7769 if (auto *Const = dyn_cast<Constant>(Input))
7770 convertUsersOfConstantsToInstructions(Const, Func, false);
7771
7772 // Collect users before iterating over them to avoid invalidating the
7773 // iteration in case a user uses Input more than once (e.g. a call
7774 // instruction).
7775 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7776 // Collect all the instructions
7778 if (auto *Instr = dyn_cast<Instruction>(User))
7779 if (Instr->getFunction() == Func)
7780 Instr->replaceUsesOfWith(Input, InputCopy);
7781 };
7782
7783 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7784
7785 // Rewrite uses of input valus to parameters.
7786 for (auto InArg : zip(Inputs, ArgRange)) {
7787 Value *Input = std::get<0>(InArg);
7788 Argument &Arg = std::get<1>(InArg);
7789 Value *InputCopy = nullptr;
7790
7791 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7792 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7793 if (!AfterIP)
7794 return AfterIP.takeError();
7795 Builder.restoreIP(*AfterIP);
7796 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7797
7798 // In certain cases a Global may be set up for replacement, however, this
7799 // Global may be used in multiple arguments to the kernel, just segmented
7800 // apart, for example, if we have a global array, that is sectioned into
7801 // multiple mappings (technically not legal in OpenMP, but there is a case
7802 // in Fortran for Common Blocks where this is neccesary), we will end up
7803 // with GEP's into this array inside the kernel, that refer to the Global
7804 // but are technically seperate arguments to the kernel for all intents and
7805 // purposes. If we have mapped a segment that requires a GEP into the 0-th
7806 // index, it will fold into an referal to the Global, if we then encounter
7807 // this folded GEP during replacement all of the references to the
7808 // Global in the kernel will be replaced with the argument we have generated
7809 // that corresponds to it, including any other GEP's that refer to the
7810 // Global that may be other arguments. This will invalidate all of the other
7811 // preceding mapped arguments that refer to the same global that may be
7812 // seperate segments. To prevent this, we defer global processing until all
7813 // other processing has been performed.
7816 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7817 continue;
7818 }
7819
7821 continue;
7822
7823 ReplaceValue(Input, InputCopy, Func);
7824 }
7825
7826 // Replace all of our deferred Input values, currently just Globals.
7827 for (auto Deferred : DeferredReplacement)
7828 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7829
7830 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7831 ValueReplacementMap);
7832 return Func;
7833}
7834/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7835/// of pointers containing shared data between the parent task and the created
7836/// task.
7837static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7838 IRBuilderBase &Builder,
7839 Value *TaskWithPrivates,
7840 Type *TaskWithPrivatesTy) {
7841
7842 Type *TaskTy = OMPIRBuilder.Task;
7843 LLVMContext &Ctx = Builder.getContext();
7844 Value *TaskT =
7845 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7846 Value *Shareds = TaskT;
7847 // TaskWithPrivatesTy can be one of the following
7848 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7849 // %struct.privates }
7850 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7851 //
7852 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7853 // its first member has to be the task descriptor. TaskTy is the type of the
7854 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7855 // first member of TaskT, gives us the pointer to shared data.
7856 if (TaskWithPrivatesTy != TaskTy)
7857 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7858 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7859}
7860/// Create an entry point for a target task with the following.
7861/// It'll have the following signature
7862/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7863/// This function is called from emitTargetTask once the
7864/// code to launch the target kernel has been outlined already.
7865/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7866/// into the task structure so that the deferred target task can access this
7867/// data even after the stack frame of the generating task has been rolled
7868/// back. Offloading arrays contain base pointers, pointers, sizes etc
7869/// of the data that the target kernel will access. These in effect are the
7870/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7872 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7873 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7874 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7875
7876 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7877 // This is because PrivatesTy is the type of the structure in which
7878 // we pass the offloading arrays to the deferred target task.
7879 assert((!NumOffloadingArrays || PrivatesTy) &&
7880 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7881 "to privatize");
7882
7883 Module &M = OMPBuilder.M;
7884 // KernelLaunchFunction is the target launch function, i.e.
7885 // the function that sets up kernel arguments and calls
7886 // __tgt_target_kernel to launch the kernel on the device.
7887 //
7888 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7889
7890 // StaleCI is the CallInst which is the call to the outlined
7891 // target kernel launch function. If there are local live-in values
7892 // that the outlined function uses then these are aggregated into a structure
7893 // which is passed as the second argument. If there are no local live-in
7894 // values or if all values used by the outlined kernel are global variables,
7895 // then there's only one argument, the threadID. So, StaleCI can be
7896 //
7897 // %structArg = alloca { ptr, ptr }, align 8
7898 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
7899 // store ptr %20, ptr %gep_, align 8
7900 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
7901 // store ptr %21, ptr %gep_8, align 8
7902 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
7903 //
7904 // OR
7905 //
7906 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
7907 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
7908 StaleCI->getIterator());
7909
7910 LLVMContext &Ctx = StaleCI->getParent()->getContext();
7911
7912 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
7913 Type *TaskPtrTy = OMPBuilder.TaskPtr;
7914 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
7915
7916 auto ProxyFnTy =
7917 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
7918 /* isVarArg */ false);
7919 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
7920 ".omp_target_task_proxy_func",
7921 Builder.GetInsertBlock()->getModule());
7922 Value *ThreadId = ProxyFn->getArg(0);
7923 Value *TaskWithPrivates = ProxyFn->getArg(1);
7924 ThreadId->setName("thread.id");
7925 TaskWithPrivates->setName("task");
7926
7927 bool HasShareds = SharedArgsOperandNo > 0;
7928 bool HasOffloadingArrays = NumOffloadingArrays > 0;
7929 BasicBlock *EntryBB =
7930 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
7931 Builder.SetInsertPoint(EntryBB);
7932
7933 SmallVector<Value *> KernelLaunchArgs;
7934 KernelLaunchArgs.reserve(StaleCI->arg_size());
7935 KernelLaunchArgs.push_back(ThreadId);
7936
7937 if (HasOffloadingArrays) {
7938 assert(TaskTy != TaskWithPrivatesTy &&
7939 "If there are offloading arrays to pass to the target"
7940 "TaskTy cannot be the same as TaskWithPrivatesTy");
7941 (void)TaskTy;
7942 Value *Privates =
7943 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
7944 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
7945 KernelLaunchArgs.push_back(
7946 Builder.CreateStructGEP(PrivatesTy, Privates, i));
7947 }
7948
7949 if (HasShareds) {
7950 auto *ArgStructAlloca =
7951 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
7952 assert(ArgStructAlloca &&
7953 "Unable to find the alloca instruction corresponding to arguments "
7954 "for extracted function");
7955 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
7956
7957 AllocaInst *NewArgStructAlloca =
7958 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
7959
7960 Value *SharedsSize =
7961 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
7962
7964 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
7965
7966 Builder.CreateMemCpy(
7967 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
7968 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
7969 KernelLaunchArgs.push_back(NewArgStructAlloca);
7970 }
7971 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
7972 Builder.CreateRetVoid();
7973 return ProxyFn;
7974}
7976
// NOTE(review): extraction dropped the preceding line (orig 7975), which held
// this helper's signature — presumably
// `static Type *getOffloadingArrayType(Value *V) {`. Only the body is
// visible here.
//
// From the body: returns the array type referenced by `V` — the source
// element type when `V` is a GEP, or the allocated type when `V` is an
// alloca. Any other instruction kind is a programming error.
7977 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
7978 return GEP->getSourceElementType();
7979 if (auto *Alloca = dyn_cast<AllocaInst>(V))
7980 return Alloca->getAllocatedType();
7981
// The trailing `return nullptr` is unreachable; it only placates compilers
// that cannot see through llvm_unreachable.
7982 llvm_unreachable("Unhandled Instruction type");
7983 return nullptr;
7984}
7985// This function returns a struct that has at most two members.
7986// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
7987// descriptor. The second member, if needed, is a struct containing arrays
7988// that need to be passed to the offloaded target kernel. For example,
7989// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
7990// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
7991// respectively, then the types created by this function are
7992//
7993// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
7994// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7995// %struct.privates }
7996// %struct.task_with_privates is returned by this function.
7997// If there aren't any offloading arrays to pass to the target kernel,
7998// %struct.kmp_task_ompbuilder_t is returned.
7999static StructType *
8000createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
8001 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8002
8003 if (OffloadingArraysToPrivatize.empty())
8004 return OMPIRBuilder.Task;
8005
8006 SmallVector<Type *, 4> StructFieldTypes;
8007 for (Value *V : OffloadingArraysToPrivatize) {
8008 assert(V->getType()->isPointerTy() &&
8009 "Expected pointer to array to privatize. Got a non-pointer value "
8010 "instead");
8011 Type *ArrayTy = getOffloadingArrayType(V);
8012 assert(ArrayTy && "ArrayType cannot be nullptr");
8013 StructFieldTypes.push_back(ArrayTy);
8014 }
8015 StructType *PrivatesStructTy =
8016 StructType::create(StructFieldTypes, "struct.privates");
8017 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8018 "struct.task_with_privates");
8019}
// NOTE(review): extraction dropped line 8020 (the function's return type and
// name — presumably `static Error emitTargetOutlinedFunction(`) and line 8025
// (presumably the `SmallVectorImpl<Value *> &Inputs,` parameter). The rest of
// the signature and the body are reproduced unchanged below.
//
// From the body: wraps createOutlinedFunction in a FunctionGenCallback and
// hands it to emitTargetRegionFunction, which populates OutlinedFn and, when
// IsOffloadEntry, OutlinedFnID.
8021 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8022 TargetRegionEntryInfo &EntryInfo,
8023 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8024 Function *&OutlinedFn, Constant *&OutlinedFnID,
8026 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8027 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8028
// Deferred generator: emitTargetRegionFunction decides whether to invoke it
// and under which entry-point name.
8029 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8030 [&](StringRef EntryFnName) {
8031 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8032 EntryFnName, Inputs, CBFunc,
8033 ArgAccessorFuncCB);
8034 };
8035
8036 return OMPBuilder.emitTargetRegionFunction(
8037 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8038 OutlinedFnID);
8039}
8040
8041OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
8042 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8043 OpenMPIRBuilder::InsertPointTy AllocaIP,
8045 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8046
8047 // The following explains the code-gen scenario for the `target` directive. A
8048 // similar scneario is followed for other device-related directives (e.g.
8049 // `target enter data`) but in similar fashion since we only need to emit task
8050 // that encapsulates the proper runtime call.
8051 //
8052 // When we arrive at this function, the target region itself has been
8053 // outlined into the function OutlinedFn.
8054 // So at ths point, for
8055 // --------------------------------------------------------------
8056 // void user_code_that_offloads(...) {
8057 // omp target depend(..) map(from:a) map(to:b) private(i)
8058 // do i = 1, 10
8059 // a(i) = b(i) + n
8060 // }
8061 //
8062 // --------------------------------------------------------------
8063 //
8064 // we have
8065 //
8066 // --------------------------------------------------------------
8067 //
8068 // void user_code_that_offloads(...) {
8069 // %.offload_baseptrs = alloca [2 x ptr], align 8
8070 // %.offload_ptrs = alloca [2 x ptr], align 8
8071 // %.offload_mappers = alloca [2 x ptr], align 8
8072 // ;; target region has been outlined and now we need to
8073 // ;; offload to it via a target task.
8074 // }
8075 // void outlined_device_function(ptr a, ptr b, ptr n) {
8076 // n = *n_ptr;
8077 // do i = 1, 10
8078 // a(i) = b(i) + n
8079 // }
8080 //
8081 // We have to now do the following
8082 // (i) Make an offloading call to outlined_device_function using the OpenMP
8083 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8084 // emitted by emitKernelLaunch
8085 // (ii) Create a task entry point function that calls kernel_launch_function
8086 // and is the entry point for the target task. See
8087 // '@.omp_target_task_proxy_func in the pseudocode below.
8088 // (iii) Create a task with the task entry point created in (ii)
8089 //
8090 // That is we create the following
8091 // struct task_with_privates {
8092 // struct kmp_task_ompbuilder_t task_struct;
8093 // struct privates {
8094 // [2 x ptr] ; baseptrs
8095 // [2 x ptr] ; ptrs
8096 // [2 x i64] ; sizes
8097 // }
8098 // }
8099 // void user_code_that_offloads(...) {
8100 // %.offload_baseptrs = alloca [2 x ptr], align 8
8101 // %.offload_ptrs = alloca [2 x ptr], align 8
8102 // %.offload_sizes = alloca [2 x i64], align 8
8103 //
8104 // %structArg = alloca { ptr, ptr, ptr }, align 8
8105 // %strucArg[0] = a
8106 // %strucArg[1] = b
8107 // %strucArg[2] = &n
8108 //
8109 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8110 // sizeof(kmp_task_ompbuilder_t),
8111 // sizeof(structArg),
8112 // @.omp_target_task_proxy_func,
8113 // ...)
8114 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8115 // sizeof(structArg))
8116 // memcpy(target_task_with_privates->privates->baseptrs,
8117 // offload_baseptrs, sizeof(offload_baseptrs)
8118 // memcpy(target_task_with_privates->privates->ptrs,
8119 // offload_ptrs, sizeof(offload_ptrs)
8120 // memcpy(target_task_with_privates->privates->sizes,
8121 // offload_sizes, sizeof(offload_sizes)
8122 // dependencies_array = ...
8123 // ;; if nowait not present
8124 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8125 // call @__kmpc_omp_task_begin_if0(...)
8126 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8127 // %target_task_with_privates)
8128 // call @__kmpc_omp_task_complete_if0(...)
8129 // }
8130 //
8131 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8132 // ptr %task) {
8133 // %structArg = alloca {ptr, ptr, ptr}
8134 // %task_ptr = getelementptr(%task, 0, 0)
8135 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8136 // mempcy(%structArg, %shared_data, sizeof(%structArg))
8137 //
8138 // %offloading_arrays = getelementptr(%task, 0, 1)
8139 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8140 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8141 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8142 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8143 // %offload_sizes, %structArg)
8144 // }
8145 //
8146 // We need the proxy function because the signature of the task entry point
8147 // expected by kmpc_omp_task is always the same and will be different from
8148 // that of the kernel_launch function.
8149 //
8150 // kernel_launch_function is generated by emitKernelLaunch and has the
8151 // always_inline attribute. For this example, it'll look like so:
8152 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8153 // %offload_sizes, %structArg) alwaysinline {
8154 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8155 // ; load aggregated data from %structArg
8156 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8157 // ; offload_sizes
8158 // call i32 @__tgt_target_kernel(...,
8159 // outlined_device_function,
8160 // ptr %kernel_args)
8161 // }
8162 // void outlined_device_function(ptr a, ptr b, ptr n) {
8163 // n = *n_ptr;
8164 // do i = 1, 10
8165 // a(i) = b(i) + n
8166 // }
8167 //
8168 BasicBlock *TargetTaskBodyBB =
8169 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8170 BasicBlock *TargetTaskAllocaBB =
8171 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8172
8173 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8174 TargetTaskAllocaBB->begin());
8175 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8176
8177 OutlineInfo OI;
8178 OI.EntryBB = TargetTaskAllocaBB;
8179 OI.OuterAllocaBB = AllocaIP.getBlock();
8180
8181 // Add the thread ID argument.
8183 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8184 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8185
8186 // Generate the task body which will subsequently be outlined.
8187 Builder.restoreIP(TargetTaskBodyIP);
8188 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8189 return Err;
8190
8191 // The outliner (CodeExtractor) extract a sequence or vector of blocks that
8192 // it is given. These blocks are enumerated by
8193 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8194 // to be outside the region. In other words, OI.ExitBlock is expected to be
8195 // the start of the region after the outlining. We used to set OI.ExitBlock
8196 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8197 // except when the task body is a single basic block. In that case,
8198 // OI.ExitBlock is set to the single task body block and will get left out of
8199 // the outlining process. So, simply create a new empty block to which we
8200 // uncoditionally branch from where TaskBodyCB left off
8201 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8202 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8203 /*IsFinished=*/true);
8204
8205 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8206 bool NeedsTargetTask = HasNoWait && DeviceID;
8207 if (NeedsTargetTask) {
8208 for (auto *V :
8209 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8210 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8211 RTArgs.SizesArray}) {
8213 OffloadingArraysToPrivatize.push_back(V);
8214 OI.ExcludeArgsFromAggregate.push_back(V);
8215 }
8216 }
8217 }
8218 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8219 DeviceID, OffloadingArraysToPrivatize](
8220 Function &OutlinedFn) mutable {
8221 assert(OutlinedFn.hasOneUse() &&
8222 "there must be a single user for the outlined function");
8223
8224 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8225
8226 // The first argument of StaleCI is always the thread id.
8227 // The next few arguments are the pointers to offloading arrays
8228 // if any. (see OffloadingArraysToPrivatize)
8229 // Finally, all other local values that are live-in into the outlined region
8230 // end up in a structure whose pointer is passed as the last argument. This
8231 // piece of data is passed in the "shared" field of the task structure. So,
8232 // we know we have to pass shareds to the task if the number of arguments is
8233 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
8234 // thread id. Further, for safety, we assert that the number of arguments of
8235 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8236 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8237 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8238 assert((!HasShareds ||
8239 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8240 "Wrong number of arguments for StaleCI when shareds are present");
8241 int SharedArgOperandNo =
8242 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8243
8244 StructType *TaskWithPrivatesTy =
8245 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8246 StructType *PrivatesTy = nullptr;
8247
8248 if (!OffloadingArraysToPrivatize.empty())
8249 PrivatesTy =
8250 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8251
8253 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8254 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8255
8256 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8257 << "\n");
8258
8259 Builder.SetInsertPoint(StaleCI);
8260
8261 // Gather the arguments for emitting the runtime call.
8262 uint32_t SrcLocStrSize;
8263 Constant *SrcLocStr =
8264 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8265 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8266
8267 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8268 //
8269 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8270 // the DeviceID to the deferred task and also since
8271 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8272 Function *TaskAllocFn =
8273 !NeedsTargetTask
8274 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8275 : getOrCreateRuntimeFunctionPtr(
8276 OMPRTL___kmpc_omp_target_task_alloc);
8277
8278 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
8279 // call.
8280 Value *ThreadID = getOrCreateThreadID(Ident);
8281
8282 // Argument - `sizeof_kmp_task_t` (TaskSize)
8283 // Tasksize refers to the size in bytes of kmp_task_t data structure
8284 // plus any other data to be passed to the target task, if any, which
8285 // is packed into a struct. kmp_task_t and the struct so created are
8286 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8287 Value *TaskSize = Builder.getInt64(
8288 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8289
8290 // Argument - `sizeof_shareds` (SharedsSize)
8291 // SharedsSize refers to the shareds array size in the kmp_task_t data
8292 // structure.
8293 Value *SharedsSize = Builder.getInt64(0);
8294 if (HasShareds) {
8295 auto *ArgStructAlloca =
8296 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8297 assert(ArgStructAlloca &&
8298 "Unable to find the alloca instruction corresponding to arguments "
8299 "for extracted function");
8300 auto *ArgStructType =
8301 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8302 assert(ArgStructType && "Unable to find struct type corresponding to "
8303 "arguments for extracted function");
8304 SharedsSize =
8305 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8306 }
8307
8308 // Argument - `flags`
8309 // Task is tied iff (Flags & 1) == 1.
8310 // Task is untied iff (Flags & 1) == 0.
8311 // Task is final iff (Flags & 2) == 2.
8312 // Task is not final iff (Flags & 2) == 0.
8313 // A target task is not final and is untied.
8314 Value *Flags = Builder.getInt32(0);
8315
8316 // Emit the @__kmpc_omp_task_alloc runtime call
8317 // The runtime call returns a pointer to an area where the task captured
8318 // variables must be copied before the task is run (TaskData)
8319 CallInst *TaskData = nullptr;
8320
8321 SmallVector<llvm::Value *> TaskAllocArgs = {
8322 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8323 /*flags=*/Flags,
8324 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8325 /*task_func=*/ProxyFn};
8326
8327 if (NeedsTargetTask) {
8328 assert(DeviceID && "Expected non-empty device ID.");
8329 TaskAllocArgs.push_back(DeviceID);
8330 }
8331
8332 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
8333
8334 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8335 if (HasShareds) {
8336 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8338 *this, Builder, TaskData, TaskWithPrivatesTy);
8339 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8340 SharedsSize);
8341 }
8342 if (!OffloadingArraysToPrivatize.empty()) {
8343 Value *Privates =
8344 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8345 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8346 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8347 [[maybe_unused]] Type *ArrayType =
8348 getOffloadingArrayType(PtrToPrivatize);
8349 assert(ArrayType && "ArrayType cannot be nullptr");
8350
8351 Type *ElementType = PrivatesTy->getElementType(i);
8352 assert(ElementType == ArrayType &&
8353 "ElementType should match ArrayType");
8354 (void)ArrayType;
8355
8356 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8357 Builder.CreateMemCpy(
8358 Dst, Alignment, PtrToPrivatize, Alignment,
8359 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8360 }
8361 }
8362
8363 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8364
8365 // ---------------------------------------------------------------
8366 // V5.2 13.8 target construct
8367 // If the nowait clause is present, execution of the target task
8368 // may be deferred. If the nowait clause is not present, the target task is
8369 // an included task.
8370 // ---------------------------------------------------------------
8371 // The above means that the lack of a nowait on the target construct
8372 // translates to '#pragma omp task if(0)'
8373 if (!NeedsTargetTask) {
8374 if (DepArray) {
8375 Function *TaskWaitFn =
8376 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8377 createRuntimeFunctionCall(
8378 TaskWaitFn,
8379 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8380 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8381 /*dep_list=*/DepArray,
8382 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8383 /*noalias_dep_list=*/
8385 }
8386 // Included task.
8387 Function *TaskBeginFn =
8388 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8389 Function *TaskCompleteFn =
8390 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8391 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8392 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
8393 CI->setDebugLoc(StaleCI->getDebugLoc());
8394 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8395 } else if (DepArray) {
8396 // HasNoWait - meaning the task may be deferred. Call
8397 // __kmpc_omp_task_with_deps if there are dependencies,
8398 // else call __kmpc_omp_task
8399 Function *TaskFn =
8400 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8401 createRuntimeFunctionCall(
8402 TaskFn,
8403 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8404 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8406 } else {
8407 // Emit the @__kmpc_omp_task runtime call to spawn the task
8408 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8409 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
8410 }
8411
8412 StaleCI->eraseFromParent();
8413 for (Instruction *I : llvm::reverse(ToBeDeleted))
8414 I->eraseFromParent();
8415 };
8416 addOutlineInfo(std::move(OI));
8417
8418 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8419 << *(Builder.GetInsertBlock()) << "\n");
8420 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8421 << *(Builder.GetInsertBlock()->getParent()->getParent())
8422 << "\n");
8423 return Builder.saveIP();
8424}
8425
8426Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8427 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8428 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8429 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8430 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8431 if (Error Err =
8432 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8433 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8434 return Err;
8435 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8436 return Error::success();
8437}
8438
8439static void emitTargetCall(
8440 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8441 OpenMPIRBuilder::InsertPointTy AllocaIP,
8442 OpenMPIRBuilder::TargetDataInfo &Info,
8443 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8444 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8445 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8447 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8448 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8450 bool HasNoWait, Value *DynCGroupMem,
8451 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8452 // Generate a function call to the host fallback implementation of the target
8453 // region. This is called by the host when no offload entry was generated for
8454 // the target region and when the offloading call fails at runtime.
8455 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8456 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8457 Builder.restoreIP(IP);
8458 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
8459 return Builder.saveIP();
8460 };
8461
8462 bool HasDependencies = Dependencies.size() > 0;
8463 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8464
8465 OpenMPIRBuilder::TargetKernelArgs KArgs;
8466
8467 auto TaskBodyCB =
8468 [&](Value *DeviceID, Value *RTLoc,
8469 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8470 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8471 // produce any.
8472 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8473 // emitKernelLaunch makes the necessary runtime call to offload the
8474 // kernel. We then outline all that code into a separate function
8475 // ('kernel_launch_function' in the pseudo code above). This function is
8476 // then called by the target task proxy function (see
8477 // '@.omp_target_task_proxy_func' in the pseudo code above)
8478 // "@.omp_target_task_proxy_func' is generated by
8479 // emitTargetTaskProxyFunction.
8480 if (OutlinedFnID && DeviceID)
8481 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8482 EmitTargetCallFallbackCB, KArgs,
8483 DeviceID, RTLoc, TargetTaskAllocaIP);
8484
8485 // We only need to do the outlining if `DeviceID` is set to avoid calling
8486 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8487 // generating the `else` branch of an `if` clause.
8488 //
8489 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8490 // In this case, we execute the host implementation directly.
8491 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8492 }());
8493
8494 OMPBuilder.Builder.restoreIP(AfterIP);
8495 return Error::success();
8496 };
8497
8498 auto &&EmitTargetCallElse =
8499 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8500 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8501 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8502 // produce any.
8503 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8504 if (RequiresOuterTargetTask) {
8505 // Arguments that are intended to be directly forwarded to an
8506 // emitKernelLaunch call are pased as nullptr, since
8507 // OutlinedFnID=nullptr results in that call not being done.
8508 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8509 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8510 /*RTLoc=*/nullptr, AllocaIP,
8511 Dependencies, EmptyRTArgs, HasNoWait);
8512 }
8513 return EmitTargetCallFallbackCB(Builder.saveIP());
8514 }());
8515
8516 Builder.restoreIP(AfterIP);
8517 return Error::success();
8518 };
8519
8520 auto &&EmitTargetCallThen =
8521 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8522 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8523 Info.HasNoWait = HasNoWait;
8524 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8525 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8526 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8527 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8528 /*IsNonContiguous=*/true,
8529 /*ForEndCall=*/false))
8530 return Err;
8531
8532 SmallVector<Value *, 3> NumTeamsC;
8533 for (auto [DefaultVal, RuntimeVal] :
8534 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8535 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8536 : Builder.getInt32(DefaultVal));
8537
8538 // Calculate number of threads: 0 if no clauses specified, otherwise it is
8539 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
8540 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8541 if (Clause)
8542 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8543 /*isSigned=*/false);
8544 return Clause;
8545 };
8546 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8547 if (Clause)
8548 Result =
8549 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8550 Result, Clause)
8551 : Clause;
8552 };
8553
8554 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8555 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
8556 SmallVector<Value *, 3> NumThreadsC;
8557 Value *MaxThreadsClause =
8558 RuntimeAttrs.TeamsThreadLimit.size() == 1
8559 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8560 : nullptr;
8561
8562 for (auto [TeamsVal, TargetVal] : zip_equal(
8563 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8564 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8565 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8566
8567 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8568 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8569
8570 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8571 }
8572
8573 unsigned NumTargetItems = Info.NumberOfPtrs;
8574 // TODO: Use correct device ID
8575 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8576 uint32_t SrcLocStrSize;
8577 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8578 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8579 llvm::omp::IdentFlag(0), 0);
8580
8581 Value *TripCount = RuntimeAttrs.LoopTripCount
8582 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8583 Builder.getInt64Ty(),
8584 /*isSigned=*/false)
8585 : Builder.getInt64(0);
8586
8587 // Request zero groupprivate bytes by default.
8588 if (!DynCGroupMem)
8589 DynCGroupMem = Builder.getInt32(0);
8590
8591 KArgs = OpenMPIRBuilder::TargetKernelArgs(
8592 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
8593 HasNoWait, DynCGroupMemFallback);
8594
8595 // Assume no error was returned because TaskBodyCB and
8596 // EmitTargetCallFallbackCB don't produce any.
8597 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8598 // The presence of certain clauses on the target directive require the
8599 // explicit generation of the target task.
8600 if (RequiresOuterTargetTask)
8601 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8602 Dependencies, KArgs.RTArgs,
8603 Info.HasNoWait);
8604
8605 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8606 EmitTargetCallFallbackCB, KArgs,
8607 DeviceID, RTLoc, AllocaIP);
8608 }());
8609
8610 Builder.restoreIP(AfterIP);
8611 return Error::success();
8612 };
8613
8614 // If we don't have an ID for the target region, it means an offload entry
8615 // wasn't created. In this case we just run the host fallback directly and
8616 // ignore any potential 'if' clauses.
8617 if (!OutlinedFnID) {
8618 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8619 return;
8620 }
8621
8622 // If there's no 'if' clause, only generate the kernel launch code path.
8623 if (!IfCond) {
8624 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8625 return;
8626 }
8627
8628 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8629 EmitTargetCallElse, AllocaIP));
8630}
8631
/// Generate a target region: outline the region body into its own function
/// and, when compiling for the host, emit the launch/fallback code for it.
///
/// NOTE(review): extraction dropped line 8655 — presumably
/// `if (Error Err = emitTargetOutlinedFunction(`, the call whose trailing
/// argument lines (orig 8656-8657) are still visible below.
8632OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8633 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8634 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8635 TargetRegionEntryInfo &EntryInfo,
8636 const TargetKernelDefaultAttrs &DefaultAttrs,
8637 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8638 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8639 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8640 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8641 CustomMapperCallbackTy CustomMapperCB,
8642 const SmallVector<DependData> &Dependencies, bool HasNowait,
8643 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8644
// Invalid location: return a default-constructed insert point.
8645 if (!updateToLocation(Loc))
8646 return InsertPointTy();
8647
8648 Builder.restoreIP(CodeGenIP);
8649
8650 Function *OutlinedFn;
8651 Constant *OutlinedFnID = nullptr;
8652 // The target region is outlined into its own function. The LLVM IR for
8653 // the target region itself is generated using the callbacks CBFunc
8654 // and ArgAccessorFuncCB
8656 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8657 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8658 return Err;
8659
8660 // If we are not on the target device, then we need to generate code
8661 // to make a remote call (offload) to the previously outlined function
8662 // that represents the target region. Do that now.
8663 if (!Config.isTargetDevice())
8664 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8665 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8666 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
8667 DynCGroupMemFallback);
8668 return Builder.saveIP();
8669}
8670
8671std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8672 StringRef FirstSeparator,
8673 StringRef Separator) {
8674 SmallString<128> Buffer;
8675 llvm::raw_svector_ostream OS(Buffer);
8676 StringRef Sep = FirstSeparator;
8677 for (StringRef Part : Parts) {
8678 OS << Sep << Part;
8679 Sep = Separator;
8680 }
8681 return OS.str().str();
8682}
8683
8684std::string
8685OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8686 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8687 Config.separator());
8688}
8689
/// Return (creating and caching on first use) the module-level internal
/// global with the given type and name. Cached entries live in InternalVars;
/// an assertion enforces that repeat requests use the same type.
///
/// NOTE(review): extraction dropped lines 8712-8713 — the two arms of the
/// `Linkage` conditional begun on line 8711 (selection appears to depend on
/// the wasm32 target); confirm against the upstream sources.
8690GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
8691 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
8692 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8693 if (Elem.second) {
// Cache hit: all callers must agree on the variable's type.
8694 assert(Elem.second->getValueType() == Ty &&
8695 "OMP internal variable has different type than requested");
8696 } else {
8697 // TODO: investigate the appropriate linkage type used for the global
8698 // variable for possibly changing that to internal or private, or maybe
8699 // create different versions of the function for different OMP internal
8700 // variables.
8701 const DataLayout &DL = M.getDataLayout();
8702 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
8703 // default global AS is 1.
8704 // See double-target-call-with-declare-target.f90 and
8705 // declare-target-vars-in-target-region.f90 libomptarget
8706 // tests.
8707 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
8708 : M.getTargetTriple().isAMDGPU()
8709 ? 0
8710 : DL.getDefaultGlobalsAddressSpace();
8711 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8714 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8715 Constant::getNullValue(Ty), Elem.first(),
8716 /*InsertBefore=*/nullptr,
8717 GlobalValue::NotThreadLocal, AddressSpaceVal);
// Align to at least the ABI pointer alignment so pointer-sized runtime
// accesses to the variable stay well-aligned.
8718 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8719 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
8720 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8721 Elem.second = GV;
8722 }
8723
8724 return Elem.second;
8725}
8726
8727Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8728 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8729 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8730 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8731}
8732
Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
  LLVMContext &Ctx = Builder.getContext();
  // Classic sizeof trick: GEP element 1 off a null base, then ptrtoint the
  // result to obtain the allocation size in bytes as an i64.
  // NOTE(review): the initializer of Null is elided in this extract.
  Value *Null =
  Value *SizeGep =
      Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
  Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
  return SizePtrToInt;
}
8742
// Materialize the per-argument map-type flag words as a private, constant
// array global that the offload runtime reads when launching the region.
OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
                                       std::string VarName) {
  llvm::Constant *MaptypesArrayInit =
      llvm::ConstantDataArray::get(M.getContext(), Mappings);
  auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
      M, MaptypesArrayInit->getType(),
      /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
      VarName);
  // Only the contents matter, not the address, so allow the linker/optimizer
  // to merge identical arrays.
  MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  return MaptypesArrayGlobal;
}
8755
8756void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8757 InsertPointTy AllocaIP,
8758 unsigned NumOperands,
8759 struct MapperAllocas &MapperAllocas) {
8760 if (!updateToLocation(Loc))
8761 return;
8762
8763 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8764 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8765 Builder.restoreIP(AllocaIP);
8766 AllocaInst *ArgsBase = Builder.CreateAlloca(
8767 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8768 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8769 ".offload_ptrs");
8770 AllocaInst *ArgSizes = Builder.CreateAlloca(
8771 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8772 updateToLocation(Loc);
8773 MapperAllocas.ArgsBase = ArgsBase;
8774 MapperAllocas.Args = Args;
8775 MapperAllocas.ArgSizes = ArgSizes;
8776}
8777
8778void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8779 Function *MapperFunc, Value *SrcLocInfo,
8780 Value *MaptypesArg, Value *MapnamesArg,
8781 struct MapperAllocas &MapperAllocas,
8782 int64_t DeviceID, unsigned NumOperands) {
8783 if (!updateToLocation(Loc))
8784 return;
8785
8786 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8787 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8788 Value *ArgsBaseGEP =
8789 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8790 {Builder.getInt32(0), Builder.getInt32(0)});
8791 Value *ArgsGEP =
8792 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8793 {Builder.getInt32(0), Builder.getInt32(0)});
8794 Value *ArgSizesGEP =
8795 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8796 {Builder.getInt32(0), Builder.getInt32(0)});
8797 Value *NullPtr =
8798 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8799 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
8800 Builder.getInt32(NumOperands),
8801 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
8802 MaptypesArg, MapnamesArg, NullPtr});
8803}
8804
8805void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8806 TargetDataRTArgs &RTArgs,
8807 TargetDataInfo &Info,
8808 bool ForEndCall) {
8809 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8810 "expected region end call to runtime only when end call is separate");
8811 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8812 auto VoidPtrTy = UnqualPtrTy;
8813 auto VoidPtrPtrTy = UnqualPtrTy;
8814 auto Int64Ty = Type::getInt64Ty(M.getContext());
8815 auto Int64PtrTy = UnqualPtrTy;
8816
8817 if (!Info.NumberOfPtrs) {
8818 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8819 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8820 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8821 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8822 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8823 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8824 return;
8825 }
8826
8827 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8828 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8829 Info.RTArgs.BasePointersArray,
8830 /*Idx0=*/0, /*Idx1=*/0);
8831 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8832 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8833 /*Idx0=*/0,
8834 /*Idx1=*/0);
8835 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8836 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8837 /*Idx0=*/0, /*Idx1=*/0);
8838 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8839 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8840 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8841 : Info.RTArgs.MapTypesArray,
8842 /*Idx0=*/0,
8843 /*Idx1=*/0);
8844
8845 // Only emit the mapper information arrays if debug information is
8846 // requested.
8847 if (!Info.EmitDebug)
8848 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8849 else
8850 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8851 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8852 /*Idx0=*/0,
8853 /*Idx1=*/0);
8854 // If there is no user-defined mapper, set the mapper array to nullptr to
8855 // avoid an unnecessary data privatization
8856 if (!Info.HasMapper)
8857 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8858 else
8859 RTArgs.MappersArray =
8860 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8861}
8862
void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
                                                  InsertPointTy CodeGenIP,
                                                  MapInfosTy &CombinedInfo,
                                                  TargetDataInfo &Info) {
  MapInfosTy::StructNonContiguousInfo &NonContigInfo =
      CombinedInfo.NonContigInfo;

  // Build an array of struct descriptor_dim and then assign it to
  // offload_args.
  //
  // struct descriptor_dim {
  //   uint64_t offset;
  //   uint64_t count;
  //   uint64_t stride
  // };
  Type *Int64Ty = Builder.getInt64Ty();
  // NOTE(review): the StructType::create call head is elided in this extract.
      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
      "struct.descriptor_dim");

  enum { OffsetFD = 0, CountFD, StrideFD };
  // We need two index variable here since the size of "Dims" is the same as
  // the size of Components, however, the size of offset, count, and stride is
  // equal to the size of base declaration that is non-contiguous.
  for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
    // Skip emitting ir if dimension size is 1 since it cannot be
    // non-contiguous.
    if (NonContigInfo.Dims[I] == 1)
      continue;
    // Allocate the descriptor array at the function entry ...
    Builder.restoreIP(AllocaIP);
    ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
    AllocaInst *DimsAddr =
        Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
    // ... but fill it in at the current code-gen position.
    Builder.restoreIP(CodeGenIP);
    for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
      // Dimensions are emitted in reverse order into the descriptor array.
      unsigned RevIdx = EE - II - 1;
      Value *DimsLVal = Builder.CreateInBoundsGEP(
          DimsAddr->getAllocatedType(), DimsAddr,
          {Builder.getInt64(0), Builder.getInt64(II)});
      // Offset
      Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
          M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
      // Count
      Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Counts[L][RevIdx], CountLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
      // Stride
      Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
      Builder.CreateAlignedStore(
          NonContigInfo.Strides[L][RevIdx], StrideLVal,
          M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
    }
    // args[I] = &dims
    Builder.restoreIP(CodeGenIP);
    Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        DimsAddr, Builder.getPtrTy());
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
        Info.RTArgs.PointersArray, 0, I);
    Builder.CreateAlignedStore(
        DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
    // L only advances for non-contiguous operands (Dims[I] != 1).
    ++L;
  }
}
8930
// Emit the guarded init (allocation) or del (deletion) prologue/epilogue of a
// user-defined mapper: branch to BodyBB when the section needs explicit
// allocation/deletion, otherwise fall through to \p ExitBB.
void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
    Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
    Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
    BasicBlock *ExitBB, bool IsInit) {
  StringRef Prefix = IsInit ? ".init" : ".del";

  // Evaluate if this is an array section.
  // NOTE(review): the BasicBlock::Create call head for BodyBB is elided in
  // this extract.
      M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
  // An array section: more than one element.
  Value *IsArray =
      Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
  // Extract the OMP_MAP_DELETE bit of the incoming map type.
  Value *DeleteBit = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
  Value *DeleteCond;
  Value *Cond;
  if (IsInit) {
    // base != begin?
    Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
    // IsPtrAndObj?
    Value *PtrAndObjBit = Builder.CreateAnd(
        MapType,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
    PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
    BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
    // Init fires for array sections, or ptr-and-obj entries whose base and
    // begin differ.
    Cond = Builder.CreateOr(IsArray, BaseIsBegin);
    // ... and only when the entry is NOT marked for deletion.
    DeleteCond = Builder.CreateIsNull(
        DeleteBit,
        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
  } else {
    Cond = IsArray;
    // Deletion fires only when the delete bit IS set.
    DeleteCond = Builder.CreateIsNotNull(
        DeleteBit,
        createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
  }
  Cond = Builder.CreateAnd(Cond, DeleteCond);
  Builder.CreateCondBr(Cond, BodyBB, ExitBB);

  emitBlock(BodyBB, MapperFn);
  // Get the array size by multiplying element size and element number (i.e., \p
  // Size).
  Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
  // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
  // memory allocation/deletion purpose only.
  Value *MapTypeArg = Builder.CreateAnd(
      MapType,
      Builder.getInt64(
          ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_TO |
              OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
  MapTypeArg = Builder.CreateOr(
      MapTypeArg,
      Builder.getInt64(
          static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
              OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));

  // Call the runtime API __tgt_push_mapper_component to fill up the runtime
  // data structure.
  Value *OffloadingArgs[] = {MapperHandle, Base,       Begin,
                             ArraySize,    MapTypeArg, MapName};
  createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
      OffloadingArgs);
}
8999
// Emit a user-defined mapper function:
//   void .omp_mapper.<name>(void *handle, void *base, void *begin,
//                           i64 size, i64 maptype, void *mapname)
// which loops over the array section [begin, begin+size) and pushes one
// runtime component per mapped element, with map-type decay applied.
Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
    function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
                                   llvm::Value *BeginArg)>
        GenMapInfoCB,
    Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
  // Parameters: handle, base, begin, size, map type, map name.
  SmallVector<Type *> Params;
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getPtrTy());
  Params.emplace_back(Builder.getInt64Ty());
  Params.emplace_back(Builder.getInt64Ty());
  Params.emplace_back(Builder.getPtrTy());

  auto *FnTy =
      FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);

  SmallString<64> TyStr;
  raw_svector_ostream Out(TyStr);
  // NOTE(review): the Function::Create call is elided in this extract.
  Function *MapperFn =
  MapperFn->addFnAttr(Attribute::NoInline);
  MapperFn->addFnAttr(Attribute::NoUnwind);
  MapperFn->addParamAttr(0, Attribute::NoUndef);
  MapperFn->addParamAttr(1, Attribute::NoUndef);
  MapperFn->addParamAttr(2, Attribute::NoUndef);
  MapperFn->addParamAttr(3, Attribute::NoUndef);
  MapperFn->addParamAttr(4, Attribute::NoUndef);
  MapperFn->addParamAttr(5, Attribute::NoUndef);

  // Start the mapper function code generation.
  BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
  auto SavedIP = Builder.saveIP();
  Builder.SetInsertPoint(EntryBB);

  Value *MapperHandle = MapperFn->getArg(0);
  Value *BaseIn = MapperFn->getArg(1);
  Value *BeginIn = MapperFn->getArg(2);
  Value *Size = MapperFn->getArg(3);
  Value *MapType = MapperFn->getArg(4);
  Value *MapName = MapperFn->getArg(5);

  // Compute the starting and end addresses of array elements.
  // Prepare common arguments for array initiation and deletion.
  // Convert the size in bytes into the number of array elements.
  TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
  Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
  Value *PtrBegin = BeginIn;
  Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);

  // Emit array initiation if this is an array section and \p MapType indicates
  // that memory allocation is required.
  BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
                             MapType, MapName, ElementSize, HeadBB,
                             /*IsInit=*/true);

  // Emit a for loop to iterate through SizeArg of elements and map all of them.

  // Emit the loop header block.
  emitBlock(HeadBB, MapperFn);
  BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
  BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
  // Evaluate whether the initial condition is satisfied.
  Value *IsEmpty =
      Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
  Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);

  // Emit the loop body block.
  emitBlock(BodyBB, MapperFn);
  BasicBlock *LastBB = BodyBB;
  // PHI over the current element pointer; the back-edge value is added after
  // the loop body is emitted.
  PHINode *PtrPHI =
      Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
  PtrPHI->addIncoming(PtrBegin, HeadBB);

  // Get map clause information. Fill up the arrays with all mapped variables.
  MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
  if (!Info)
    return Info.takeError();

  // Call the runtime API __tgt_mapper_num_components to get the number of
  // pre-existing components.
  Value *OffloadingArgs[] = {MapperHandle};
  Value *PreviousSize = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
      OffloadingArgs);
  // Shift into the MEMBER_OF field position for combination below.
  Value *ShiftedPreviousSize =
      Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));

  // Fill up the runtime mapper handle for all components.
  for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
    Value *CurBaseArg = Info->BasePointers[I];
    Value *CurBeginArg = Info->Pointers[I];
    Value *CurSizeArg = Info->Sizes[I];
    // Names are optional; use a null pointer when none were generated.
    Value *CurNameArg = Info->Names.size()
                            ? Info->Names[I]
                            : Constant::getNullValue(Builder.getPtrTy());

    // Extract the MEMBER_OF field from the map type.
    Value *OriMapType = Builder.getInt64(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            Info->Types[I]));
    Value *MemberMapType =
        Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);

    // Combine the map type inherited from user-defined mapper with that
    // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
    // bits of the \a MapType, which is the input argument of the mapper
    // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
    // bits of MemberMapType.
    // [OpenMP 5.0], 1.2.6. map-type decay.
    //        | alloc |  to   | from  | tofrom | release | delete
    // ----------------------------------------------------------
    // alloc  | alloc | alloc | alloc | alloc  | release | delete
    // to     | alloc | to    | alloc | to     | release | delete
    // from   | alloc | alloc | from  | from   | release | delete
    // tofrom | alloc | to    | from  | tofrom | release | delete
    Value *LeftToFrom = Builder.CreateAnd(
        MapType,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO |
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
    BasicBlock *AllocElseBB =
        BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
    BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
    BasicBlock *ToElseBB =
        BasicBlock::Create(M.getContext(), "omp.type.to.else");
    BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
    BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
    Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
    Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
    // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
    emitBlock(AllocBB, MapperFn);
    Value *AllocMapType = Builder.CreateAnd(
        MemberMapType,
        Builder.getInt64(
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO |
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateBr(EndBB);
    emitBlock(AllocElseBB, MapperFn);
    Value *IsTo = Builder.CreateICmpEQ(
        LeftToFrom,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
    Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
    // In case of to, clear OMP_MAP_FROM.
    emitBlock(ToBB, MapperFn);
    Value *ToMapType = Builder.CreateAnd(
        MemberMapType,
        Builder.getInt64(
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateBr(EndBB);
    emitBlock(ToElseBB, MapperFn);
    Value *IsFrom = Builder.CreateICmpEQ(
        LeftToFrom,
        Builder.getInt64(
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
    Builder.CreateCondBr(IsFrom, FromBB, EndBB);
    // In case of from, clear OMP_MAP_TO.
    emitBlock(FromBB, MapperFn);
    Value *FromMapType = Builder.CreateAnd(
        MemberMapType,
        Builder.getInt64(
            ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                OpenMPOffloadMappingFlags::OMP_MAP_TO)));
    // In case of tofrom, do nothing.
    emitBlock(EndBB, MapperFn);
    LastBB = EndBB;
    // Merge the four decayed map types.
    PHINode *CurMapType =
        Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
    CurMapType->addIncoming(AllocMapType, AllocBB);
    CurMapType->addIncoming(ToMapType, ToBB);
    CurMapType->addIncoming(FromMapType, FromBB);
    CurMapType->addIncoming(MemberMapType, ToElseBB);

    Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
                               CurSizeArg,   CurMapType, CurNameArg};

    auto ChildMapperFn = CustomMapperCB(I);
    if (!ChildMapperFn)
      return ChildMapperFn.takeError();
    if (*ChildMapperFn) {
      // Call the corresponding mapper function.
      createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
          ->setDoesNotThrow();
    } else {
      // Call the runtime API __tgt_push_mapper_component to fill up the runtime
      // data structure.
      createRuntimeFunctionCall(
          getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
          OffloadingArgs);
    }
  }

  // Update the pointer to point to the next element that needs to be mapped,
  // and check whether we have mapped all elements.
  Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
                                              "omp.arraymap.next");
  PtrPHI->addIncoming(PtrNext, LastBB);
  Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
  BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
  Builder.CreateCondBr(IsDone, ExitBB, BodyBB);

  emitBlock(ExitBB, MapperFn);
  // Emit array deletion if this is an array section and \p MapType indicates
  // that deletion is required.
  emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
                             MapType, MapName, ElementSize, DoneBB,
                             /*IsInit=*/false);

  // Emit the function exit block.
  emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);

  Builder.CreateRetVoid();
  Builder.restoreIP(SavedIP);
  return MapperFn;
}
9222
// Emit the offload argument arrays (base pointers, pointers, sizes, map
// types/names, mappers) for one target data region and record them in Info.
Error OpenMPIRBuilder::emitOffloadingArrays(
    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
    TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
    bool IsNonContiguous,
    function_ref<void(unsigned int, Value *)> DeviceAddrCB) {

  // Reset the array information.
  Info.clearArrayInfo();
  Info.NumberOfPtrs = CombinedInfo.BasePointers.size();

  if (Info.NumberOfPtrs == 0)
    return Error::success();

  Builder.restoreIP(AllocaIP);
  // Detect if we have any capture size requiring runtime evaluation of the
  // size so that a constant array could be eventually used.
  ArrayType *PointerArrayType =
      ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);

  Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");

  Info.RTArgs.PointersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
  AllocaInst *MappersArray = Builder.CreateAlloca(
      PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
  Info.RTArgs.MappersArray = MappersArray;

  // If we don't have any VLA types or other types that require runtime
  // evaluation, we can use a constant array for the map sizes, otherwise we
  // need to fill up the arrays as we do for the pointers.
  Type *Int64Ty = Builder.getInt64Ty();
  SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
                                     ConstantInt::get(Int64Ty, 0));
  // RuntimeSizes[I] is set when size I must be stored at run time.
  SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
  for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
    if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
      if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
        // Non-contiguous entries carry the dimension count, not the size.
        if (IsNonContiguous &&
            static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                CombinedInfo.Types[I] &
                OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
          ConstSizes[I] =
              ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
        else
          ConstSizes[I] = CI;
        continue;
      }
    }
    RuntimeSizes.set(I);
  }

  if (RuntimeSizes.all()) {
    // All sizes are runtime values: a plain stack array, filled below.
    ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
    Info.RTArgs.SizesArray = Builder.CreateAlloca(
        SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
    restoreIPandDebugLoc(Builder, CodeGenIP);
  } else {
    // At least some sizes are compile-time constants: emit them as a
    // constant global.
    auto *SizesArrayInit = ConstantArray::get(
        ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
    std::string Name = createPlatformSpecificName({"offload_sizes"});
    auto *SizesArrayGbl =
        new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
                           GlobalValue::PrivateLinkage, SizesArrayInit, Name);
    SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);

    if (!RuntimeSizes.any()) {
      // Fully constant: pass the global directly.
      Info.RTArgs.SizesArray = SizesArrayGbl;
    } else {
      // Mixed case: copy the constant global into a stack buffer, then
      // overwrite the runtime slots below.
      unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
      Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
      ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
      AllocaInst *Buffer = Builder.CreateAlloca(
          SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
      Buffer->setAlignment(OffloadSizeAlign);
      restoreIPandDebugLoc(Builder, CodeGenIP);
      Builder.CreateMemCpy(
          Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
          SizesArrayGbl, OffloadSizeAlign,
          Builder.getIntN(
              IndexSize,
              Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));

      Info.RTArgs.SizesArray = Buffer;
    }
    restoreIPandDebugLoc(Builder, CodeGenIP);
  }

  // The map types are always constant so we don't need to generate code to
  // fill arrays. Instead, we create an array constant.
  // NOTE(review): the declaration of Mapping is elided in this extract.
  for (auto mapFlag : CombinedInfo.Types)
    Mapping.push_back(
        static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            mapFlag));
  std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
  auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
  Info.RTArgs.MapTypesArray = MapTypesArrayGbl;

  // The information types are only built if provided.
  if (!CombinedInfo.Names.empty()) {
    auto *MapNamesArrayGbl = createOffloadMapnames(
        CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
    Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
    Info.EmitDebug = true;
  } else {
    Info.RTArgs.MapNamesArray =
        Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
    Info.EmitDebug = false;
  }

  // If there's a present map type modifier, it must not be applied to the end
  // of a region, so generate a separate map type array in that case.
  if (Info.separateBeginEndCalls()) {
    bool EndMapTypesDiffer = false;
    for (uint64_t &Type : Mapping) {
      if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
                     OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
        Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
            OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
        EndMapTypesDiffer = true;
      }
    }
    if (EndMapTypesDiffer) {
      MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
      Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
    }
  }

  // Store each operand's base pointer, pointer, runtime size and mapper
  // into the corresponding array slot.
  PointerType *PtrTy = Builder.getPtrTy();
  for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
    Value *BPVal = CombinedInfo.BasePointers[I];
    Value *BP = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
        0, I);
    Builder.CreateAlignedStore(BPVal, BP,
                               M.getDataLayout().getPrefTypeAlign(PtrTy));

    if (Info.requiresDevicePointerInfo()) {
      if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
        // use_device_ptr: allocate a slot (at AllocaIP) the runtime fills
        // with the translated device pointer.
        CodeGenIP = Builder.saveIP();
        Builder.restoreIP(AllocaIP);
        Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
        Builder.restoreIP(CodeGenIP);
        if (DeviceAddrCB)
          DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
      } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
        // use_device_addr: the array slot itself is the device address.
        Info.DevicePtrInfoMap[BPVal] = {BP, BP};
        if (DeviceAddrCB)
          DeviceAddrCB(I, BP);
      }
    }

    Value *PVal = CombinedInfo.Pointers[I];
    Value *P = Builder.CreateConstInBoundsGEP2_32(
        ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
        I);
    // TODO: Check alignment correct.
    Builder.CreateAlignedStore(PVal, P,
                               M.getDataLayout().getPrefTypeAlign(PtrTy));

    if (RuntimeSizes.test(I)) {
      Value *S = Builder.CreateConstInBoundsGEP2_32(
          ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
          /*Idx0=*/0,
          /*Idx1=*/I);
      Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
                                                       Int64Ty,
                                                       /*isSigned=*/true),
                                 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
    }
    // Fill up the mapper array.
    unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
    Value *MFunc = ConstantPointerNull::get(PtrTy);

    auto CustomMFunc = CustomMapperCB(I);
    if (!CustomMFunc)
      return CustomMFunc.takeError();
    if (*CustomMFunc)
      MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);

    Value *MAddr = Builder.CreateInBoundsGEP(
        MappersArray->getAllocatedType(), MappersArray,
        {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
    Builder.CreateAlignedStore(
        MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
  }

  if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
      Info.NumberOfPtrs == 0)
    return Error::success();
  emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
  return Error::success();
}
9417
9418void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9419 BasicBlock *CurBB = Builder.GetInsertBlock();
9420
9421 if (!CurBB || CurBB->getTerminator()) {
9422 // If there is no insert point or the previous block is already
9423 // terminated, don't touch it.
9424 } else {
9425 // Otherwise, create a fall-through branch.
9426 Builder.CreateBr(Target);
9427 }
9428
9429 Builder.ClearInsertionPoint();
9430}
9431
9432void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9433 bool IsFinished) {
9434 BasicBlock *CurBB = Builder.GetInsertBlock();
9435
9436 // Fall out of the current block (if necessary).
9437 emitBranch(BB);
9438
9439 if (IsFinished && BB->use_empty()) {
9440 BB->eraseFromParent();
9441 return;
9442 }
9443
9444 // Place the block after the current block, if possible, or else at
9445 // the end of the function.
9446 if (CurBB && CurBB->getParent())
9447 CurFn->insert(std::next(CurBB->getIterator()), BB);
9448 else
9449 CurFn->insert(CurFn->end(), BB);
9450 Builder.SetInsertPoint(BB);
9451}
9452
9453Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9454 BodyGenCallbackTy ElseGen,
9455 InsertPointTy AllocaIP) {
9456 // If the condition constant folds and can be elided, try to avoid emitting
9457 // the condition and the dead arm of the if/else.
9458 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9459 auto CondConstant = CI->getSExtValue();
9460 if (CondConstant)
9461 return ThenGen(AllocaIP, Builder.saveIP());
9462
9463 return ElseGen(AllocaIP, Builder.saveIP());
9464 }
9465
9466 Function *CurFn = Builder.GetInsertBlock()->getParent();
9467
9468 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9469 // emit the conditional branch.
9470 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9471 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9472 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9473 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9474 // Emit the 'then' code.
9475 emitBlock(ThenBlock, CurFn);
9476 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9477 return Err;
9478 emitBranch(ContBlock);
9479 // Emit the 'else' code if present.
9480 // There is no need to emit line number for unconditional branch.
9481 emitBlock(ElseBlock, CurFn);
9482 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9483 return Err;
9484 // There is no need to emit line number for unconditional branch.
9485 emitBranch(ContBlock);
9486 // Emit the continuation block for code after the if.
9487 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9488 return Error::success();
9489}
9490
// Decide, from the atomic kind and requested ordering, whether an OpenMP
// flush must be emitted after the atomic operation; emit it and report
// whether it was emitted.
// NOTE(review): several lines (the assert head and some case/if headers)
// are elided in this extract — confirm against the full source.
bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
    const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
         "Unexpected Atomic Ordering.");

  // Whether the ordering requires an explicit flush after the atomic.
  bool Flush = false;

  switch (AK) {
  case Read:
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
    }
    break;
  case Write:
  case Compare:
  case Update:
      FlushAO = AtomicOrdering::Release;
      Flush = true;
    }
    break;
  case Capture:
    switch (AO) {
      FlushAO = AtomicOrdering::Acquire;
      Flush = true;
      break;
      FlushAO = AtomicOrdering::Release;
      Flush = true;
      break;
      Flush = true;
      break;
    default:
      // do nothing - leave silently.
      break;
    }
  }

  if (Flush) {
    // Currently Flush RT call still doesn't take memory_ordering, so for when
    // that happens, this tries to do the resolution of which atomic ordering
    // to use with but issue the flush call
    // TODO: pass `FlushAO` after memory ordering support is added
    (void)FlushAO;
    emitFlush(Loc);
  }

  // for AO == AtomicOrdering::Monotonic and all other case combinations
  // do nothing
  return Flush;
}
9551
9552OpenMPIRBuilder::InsertPointTy
9553OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9554 AtomicOpValue &X, AtomicOpValue &V,
9555 AtomicOrdering AO, InsertPointTy AllocaIP) {
9556 if (!updateToLocation(Loc))
9557 return Loc.IP;
9558
9559 assert(X.Var->getType()->isPointerTy() &&
9560 "OMP Atomic expects a pointer to target memory");
9561 Type *XElemTy = X.ElemTy;
9562 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9563 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9564 "OMP atomic read expected a scalar type");
9565
9566 Value *XRead = nullptr;
9567
9568 if (XElemTy->isIntegerTy()) {
9569 LoadInst *XLD =
9570 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9571 XLD->setAtomic(AO);
9572 XRead = cast<Value>(XLD);
9573 } else if (XElemTy->isStructTy()) {
9574 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9575 // target does not support `atomicrmw` of the size of the struct
9576 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9577 OldVal->setAtomic(AO);
9578 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9579 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9580 OpenMPIRBuilder::AtomicInfo atomicInfo(
9581 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9582 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9583 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9584 XRead = AtomicLoadRes.first;
9585 OldVal->eraseFromParent();
9586 } else {
9587 // We need to perform atomic op as integer
9588 IntegerType *IntCastTy =
9589 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9590 LoadInst *XLoad =
9591 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9592 XLoad->setAtomic(AO);
9593 if (XElemTy->isFloatingPointTy()) {
9594 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9595 } else {
9596 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9597 }
9598 }
9599 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9600 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9601 return Builder.saveIP();
9602}
9603
9604OpenMPIRBuilder::InsertPointTy
9605OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9606 AtomicOpValue &X, Value *Expr,
9607 AtomicOrdering AO, InsertPointTy AllocaIP) {
9608 if (!updateToLocation(Loc))
9609 return Loc.IP;
9610
9611 assert(X.Var->getType()->isPointerTy() &&
9612 "OMP Atomic expects a pointer to target memory");
9613 Type *XElemTy = X.ElemTy;
9614 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9615 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9616 "OMP atomic write expected a scalar type");
9617
9618 if (XElemTy->isIntegerTy()) {
9619 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9620 XSt->setAtomic(AO);
9621 } else if (XElemTy->isStructTy()) {
9622 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9623 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9624 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9625 OpenMPIRBuilder::AtomicInfo atomicInfo(
9626 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9627 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9628 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9629 OldVal->eraseFromParent();
9630 } else {
9631 // We need to bitcast and perform atomic op as integers
9632 IntegerType *IntCastTy =
9633 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9634 Value *ExprCast =
9635 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9636 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9637 XSt->setAtomic(AO);
9638 }
9639
9640 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9641 return Builder.saveIP();
9642}
9643
9644OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9645 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9646 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9647 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9648 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9649 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9650 if (!updateToLocation(Loc))
9651 return Loc.IP;
9652
9653 LLVM_DEBUG({
9654 Type *XTy = X.Var->getType();
9655 assert(XTy->isPointerTy() &&
9656 "OMP Atomic expects a pointer to target memory");
9657 Type *XElemTy = X.ElemTy;
9658 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9659 XElemTy->isPointerTy()) &&
9660 "OMP atomic update expected a scalar type");
9661 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9662 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9663 "OpenMP atomic does not support LT or GT operations");
9664 });
9665
9666 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9667 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9668 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9669 if (!AtomicResult)
9670 return AtomicResult.takeError();
9671 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9672 return Builder.saveIP();
9673}
9674
9675// FIXME: Duplicating AtomicExpand
9676Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9677 AtomicRMWInst::BinOp RMWOp) {
9678 switch (RMWOp) {
9679 case AtomicRMWInst::Add:
9680 return Builder.CreateAdd(Src1, Src2);
9681 case AtomicRMWInst::Sub:
9682 return Builder.CreateSub(Src1, Src2);
9683 case AtomicRMWInst::And:
9684 return Builder.CreateAnd(Src1, Src2);
9686 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9687 case AtomicRMWInst::Or:
9688 return Builder.CreateOr(Src1, Src2);
9689 case AtomicRMWInst::Xor:
9690 return Builder.CreateXor(Src1, Src2);
9695 case AtomicRMWInst::Max:
9696 case AtomicRMWInst::Min:
9707 llvm_unreachable("Unsupported atomic update operation");
9708 }
9709 llvm_unreachable("Unsupported atomic update operation");
9710}
9711
9712Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9713 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9715 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9716 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9717 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9718 // or a complex datatype.
9719 bool emitRMWOp = false;
9720 switch (RMWOp) {
9721 case AtomicRMWInst::Add:
9722 case AtomicRMWInst::And:
9724 case AtomicRMWInst::Or:
9725 case AtomicRMWInst::Xor:
9727 emitRMWOp = XElemTy;
9728 break;
9729 case AtomicRMWInst::Sub:
9730 emitRMWOp = (IsXBinopExpr && XElemTy);
9731 break;
9732 default:
9733 emitRMWOp = false;
9734 }
9735 emitRMWOp &= XElemTy->isIntegerTy();
9736
9737 std::pair<Value *, Value *> Res;
9738 if (emitRMWOp) {
9739 AtomicRMWInst *RMWInst =
9740 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9741 if (T.isAMDGPU()) {
9742 if (IsIgnoreDenormalMode)
9743 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9744 llvm::MDNode::get(Builder.getContext(), {}));
9745 if (!IsFineGrainedMemory)
9746 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9747 llvm::MDNode::get(Builder.getContext(), {}));
9748 if (!IsRemoteMemory)
9749 RMWInst->setMetadata("amdgpu.no.remote.memory",
9750 llvm::MDNode::get(Builder.getContext(), {}));
9751 }
9752 Res.first = RMWInst;
9753 // not needed except in case of postfix captures. Generate anyway for
9754 // consistency with the else part. Will be removed with any DCE pass.
9755 // AtomicRMWInst::Xchg does not have a coressponding instruction.
9756 if (RMWOp == AtomicRMWInst::Xchg)
9757 Res.second = Res.first;
9758 else
9759 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9760 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9761 XElemTy->isStructTy()) {
9762 LoadInst *OldVal =
9763 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9764 OldVal->setAtomic(AO);
9765 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9766 unsigned LoadSize =
9767 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9768
9769 OpenMPIRBuilder::AtomicInfo atomicInfo(
9770 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9771 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9772 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9773 BasicBlock *CurBB = Builder.GetInsertBlock();
9774 Instruction *CurBBTI = CurBB->getTerminator();
9775 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9776 BasicBlock *ExitBB =
9777 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9778 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9779 X->getName() + ".atomic.cont");
9780 ContBB->getTerminator()->eraseFromParent();
9781 Builder.restoreIP(AllocaIP);
9782 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9783 NewAtomicAddr->setName(X->getName() + "x.new.val");
9784 Builder.SetInsertPoint(ContBB);
9785 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9786 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9787 Value *OldExprVal = PHI;
9788 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9789 if (!CBResult)
9790 return CBResult.takeError();
9791 Value *Upd = *CBResult;
9792 Builder.CreateStore(Upd, NewAtomicAddr);
9795 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9796 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9797 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9798 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9799 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9800 OldVal->eraseFromParent();
9801 Res.first = OldExprVal;
9802 Res.second = Upd;
9803
9804 if (UnreachableInst *ExitTI =
9806 CurBBTI->eraseFromParent();
9807 Builder.SetInsertPoint(ExitBB);
9808 } else {
9809 Builder.SetInsertPoint(ExitTI);
9810 }
9811 } else {
9812 IntegerType *IntCastTy =
9813 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9814 LoadInst *OldVal =
9815 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9816 OldVal->setAtomic(AO);
9817 // CurBB
9818 // | /---\
9819 // ContBB |
9820 // | \---/
9821 // ExitBB
9822 BasicBlock *CurBB = Builder.GetInsertBlock();
9823 Instruction *CurBBTI = CurBB->getTerminator();
9824 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9825 BasicBlock *ExitBB =
9826 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9827 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9828 X->getName() + ".atomic.cont");
9829 ContBB->getTerminator()->eraseFromParent();
9830 Builder.restoreIP(AllocaIP);
9831 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9832 NewAtomicAddr->setName(X->getName() + "x.new.val");
9833 Builder.SetInsertPoint(ContBB);
9834 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9835 PHI->addIncoming(OldVal, CurBB);
9836 bool IsIntTy = XElemTy->isIntegerTy();
9837 Value *OldExprVal = PHI;
9838 if (!IsIntTy) {
9839 if (XElemTy->isFloatingPointTy()) {
9840 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9841 X->getName() + ".atomic.fltCast");
9842 } else {
9843 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9844 X->getName() + ".atomic.ptrCast");
9845 }
9846 }
9847
9848 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9849 if (!CBResult)
9850 return CBResult.takeError();
9851 Value *Upd = *CBResult;
9852 Builder.CreateStore(Upd, NewAtomicAddr);
9853 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9856 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9857 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9858 Result->setVolatile(VolatileX);
9859 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9860 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9861 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9862 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9863
9864 Res.first = OldExprVal;
9865 Res.second = Upd;
9866
9867 // set Insertion point in exit block
9868 if (UnreachableInst *ExitTI =
9870 CurBBTI->eraseFromParent();
9871 Builder.SetInsertPoint(ExitBB);
9872 } else {
9873 Builder.SetInsertPoint(ExitTI);
9874 }
9875 }
9876
9877 return Res;
9878}
9879
9880OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9881 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9882 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9883 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9884 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9885 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9886 if (!updateToLocation(Loc))
9887 return Loc.IP;
9888
9889 LLVM_DEBUG({
9890 Type *XTy = X.Var->getType();
9891 assert(XTy->isPointerTy() &&
9892 "OMP Atomic expects a pointer to target memory");
9893 Type *XElemTy = X.ElemTy;
9894 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9895 XElemTy->isPointerTy()) &&
9896 "OMP atomic capture expected a scalar type");
9897 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9898 "OpenMP atomic does not support LT or GT operations");
9899 });
9900
9901 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
9902 // 'x' is simply atomically rewritten with 'expr'.
9903 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
9904 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9905 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
9906 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9907 if (!AtomicResult)
9908 return AtomicResult.takeError();
9909 Value *CapturedVal =
9910 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
9911 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
9912
9913 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
9914 return Builder.saveIP();
9915}
9916
9917OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9918 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9919 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9920 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9921 bool IsFailOnly) {
9922
9924 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
9925 IsPostfixUpdate, IsFailOnly, Failure);
9926}
9927
9928OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
9929 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
9930 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
9931 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
9932 bool IsFailOnly, AtomicOrdering Failure) {
9933
9934 if (!updateToLocation(Loc))
9935 return Loc.IP;
9936
9937 assert(X.Var->getType()->isPointerTy() &&
9938 "OMP atomic expects a pointer to target memory");
9939 // compare capture
9940 if (V.Var) {
9941 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
9942 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
9943 }
9944
9945 bool IsInteger = E->getType()->isIntegerTy();
9946
9947 if (Op == OMPAtomicCompareOp::EQ) {
9948 AtomicCmpXchgInst *Result = nullptr;
9949 if (!IsInteger) {
9950 IntegerType *IntCastTy =
9951 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
9952 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
9953 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
9954 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
9955 AO, Failure);
9956 } else {
9957 Result =
9958 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
9959 }
9960
9961 if (V.Var) {
9962 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9963 if (!IsInteger)
9964 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
9965 assert(OldValue->getType() == V.ElemTy &&
9966 "OldValue and V must be of same type");
9967 if (IsPostfixUpdate) {
9968 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
9969 } else {
9970 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9971 if (IsFailOnly) {
9972 // CurBB----
9973 // | |
9974 // v |
9975 // ContBB |
9976 // | |
9977 // v |
9978 // ExitBB <-
9979 //
9980 // where ContBB only contains the store of old value to 'v'.
9981 BasicBlock *CurBB = Builder.GetInsertBlock();
9982 Instruction *CurBBTI = CurBB->getTerminator();
9983 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9984 BasicBlock *ExitBB = CurBB->splitBasicBlock(
9985 CurBBTI, X.Var->getName() + ".atomic.exit");
9986 BasicBlock *ContBB = CurBB->splitBasicBlock(
9987 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
9988 ContBB->getTerminator()->eraseFromParent();
9989 CurBB->getTerminator()->eraseFromParent();
9990
9991 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
9992
9993 Builder.SetInsertPoint(ContBB);
9994 Builder.CreateStore(OldValue, V.Var);
9995 Builder.CreateBr(ExitBB);
9996
9997 if (UnreachableInst *ExitTI =
9999 CurBBTI->eraseFromParent();
10000 Builder.SetInsertPoint(ExitBB);
10001 } else {
10002 Builder.SetInsertPoint(ExitTI);
10003 }
10004 } else {
10005 Value *CapturedValue =
10006 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10007 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10008 }
10009 }
10010 }
10011 // The comparison result has to be stored.
10012 if (R.Var) {
10013 assert(R.Var->getType()->isPointerTy() &&
10014 "r.var must be of pointer type");
10015 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10016
10017 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10018 Value *ResultCast = R.IsSigned
10019 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10020 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10021 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10022 }
10023 } else {
10024 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10025 "Op should be either max or min at this point");
10026 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10027
10028 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10029 // Let's take max as example.
10030 // OpenMP form:
10031 // x = x > expr ? expr : x;
10032 // LLVM form:
10033 // *ptr = *ptr > val ? *ptr : val;
10034 // We need to transform to LLVM form.
10035 // x = x <= expr ? x : expr;
10037 if (IsXBinopExpr) {
10038 if (IsInteger) {
10039 if (X.IsSigned)
10040 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10042 else
10043 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10045 } else {
10046 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10048 }
10049 } else {
10050 if (IsInteger) {
10051 if (X.IsSigned)
10052 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10054 else
10055 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10057 } else {
10058 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10060 }
10061 }
10062
10063 AtomicRMWInst *OldValue =
10064 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10065 if (V.Var) {
10066 Value *CapturedValue = nullptr;
10067 if (IsPostfixUpdate) {
10068 CapturedValue = OldValue;
10069 } else {
10070 CmpInst::Predicate Pred;
10071 switch (NewOp) {
10072 case AtomicRMWInst::Max:
10073 Pred = CmpInst::ICMP_SGT;
10074 break;
10076 Pred = CmpInst::ICMP_UGT;
10077 break;
10079 Pred = CmpInst::FCMP_OGT;
10080 break;
10081 case AtomicRMWInst::Min:
10082 Pred = CmpInst::ICMP_SLT;
10083 break;
10085 Pred = CmpInst::ICMP_ULT;
10086 break;
10088 Pred = CmpInst::FCMP_OLT;
10089 break;
10090 default:
10091 llvm_unreachable("unexpected comparison op");
10092 }
10093 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10094 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10095 }
10096 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10097 }
10098 }
10099
10100 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10101
10102 return Builder.saveIP();
10103}
10104
10105OpenMPIRBuilder::InsertPointOrErrorTy
10106OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
10107 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10108 Value *NumTeamsUpper, Value *ThreadLimit,
10109 Value *IfExpr) {
10110 if (!updateToLocation(Loc))
10111 return InsertPointTy();
10112
10113 uint32_t SrcLocStrSize;
10114 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10115 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10116 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10117
10118 // Outer allocation basicblock is the entry block of the current function.
10119 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10120 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10121 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10122 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10123 }
10124
10125 // The current basic block is split into four basic blocks. After outlining,
10126 // they will be mapped as follows:
10127 // ```
10128 // def current_fn() {
10129 // current_basic_block:
10130 // br label %teams.exit
10131 // teams.exit:
10132 // ; instructions after teams
10133 // }
10134 //
10135 // def outlined_fn() {
10136 // teams.alloca:
10137 // br label %teams.body
10138 // teams.body:
10139 // ; instructions within teams body
10140 // }
10141 // ```
10142 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10143 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10144 BasicBlock *AllocaBB =
10145 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10146
10147 bool SubClausesPresent =
10148 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10149 // Push num_teams
10150 if (!Config.isTargetDevice() && SubClausesPresent) {
10151 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10152 "if lowerbound is non-null, then upperbound must also be non-null "
10153 "for bounds on num_teams");
10154
10155 if (NumTeamsUpper == nullptr)
10156 NumTeamsUpper = Builder.getInt32(0);
10157
10158 if (NumTeamsLower == nullptr)
10159 NumTeamsLower = NumTeamsUpper;
10160
10161 if (IfExpr) {
10162 assert(IfExpr->getType()->isIntegerTy() &&
10163 "argument to if clause must be an integer value");
10164
10165 // upper = ifexpr ? upper : 1
10166 if (IfExpr->getType() != Int1)
10167 IfExpr = Builder.CreateICmpNE(IfExpr,
10168 ConstantInt::get(IfExpr->getType(), 0));
10169 NumTeamsUpper = Builder.CreateSelect(
10170 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10171
10172 // lower = ifexpr ? lower : 1
10173 NumTeamsLower = Builder.CreateSelect(
10174 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10175 }
10176
10177 if (ThreadLimit == nullptr)
10178 ThreadLimit = Builder.getInt32(0);
10179
10180 Value *ThreadNum = getOrCreateThreadID(Ident);
10181 createRuntimeFunctionCall(
10182 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10183 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
10184 }
10185 // Generate the body of teams.
10186 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10187 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10188 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10189 return Err;
10190
10191 OutlineInfo OI;
10192 OI.EntryBB = AllocaBB;
10193 OI.ExitBB = ExitBB;
10194 OI.OuterAllocaBB = &OuterAllocaBB;
10195
10196 // Insert fake values for global tid and bound tid.
10198 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10199 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10200 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10201 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10202 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10203
10204 auto HostPostOutlineCB = [this, Ident,
10205 ToBeDeleted](Function &OutlinedFn) mutable {
10206 // The stale call instruction will be replaced with a new call instruction
10207 // for runtime call with the outlined function.
10208
10209 assert(OutlinedFn.hasOneUse() &&
10210 "there must be a single user for the outlined function");
10211 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10212 ToBeDeleted.push_back(StaleCI);
10213
10214 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10215 "Outlined function must have two or three arguments only");
10216
10217 bool HasShared = OutlinedFn.arg_size() == 3;
10218
10219 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10220 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10221 if (HasShared)
10222 OutlinedFn.getArg(2)->setName("data");
10223
10224 // Call to the runtime function for teams in the current function.
10225 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10226 "outlined function.");
10227 Builder.SetInsertPoint(StaleCI);
10229 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10230 if (HasShared)
10231 Args.push_back(StaleCI->getArgOperand(2));
10232 createRuntimeFunctionCall(
10233 getOrCreateRuntimeFunctionPtr(
10234 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10235 Args);
10236
10237 for (Instruction *I : llvm::reverse(ToBeDeleted))
10238 I->eraseFromParent();
10239 };
10240
10241 if (!Config.isTargetDevice())
10242 OI.PostOutlineCB = HostPostOutlineCB;
10243
10244 addOutlineInfo(std::move(OI));
10245
10246 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10247
10248 return Builder.saveIP();
10249}
10250
10251OpenMPIRBuilder::InsertPointOrErrorTy
10252OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10253 InsertPointTy OuterAllocaIP,
10254 BodyGenCallbackTy BodyGenCB) {
10255 if (!updateToLocation(Loc))
10256 return InsertPointTy();
10257
10258 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10259
10260 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10261 BasicBlock *BodyBB =
10262 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10263 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10264 }
10265 BasicBlock *ExitBB =
10266 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10267 BasicBlock *BodyBB =
10268 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10269 BasicBlock *AllocaBB =
10270 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10271
10272 // Generate the body of distribute clause
10273 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10274 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10275 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10276 return Err;
10277
10278 // When using target we use different runtime functions which require a
10279 // callback.
10280 if (Config.isTargetDevice()) {
10281 OutlineInfo OI;
10282 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10283 OI.EntryBB = AllocaBB;
10284 OI.ExitBB = ExitBB;
10285
10286 addOutlineInfo(std::move(OI));
10287 }
10288 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10289
10290 return Builder.saveIP();
10291}
10292
10294OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10295 std::string VarName) {
10296 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10298 Names.size()),
10299 Names);
10300 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10301 M, MapNamesArrayInit->getType(),
10302 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10303 VarName);
10304 return MapNamesArrayGlobal;
10305}
10306
// Create all simple and struct types exposed by the runtime and remember
// the llvm::PointerTypes of them for easy access later.
void OpenMPIRBuilder::initializeTypes(Module &M) {
  LLVMContext &Ctx = M.getContext();
  StructType *T;
  // Address spaces for the cached pointer types: data pointers use the
  // configured default target AS, function pointers use the data layout's
  // program AS.
  unsigned DefaultTargetAS = Config.getDefaultTargetAS();
  unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
// Scalar runtime types are assigned directly from the .def initializer.
#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
// Array types also get a cached opaque pointer type in the default AS.
#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize)                             \
  VarName##Ty = ArrayType::get(ElemTy, ArraySize);                             \
  VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
// Function types get a cached pointer type in the program AS.
#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...)                  \
  VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg);            \
  VarName##Ptr = PointerType::get(Ctx, ProgramAS);
// Struct types are looked up by name first so redefinition is avoided when
// the module already declares them.
#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...)                      \
  T = StructType::getTypeByName(Ctx, StructName);                              \
  if (!T)                                                                      \
    T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed);            \
  VarName = T;                                                                 \
  VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
10329
10330void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10332 SmallVectorImpl<BasicBlock *> &BlockVector) {
10334 BlockSet.insert(EntryBB);
10335 BlockSet.insert(ExitBB);
10336
10337 Worklist.push_back(EntryBB);
10338 while (!Worklist.empty()) {
10339 BasicBlock *BB = Worklist.pop_back_val();
10340 BlockVector.push_back(BB);
10341 for (BasicBlock *SuccBB : successors(BB))
10342 if (BlockSet.insert(SuccBB).second)
10343 Worklist.push_back(SuccBB);
10344 }
10345}
10346
10347void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10348 uint64_t Size, int32_t Flags,
10350 StringRef Name) {
10351 if (!Config.isGPU()) {
10354 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10355 return;
10356 }
10357 // TODO: Add support for global variables on the device after declare target
10358 // support.
10359 Function *Fn = dyn_cast<Function>(Addr);
10360 if (!Fn)
10361 return;
10362
10363 // Add a function attribute for the kernel.
10364 Fn->addFnAttr("kernel");
10365 if (T.isAMDGCN())
10366 Fn->addFnAttr("uniform-work-group-size", "true");
10367 Fn->addFnAttr(Attribute::MustProgress);
10368}
10369
// We only generate metadata for functions that contain target regions.
//
// Emits the "omp_offload.info" named metadata describing every offload entry
// (target regions and declare-target globals) and then creates the actual
// offload entries in creation order. ErrorFn is invoked for entries that
// cannot be emitted (e.g. missing address/ID).
void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
    EmitMetadataErrorReportFunctionTy &ErrorFn) {

  // If there are no entries, we don't need to do anything.
  if (OffloadInfoManager.empty())
    return;

  LLVMContext &C = M.getContext();
  // Entries indexed by their creation order; filled by the emitter lambdas.
  SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
                        TargetRegionEntryInfo>,
              16>
      OrderedEntries(OffloadInfoManager.size());

  // Auxiliary methods to create metadata values and strings.
  auto &&GetMDInt = [this](unsigned V) {
    return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
  };

  auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };

  // Create the offloading info metadata node.
  NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
  auto &&TargetRegionMetadataEmitter =
      [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
          const TargetRegionEntryInfo &EntryInfo,
          const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
        // Generate metadata for target regions. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (0).
        // - Entry 1 -> Device ID of the file where the entry was identified.
        // - Entry 2 -> File ID of the file where the entry was identified.
        // - Entry 3 -> Mangled name of the function where the entry was
        // identified.
        // - Entry 4 -> Line in the file where the entry was identified.
        // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
        // - Entry 6 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {
            GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
            GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
            GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
            GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);

  // Create function that emits metadata for each device global variable entry;
  auto &&DeviceGlobalVarMetadataEmitter =
      [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
          StringRef MangledName,
          const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
        // Generate metadata for global variables. Each entry of this metadata
        // contains:
        // - Entry 0 -> Kind of this type of metadata (1).
        // - Entry 1 -> Mangled name of the variable.
        // - Entry 2 -> Declare target kind.
        // - Entry 3 -> Order the entry was created.
        // The first element of the metadata node is the kind.
        Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
                           GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};

        // Save this entry in the right position of the ordered entries array.
        // Globals carry no source location, so use a zeroed location record.
        TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
        OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);

        // Add metadata to the named metadata node.
        MD->addOperand(MDNode::get(C, Ops));
      };

  OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
      DeviceGlobalVarMetadataEmitter);

  // Walk the entries in creation order and emit the actual offload entries.
  for (const auto &E : OrderedEntries) {
    assert(E.first && "All ordered entries must exist!");
    // NOTE(review): the dyn_cast target-type line (presumably
    // OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion) appears to
    // have been lost in extraction here — restore from upstream.
    if (const auto *CE =
            E.first)) {
      if (!CE->getID() || !CE->getAddress()) {
        // Do not blame the entry if the parent function is not emitted.
        TargetRegionEntryInfo EntryInfo = E.second;
        StringRef FnName = EntryInfo.ParentName;
        if (!M.getNamedValue(FnName))
          continue;
        ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
        continue;
      }
      // NOTE(review): the trailing arguments of this call appear to have been
      // lost in extraction — restore from upstream.
      createOffloadEntry(CE->getID(), CE->getAddress(),
                         /*Size=*/0, CE->getFlags(),
    } else if (const auto *CE = dyn_cast<
                   OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
                   E.first)) {
      OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
          static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
              CE->getFlags());
      switch (Flags) {
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
        // Under unified shared memory the device accesses host memory
        // directly, so no device-side entry is required.
        if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
          continue;
        }
        // The variable has no definition - no need to add the entry.
        if (CE->getVarSize() == 0)
          continue;
        break;
      case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
        assert(((Config.isTargetDevice() && !CE->getAddress()) ||
                (!Config.isTargetDevice() && CE->getAddress())) &&
               "Declaret target link address is set.");
        if (Config.isTargetDevice())
          continue;
        if (!CE->getAddress()) {
          ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
          continue;
        }
        break;
      default:
        break;
      }

      // Hidden or internal symbols on the device are not externally visible.
      // We should not attempt to register them by creating an offloading
      // entry. Indirect variables are handled separately on the device.
      if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
        if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
            Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
          continue;

      // Indirect globals need to use a special name that doesn't match the name
      // of the associated host global.
      if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
                           Flags, CE->getLinkage(), CE->getVarName());
      else
        createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
                           Flags, CE->getLinkage());

    } else {
      llvm_unreachable("Unsupported entry kind.");
    }
  }

  // Emit requires directive globals to a special entry so the runtime can
  // register them when the device image is loaded.
  // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
  // entries should be redesigned to better suit this use-case.
  if (Config.hasRequiresFlags() && !Config.isTargetDevice())
    // NOTE(review): the head of this createOffloadEntry call (and its leading
    // arguments) appears to have been lost in extraction; only the trailing
    // arguments survive below. Restore from upstream.
        ".requires", /*Size=*/0,
        OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
        Config.getRequiresFlags());
}
10534
10535void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10536 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10537 unsigned FileID, unsigned Line, unsigned Count) {
10538 raw_svector_ostream OS(Name);
10539 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10540 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10541 if (Count)
10542 OS << "_" << Count;
10543}
10544
10545void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10546 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10547 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10548 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10549 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10550 EntryInfo.Line, NewCount);
10551}
10552
10553TargetRegionEntryInfo
10554OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10555 vfs::FileSystem &VFS,
10556 StringRef ParentName) {
10557 sys::fs::UniqueID ID(0xdeadf17e, 0);
10558 auto FileIDInfo = CallBack();
10559 uint64_t FileID = 0;
10560 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10561 ID = Status->getUniqueID();
10562 FileID = Status->getUniqueID().getFile();
10563 } else {
10564 // If the inode ID could not be determined, create a hash value
10565 // the current file name and use that as an ID.
10566 FileID = hash_value(std::get<0>(FileIDInfo));
10567 }
10568
10569 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10570 std::get<1>(FileIDInfo));
10571}
10572
// Return the bit position of the least-significant set bit of the MEMBER_OF
// mask, i.e. the amount a member index must be shifted to land in that field.
unsigned OpenMPIRBuilder::getFlagMemberOffset() {
  unsigned Offset = 0;
  // NOTE(review): the operand being cast in this initializer (presumably
  // omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) appears to have been
  // lost in extraction — restore from upstream.
  for (uint64_t Remain =
           static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
       // Scan upward until the first set bit of the mask is found.
       !(Remain & 1); Remain = Remain >> 1)
    Offset++;
  return Offset;
}
10582
// Encode a zero-based member position into the MEMBER_OF bit-field of a map
// flag (positions are stored one-based, hence the +1).
// NOTE(review): the return-type line (presumably
// omp::OpenMPOffloadMappingFlags) appears to have been lost in extraction
// just above this definition — restore from upstream.
OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
  // Rotate by getFlagMemberOffset() bits.
  return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
                                                     << getFlagMemberOffset());
}
10589
// Replace the MEMBER_OF placeholder in \p Flags with the concrete
// \p MemberOfFlag, but only for entries that carry the placeholder.
// NOTE(review): the first parameter line (presumably
// omp::OpenMPOffloadMappingFlags &Flags) appears to have been lost in
// extraction — restore from upstream.
void OpenMPIRBuilder::setCorrectMemberOfFlag(
    omp::OpenMPOffloadMappingFlags MemberOfFlag) {
  // If the entry is PTR_AND_OBJ but has not been marked with the special
  // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
  // marked as MEMBER_OF.
  // NOTE(review): the operands of this comparison (the PTR_AND_OBJ test and
  // the placeholder test) appear to have been partially lost in extraction;
  // only the casts' opening lines survive. Restore from upstream.
  if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
      static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
    return;

  // Reset the placeholder value to prepare the flag for the assignment of the
  // proper MEMBER_OF value.
  Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
  Flags |= MemberOfFlag;
}
10608
// Return (creating on first use) the "_decl_tgt_ref_ptr" indirection variable
// used to reference a declare-target global. An indirection is required for a
// 'link' clause, or for 'to'/'enter' clauses under unified shared memory.
// Returns nullptr when no indirection is needed (including under OpenMP SIMD).
// Newly created pointers are registered via registerTargetGlobalVariable.
Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
  // TODO: convert this to utilise the IRBuilder Config rather than
  // a passed down argument.
  if (OpenMPSIMD)
    return nullptr;

  if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
      ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
        CaptureClause ==
            OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
       Config.hasRequiresUnifiedSharedMemory())) {
    // Build the indirection pointer's name. Non-externally-visible symbols
    // get the file ID mixed in to avoid collisions across TUs.
    SmallString<64> PtrName;
    {
      raw_svector_ostream OS(PtrName);
      OS << MangledName;
      if (!IsExternallyVisible)
        OS << format("_%x", EntryInfo.FileID);
      OS << "_decl_tgt_ref_ptr";
    }

    Value *Ptr = M.getNamedValue(PtrName);

    if (!Ptr) {
      GlobalValue *GlobalValue = M.getNamedValue(MangledName);
      Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);

      auto *GV = cast<GlobalVariable>(Ptr);
      GV->setLinkage(GlobalValue::WeakAnyLinkage);

      // Only the host side initializes the pointer (to the original global,
      // or whatever the caller-supplied initializer produces).
      if (!Config.isTargetDevice()) {
        if (GlobalInitializer)
          GV->setInitializer(GlobalInitializer());
        else
          GV->setInitializer(GlobalValue);
      }

      registerTargetGlobalVariable(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
    }

    return cast<Constant>(Ptr);
  }

  return nullptr;
}
10664
// Register a declare-target global variable with the offload entries manager,
// creating host/device reference indirections where required. Only the 'any'
// device clause is handled; other clauses (and host-only compiles with no
// device triples) are ignored.
void OpenMPIRBuilder::registerTargetGlobalVariable(
    OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
    OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
    bool IsDeclaration, bool IsExternallyVisible,
    TargetRegionEntryInfo EntryInfo, StringRef MangledName,
    std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
    std::vector<Triple> TargetTriple,
    std::function<Constant *()> GlobalInitializer,
    std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
    Constant *Addr) {
  if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
      (TargetTriple.empty() && !Config.isTargetDevice()))
    return;

  OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
  // NOTE(review): declarations of the VarName and Linkage locals (both used
  // below) appear to have been lost in extraction here — restore from
  // upstream.
  int64_t VarSize;

  // 'to'/'enter' without unified shared memory: register the variable itself.
  if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
       CaptureClause ==
           OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
      !Config.hasRequiresUnifiedSharedMemory()) {
    Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
    VarName = MangledName;
    GlobalValue *LlvmVal = M.getNamedValue(VarName);

    // Declarations have no storage; report size 0 for them.
    if (!IsDeclaration)
      VarSize = divideCeil(
          M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
    else
      VarSize = 0;
    Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();

    // This is a workaround carried over from Clang which prevents undesired
    // optimisation of internal variables.
    if (Config.isTargetDevice() &&
        (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
      // Do not create a "ref-variable" if the original is not also available
      // on the host.
      if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
        return;

      std::string RefName = createPlatformSpecificName({VarName, "ref"});

      if (!M.getNamedValue(RefName)) {
        // Keep the original alive on the device via an internal constant ref.
        Constant *AddrRef =
            getOrCreateInternalVariable(Addr->getType(), RefName);
        auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
        GvAddrRef->setConstant(true);
        GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
        GvAddrRef->setInitializer(Addr);
        GeneratedRefs.push_back(GvAddrRef);
      }
    }
  } else {
    // 'link' clause, or 'to'/'enter' under unified shared memory: register
    // the indirection pointer instead of the variable itself.
    if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
    else
      Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;

    if (Config.isTargetDevice()) {
      VarName = (Addr) ? Addr->getName() : "";
      Addr = nullptr;
    } else {
      Addr = getAddrOfDeclareTargetVar(
          CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
          EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
          LlvmPtrTy, GlobalInitializer, VariableLinkage);
      VarName = (Addr) ? Addr->getName() : "";
    }
    VarSize = M.getDataLayout().getPointerSize();
    // NOTE(review): an assignment of Linkage for this branch (presumably
    // WeakAnyLinkage) appears to have been lost in extraction here — restore
    // from upstream.
  }

  OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
                                                      Flags, Linkage);
}
10743
10744/// Loads all the offload entries information from the host IR
10745/// metadata.
10746void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10747 // If we are in target mode, load the metadata from the host IR. This code has
10748 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10749
10750 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10751 if (!MD)
10752 return;
10753
10754 for (MDNode *MN : MD->operands()) {
10755 auto &&GetMDInt = [MN](unsigned Idx) {
10756 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10757 return cast<ConstantInt>(V->getValue())->getZExtValue();
10758 };
10759
10760 auto &&GetMDString = [MN](unsigned Idx) {
10761 auto *V = cast<MDString>(MN->getOperand(Idx));
10762 return V->getString();
10763 };
10764
10765 switch (GetMDInt(0)) {
10766 default:
10767 llvm_unreachable("Unexpected metadata!");
10768 break;
10769 case OffloadEntriesInfoManager::OffloadEntryInfo::
10770 OffloadingEntryInfoTargetRegion: {
10771 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10772 /*DeviceID=*/GetMDInt(1),
10773 /*FileID=*/GetMDInt(2),
10774 /*Line=*/GetMDInt(4),
10775 /*Count=*/GetMDInt(5));
10776 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10777 /*Order=*/GetMDInt(6));
10778 break;
10779 }
10780 case OffloadEntriesInfoManager::OffloadEntryInfo::
10781 OffloadingEntryInfoDeviceGlobalVar:
10782 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10783 /*MangledName=*/GetMDString(1),
10784 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10785 /*Flags=*/GetMDInt(2)),
10786 /*Order=*/GetMDInt(3));
10787 break;
10788 }
10789 }
10790}
10791
// Load offload-entry metadata from the host IR bitcode file at
// \p HostFilePath and merge it via loadOffloadInfoMetadata(Module&).
// A fatal error is reported if the file cannot be opened or parsed.
void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
                                              StringRef HostFilePath) {
  // An empty path means no host IR was provided; nothing to load.
  if (HostFilePath.empty())
    return;

  auto Buf = VFS.getBufferForFile(HostFilePath);
  if (std::error_code Err = Buf.getError()) {
    report_fatal_error(("error opening host file from host file path inside of "
                        "OpenMPIRBuilder: " +
                        Err.message())
                           .c_str());
  }

  LLVMContext Ctx;
  // NOTE(review): the declaration receiving the parsed module (the first
  // line of this statement) appears to have been lost in extraction —
  // restore from upstream.
      Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
  if (std::error_code Err = M.getError()) {
    // NOTE(review): the opening of this report_fatal_error call appears to
    // have been lost in extraction — restore from upstream.
        ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
            .c_str());
  }

  loadOffloadInfoMetadata(*M.get());
}
10816
10817//===----------------------------------------------------------------------===//
10818// OffloadEntriesInfoManager
10819//===----------------------------------------------------------------------===//
10820
10821bool OffloadEntriesInfoManager::empty() const {
10822 return OffloadEntriesTargetRegion.empty() &&
10823 OffloadEntriesDeviceGlobalVar.empty();
10824}
10825
10826unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10827 const TargetRegionEntryInfo &EntryInfo) const {
10828 auto It = OffloadEntriesTargetRegionCount.find(
10829 getTargetRegionEntryCountKey(EntryInfo));
10830 if (It == OffloadEntriesTargetRegionCount.end())
10831 return 0;
10832 return It->second;
10833}
10834
10835void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10836 const TargetRegionEntryInfo &EntryInfo) {
10837 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10838 EntryInfo.Count + 1;
10839}
10840
10841/// Initialize target region entry.
10842void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10843 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10844 OffloadEntriesTargetRegion[EntryInfo] =
10845 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10846 OMPTargetRegionEntryTargetRegion);
10847 ++OffloadingEntriesNum;
10848}
10849
10850void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10851 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10852 OMPTargetRegionEntryKind Flags) {
10853 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10854
10855 // Update the EntryInfo with the next available count for this location.
10856 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10857
10858 // If we are emitting code for a target, the entry is already initialized,
10859 // only has to be registered.
10860 if (OMPBuilder->Config.isTargetDevice()) {
10861 // This could happen if the device compilation is invoked standalone.
10862 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10863 return;
10864 }
10865 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10866 Entry.setAddress(Addr);
10867 Entry.setID(ID);
10868 Entry.setFlags(Flags);
10869 } else {
10870 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10871 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10872 return;
10873 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10874 "Target region entry already registered!");
10875 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10876 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10877 ++OffloadingEntriesNum;
10878 }
10879 incrementTargetRegionEntryInfoCount(EntryInfo);
10880}
10881
10882bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10883 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10884
10885 // Update the EntryInfo with the next available count for this location.
10886 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10887
10888 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10889 if (It == OffloadEntriesTargetRegion.end()) {
10890 return false;
10891 }
10892 // Fail if this entry is already registered.
10893 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
10894 return false;
10895 return true;
10896}
10897
10898void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
10899 const OffloadTargetRegionEntryInfoActTy &Action) {
10900 // Scan all target region entries and perform the provided action.
10901 for (const auto &It : OffloadEntriesTargetRegion) {
10902 Action(It.first, It.second);
10903 }
10904}
10905
10906void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
10907 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
10908 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
10909 ++OffloadingEntriesNum;
10910}
10911
10912void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
10913 StringRef VarName, Constant *Addr, int64_t VarSize,
10914 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
10915 if (OMPBuilder->Config.isTargetDevice()) {
10916 // This could happen if the device compilation is invoked standalone.
10917 if (!hasDeviceGlobalVarEntryInfo(VarName))
10918 return;
10919 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10920 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
10921 if (Entry.getVarSize() == 0) {
10922 Entry.setVarSize(VarSize);
10923 Entry.setLinkage(Linkage);
10924 }
10925 return;
10926 }
10927 Entry.setVarSize(VarSize);
10928 Entry.setLinkage(Linkage);
10929 Entry.setAddress(Addr);
10930 } else {
10931 if (hasDeviceGlobalVarEntryInfo(VarName)) {
10932 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
10933 assert(Entry.isValid() && Entry.getFlags() == Flags &&
10934 "Entry not initialized!");
10935 if (Entry.getVarSize() == 0) {
10936 Entry.setVarSize(VarSize);
10937 Entry.setLinkage(Linkage);
10938 }
10939 return;
10940 }
10941 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10942 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
10943 Addr, VarSize, Flags, Linkage,
10944 VarName.str());
10945 else
10946 OffloadEntriesDeviceGlobalVar.try_emplace(
10947 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
10948 ++OffloadingEntriesNum;
10949 }
10950}
10951
10952void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
10953 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
10954 // Scan all target region entries and perform the provided action.
10955 for (const auto &E : OffloadEntriesDeviceGlobalVar)
10956 Action(E.getKey(), E.getValue());
10957}
10958
10959//===----------------------------------------------------------------------===//
10960// CanonicalLoopInfo
10961//===----------------------------------------------------------------------===//
10962
// Append this loop's six control blocks (preheader, header, cond, latch,
// exit, after) to the output vector.
// NOTE(review): the parameter-list line (presumably a SmallVectorImpl of
// BasicBlock* named BBs, per its uses below) appears to have been lost in
// extraction — restore from upstream.
void CanonicalLoopInfo::collectControlBlocks(
  // We only count those BBs as control block for which we do not need to
  // reverse the CFG, i.e. not the loop body which can contain arbitrary control
  // flow. For consistency, this also means we do not add the Body block, which
  // is just the entry to the body code.
  BBs.reserve(BBs.size() + 6);
  BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
}
10972
10973BasicBlock *CanonicalLoopInfo::getPreheader() const {
10974 assert(isValid() && "Requires a valid canonical loop");
10975 for (BasicBlock *Pred : predecessors(Header)) {
10976 if (Pred != Latch)
10977 return Pred;
10978 }
10979 llvm_unreachable("Missing preheader");
10980}
10981
10982void CanonicalLoopInfo::setTripCount(Value *TripCount) {
10983 assert(isValid() && "Requires a valid canonical loop");
10984
10985 Instruction *CmpI = &getCond()->front();
10986 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
10987 CmpI->setOperand(1, TripCount);
10988
10989#ifndef NDEBUG
10990 assertOK();
10991#endif
10992}
10993
10994void CanonicalLoopInfo::mapIndVar(
10995 llvm::function_ref<Value *(Instruction *)> Updater) {
10996 assert(isValid() && "Requires a valid canonical loop");
10997
10998 Instruction *OldIV = getIndVar();
10999
11000 // Record all uses excluding those introduced by the updater. Uses by the
11001 // CanonicalLoopInfo itself to keep track of the number of iterations are
11002 // excluded.
11003 SmallVector<Use *> ReplacableUses;
11004 for (Use &U : OldIV->uses()) {
11005 auto *User = dyn_cast<Instruction>(U.getUser());
11006 if (!User)
11007 continue;
11008 if (User->getParent() == getCond())
11009 continue;
11010 if (User->getParent() == getLatch())
11011 continue;
11012 ReplacableUses.push_back(&U);
11013 }
11014
11015 // Run the updater that may introduce new uses
11016 Value *NewIV = Updater(OldIV);
11017
11018 // Replace the old uses with the value returned by the updater.
11019 for (Use *U : ReplacableUses)
11020 U->set(NewIV);
11021
11022#ifndef NDEBUG
11023 assertOK();
11024#endif
11025}
11026
// Debug-only structural validation of the canonical loop CFG:
// preheader -> header -> cond -> {body, exit}; body ... -> latch -> header;
// exit -> after; plus the canonical induction variable (integer PHI starting
// at 0, incremented by 1 via the latch) compared ULT against the trip count.
void CanonicalLoopInfo::assertOK() const {
#ifndef NDEBUG
  // No constraints if this object currently does not describe a loop.
  if (!isValid())
    return;

  BasicBlock *Preheader = getPreheader();
  BasicBlock *Body = getBody();
  BasicBlock *After = getAfter();

  // Verify standard control-flow we use for OpenMP loops.
  assert(Preheader);
  assert(isa<BranchInst>(Preheader->getTerminator()) &&
         "Preheader must terminate with unconditional branch");
  assert(Preheader->getSingleSuccessor() == Header &&
         "Preheader must jump to header");

  assert(Header);
  assert(isa<BranchInst>(Header->getTerminator()) &&
         "Header must terminate with unconditional branch");
  assert(Header->getSingleSuccessor() == Cond &&
         "Header must jump to exiting block");

  assert(Cond);
  assert(Cond->getSinglePredecessor() == Header &&
         "Exiting block only reachable from header");

  assert(isa<BranchInst>(Cond->getTerminator()) &&
         "Exiting block must terminate with conditional branch");
  assert(size(successors(Cond)) == 2 &&
         "Exiting block must have two successors");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
         "Exiting block's first successor jump to the body");
  assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
         "Exiting block's second successor must exit the loop");

  assert(Body);
  assert(Body->getSinglePredecessor() == Cond &&
         "Body only reachable from exiting block");
  assert(!isa<PHINode>(Body->front()));

  assert(Latch);
  // NOTE(review): the first line of this assert (presumably checking that the
  // latch terminator is a BranchInst) appears to have been lost in
  // extraction — restore from upstream.
         "Latch must terminate with unconditional branch");
  assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
  // TODO: To support simple redirecting of the end of the body code that has
  // multiple; introduce another auxiliary basic block like preheader and after.
  assert(Latch->getSinglePredecessor() != nullptr);
  assert(!isa<PHINode>(Latch->front()));

  assert(Exit);
  assert(isa<BranchInst>(Exit->getTerminator()) &&
         "Exit block must terminate with unconditional branch");
  assert(Exit->getSingleSuccessor() == After &&
         "Exit block must jump to after block");

  assert(After);
  assert(After->getSinglePredecessor() == Exit &&
         "After block only reachable from exit block");
  assert(After->empty() || !isa<PHINode>(After->front()));

  // The canonical induction variable: an integer PHI in the header, entering
  // as zero from the preheader and re-entering from the latch.
  Instruction *IndVar = getIndVar();
  assert(IndVar && "Canonical induction variable not found?");
  assert(isa<IntegerType>(IndVar->getType()) &&
         "Induction variable must be an integer");
  assert(cast<PHINode>(IndVar)->getParent() == Header &&
         "Induction variable must be a PHI in the loop header");
  assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
  assert(
      cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
  assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);

  // The latch's incoming value must be exactly IndVar + 1.
  auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
  assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
  assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
  assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
  assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
             ->isOne());

  Value *TripCount = getTripCount();
  assert(TripCount && "Loop trip count not found?");
  assert(IndVar->getType() == TripCount->getType() &&
         "Trip count and induction variable must have the same type");

  auto *CmpI = cast<CmpInst>(&Cond->front());
  // NOTE(review): the predicate checked is ICMP_ULT (unsigned), but the
  // message below says "signed" — the message text looks stale; confirm
  // against upstream before changing either.
  assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
         "Exit condition must be a signed less-than comparison");
  assert(CmpI->getOperand(0) == IndVar &&
         "Exit condition must compare the induction variable");
  assert(CmpI->getOperand(1) == TripCount &&
         "Exit condition must compare with the trip count");
#endif
}
11120
11121void CanonicalLoopInfo::invalidate() {
11122 Header = nullptr;
11123 Cond = nullptr;
11124 Latch = nullptr;
11125 Exit = nullptr;
11126}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does no...
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
unsigned getNumSuccessors() const
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setSuccessor(unsigned idx, BasicBlock *NewSucc)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:715
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:131
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:124
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:640
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:447
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:668
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if ...
Definition Globals.cpp:524
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2788
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:58
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:228
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1064
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1126
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:413
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1142
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "New" if the callback ShouldRep...
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:701
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
@ CE
Windows NT (Windows on ARM)
Definition MCAsmInfo.h:48
initializer< Ty > init(const Ty &Val)
@ Switch
The "resume-switch" lowering, where there are separate resume and destroy functions that are shared b...
Definition CoroShape.h:31
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID if the device was not defined, runtime should get it from environment variables in the spec...
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< FuncNode * > Func
Definition RDFGraph.h:393
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1655
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2472
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:360
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:296
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2136
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:632
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
FunctionAddr VTableAddr Count
Definition InstrProf.h:139
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...