//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before something is inserted.
/// For instance, if both point to the same instruction, two IRBuilders
/// alternating creating instructions will cause the instructions to be
/// interleaved.
static bool isConflictIP(IRBuilderBase::InsertPoint IP1,
                         IRBuilderBase::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
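
// Illustrative example (not from the original source): saving the same insert
// point into two builders makes them conflict.
//
//   IRBuilder<> B1(&BB, It), B2(&BB, It); // same block, same iterator
//   B1.CreateRetVoid();                   // inserting via B1 silently changes
//                                         // what B2's saved point refers to
//
// isConflictIP(B1.saveIP(), B2.saveIP()) returns true for this pair.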

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
  case OMPScheduleType::OrderedDistributeChunked:
  case OMPScheduleType::OrderedDistribute:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic block
/// if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  case OMP_SCHEDULE_Distribute:
    return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
                                 : OMPScheduleType::BaseDistribute;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic contradict each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set the flag.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
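
// Worked example (illustrative): `schedule(dynamic)` without modifiers maps to
// BaseDynamicChunked, gains ModifierUnordered because there is no ordered
// clause, and then gains ModifierNonmonotonic because it is neither static nor
// ordered. Plain `schedule(static)` keeps the runtime's implicit monotonic
// behavior, so no monotonicity flag is added.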

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause,
                          bool HasDistScheduleChunks) {
  OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  return Result;
}

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This is
  //    not the case here: the `Old` block is still being used, e.g. a branch
  //    instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}
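
// Illustrative example: given `BB: %a = ...; %b = ...; br %succ` and an insert
// point before %b, splitBB(IP, /*CreateBranch=*/true, DL) produces
//
//   BB:   %a = ...
//         br label %New
//   New:  %b = ...
//         br %succ
//
// and PHI nodes in %succ that referred to BB are rewired to New.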

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true) {
  Builder.restoreIP(OuterAllocaIP);
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal =
        Builder.CreateLoad(Builder.getInt32Ty(), FakeVal, Name + ".use");
  } else {
    UseFakeVal =
        cast<BinaryOperator>(Builder.CreateAdd(FakeVal, Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
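
// Sketch of the generated IR (illustrative, for AsPtr == false, Name == "x"):
//
//   ; at OuterAllocaIP
//   %x.addr = alloca i32
//   %x.val = load i32, ptr %x.addr
//   ; at InnerAllocaIP
//   %0 = add i32 %x.val, 10
//
// All of these are recorded in ToBeDeleted and erased once outlining no longer
// needs the fake value.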

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {

/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}
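
// Note (illustrative): each OMP_RTL_ATTRS entry in OMPKinds.def provides one
// attribute set for the function itself, one for the return value, and one per
// parameter. The addAttrSet lambda above exists because a generic signext or
// zeroext on an i32 argument must be replaced by whatever extension attribute
// the current target actually requires, queried via TargetLibraryInfo.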

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }
    Fn->setCallingConv(Config.getRuntimeCC());
    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
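
// The callback annotation above corresponds to IR roughly like the following
// (sketch):
//
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}
//
// i.e. argument 2 is the callback callee, its first two parameters are
// unknown, and all variadic arguments are forwarded to it.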

Expected<BasicBlock *>
OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) {
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    IRBuilderBase::InsertPointGuard Guard(Builder);
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}

Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder,
                                                     BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  auto EndIt = FiniBB->end();
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(FunctionCallee Callee,
                                                     ArrayRef<Value *> Args,
                                                     StringRef Name) {
  CallInst *Call = Builder.CreateCall(Callee, Args, Name);
  Call->setCallingConv(Config.getRuntimeCC());
  return Call;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not been finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target function, or we risk malformed optimisations by later passes. This
  // is only relevant for the device pass, which appears to be a little more
  // delicate when it comes to optimisations (however, we do not block on that
  // here; it's up to the inserter to the list to do so).
  // This notably has to occur after the OutlinedInfo candidates have been
  // extracted, so that the end product is not implicitly adversely affected by
  // any raises, unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExpr's with further effort; however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads, otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}
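
// Illustrative example: createGlobalFlag(0, "__omp_rtl_debug_kind") yields
//
//   @__omp_rtl_debug_kind = weak_odr hidden constant i32 0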

void OpenMPIRBuilder::emitUsed(StringRef Name, ArrayRef<WeakTrackingVH> List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         omp::OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for an existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}
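
// The resulting global takes a form roughly like the following (sketch, for
// the default location string):
//
//   @.str = private unnamed_addr constant [23 x i8] c";unknown;unknown;0;0;;\00"
//   @0 = private unnamed_addr constant %struct.ident_t
//            { i32 0, i32 2, i32 0, i32 22, ptr @.str }, align 8
//
// where the `i32 2` is the OMP_IDENT_FLAG_KMPC ("C-mode") flag set above.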

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for an existing encoding of the location, not needed but minimizes
    // the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
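
// Example: a reference to function `foo` at file.c:12, column 3 is encoded as
// ";file.c;foo;12;3;;".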

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
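
// The emitted sequence is roughly (sketch):
//
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   call void @__kmpc_barrier(ptr @ident, i32 %gtid)
//
// with @__kmpc_cancel_barrier substituted, and its i32 result routed through
// emitCancelationCheckImpl, inside a cancellable region.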

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition) {
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

    // Even if the if condition evaluates to false, this should count as a
    // cancellation point.
    Builder.SetInsertPoint(ElseTI);
    auto ElseIP = Builder.saveIP();

    InsertPointOrErrorTy IPOrErr = createCancellationPoint(
        LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
    if (!IPOrErr)
      return IPOrErr;
  }

  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancellationPoint(const LocationDescription &Loc,
                                         omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                       \
  case DirectiveEnum:                                                          \
    CancelKind = Builder.getInt32(Value);                                      \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply calls the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}
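
// Control flow produced above (sketch):
//
//          +--[%flag == 0]--> %cont (NonCancellationBlock, codegen continues)
//   %bb --+
//          +--[%flag != 0]--> %bb.cncl --> FiniBB (finalization, exit stub)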

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_60).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add an alloca for the kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_60.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_60 call.
  Value *Parallel60CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars),
      /* strict for number of threads */ Builder.getInt32(0)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
1456
1457// Callback used to create OpenMP runtime calls to support
1458// omp parallel clause for the host.
1459// We need to use this callback to replace call to the OutlinedFn in OuterFn
1460// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1461static void
1462hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
1463 Function *OuterFn, Value *Ident, Value *IfCondition,
1464 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1465 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1466 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1467 FunctionCallee RTLFn;
1468 if (IfCondition) {
1469 RTLFn =
1470 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1471 } else {
1472 RTLFn =
1473 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1474 }
1475 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1476 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1477 LLVMContext &Ctx = F->getContext();
1478 MDBuilder MDB(Ctx);
1479 // Annotate the callback behavior of the __kmpc_fork_call:
1480 // - The callback callee is argument number 2 (microtask).
1481 // - The first two arguments of the callback callee are unknown (-1).
1482 // - All variadic arguments to the __kmpc_fork_call are passed to the
1483 // callback callee.
1484 F->addMetadata(LLVMContext::MD_callback,
1486 2, {-1, -1},
1487 /* VarArgsArePassed */ true)}));
1488 }
1489 }
1490 // Add some known attributes.
1491 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1492 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1493 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1494
1495 assert(OutlinedFn.arg_size() >= 2 &&
1496 "Expected at least tid and bounded tid as arguments");
1497 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1498
1499 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1500 CI->getParent()->setName("omp_parallel");
1501 Builder.SetInsertPoint(CI);
1502
1503 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1504 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1505 &OutlinedFn};
1506
1507 SmallVector<Value *, 16> RealArgs;
1508 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1509 if (IfCondition) {
1510 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1511 RealArgs.push_back(Cond);
1512 }
1513 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1514
1515 // __kmpc_fork_call_if always expects a void ptr as the last argument
1516 // If there are no arguments, pass a null pointer.
1517 auto PtrTy = OMPIRBuilder->VoidPtr;
1518 if (IfCondition && NumCapturedVars == 0) {
1519 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1520 RealArgs.push_back(NullPtrValue);
1521 }
1522
1523 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1524
1525 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1526 << *Builder.GetInsertBlock()->getParent() << "\n");
1527
1528 // Initialize the local TID stack location with the argument value.
1529 Builder.SetInsertPoint(PrivTID);
1530 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1531 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1532 PrivTIDAddr);
1533
1534 // Remove redundant call to the outlined function.
1535 CI->eraseFromParent();
1536
1537 for (Instruction *I : ToBeDeleted) {
1538 I->eraseFromParent();
1539 }
1540}
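// An illustrative sketch (names hypothetical, not emitted verbatim): for a
// parallel region capturing two locals, the rewrite above conceptually turns
// \code{c}
// outlined_omp_par(&tid, &bound_tid, &a, &b); /* stale call to OutlinedFn */
// \endcode
// into
// \code{c}
// __kmpc_fork_call(&loc, /*nargs=*/2, &outlined_omp_par, &a, &b);
// /* or, with an if clause (condition passed as i32): */
// __kmpc_fork_call_if(&loc, 2, &outlined_omp_par, cond, &a, &b);
// \endcode
// The runtime spawns the team and invokes the microtask with the global and
// bound thread id pointers as the leading arguments.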
1541
1542OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1543 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1544 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1545 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1546 omp::ProcBindKind ProcBind, bool IsCancellable) {
1547 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1548
1549 if (!updateToLocation(Loc))
1550 return Loc.IP;
1551
1552 uint32_t SrcLocStrSize;
1553 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1554 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1555 Value *ThreadID = getOrCreateThreadID(Ident);
1556 // If we generate code for the target device, we need to allocate the
1557 // struct for aggregate params in the device default alloca address space.
1558 // The OpenMP runtime requires that the params of the extracted functions
1559 // are passed as zero address space pointers. This flag ensures that
1560 // extracted function arguments are declared in the zero address space.
1561 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1562
1563 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1564 // only if we compile for host side.
1565 if (NumThreads && !Config.isTargetDevice()) {
1566 Value *Args[] = {
1567 Ident, ThreadID,
1568 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1569 createRuntimeFunctionCall(
1570 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1571 }
1572
1573 if (ProcBind != OMP_PROC_BIND_default) {
1574 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1575 Value *Args[] = {
1576 Ident, ThreadID,
1577 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1578 createRuntimeFunctionCall(
1579 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1580 }
1581
1582 BasicBlock *InsertBB = Builder.GetInsertBlock();
1583 Function *OuterFn = InsertBB->getParent();
1584
1585 // Save the outer alloca block because the insertion iterator may get
1586 // invalidated and we still need this later.
1587 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1588
1589 // Vector to remember instructions we used only during the modeling but which
1590 // we want to delete at the end.
1591 SmallVector<Instruction *, 4> ToBeDeleted;
1592
1593 // Change the location to the outer alloca insertion point to create and
1594 // initialize the allocas we pass into the parallel region.
1595 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1596 Builder.restoreIP(NewOuter);
1597 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1598 AllocaInst *ZeroAddrAlloca =
1599 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1600 Instruction *TIDAddr = TIDAddrAlloca;
1601 Instruction *ZeroAddr = ZeroAddrAlloca;
1602 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1603 // Add additional casts to enforce pointers in zero address space
1604 TIDAddr = new AddrSpaceCastInst(
1605 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1606 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1607 ToBeDeleted.push_back(TIDAddr);
1608 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1609 PointerType::get(M.getContext(), 0),
1610 "zero.addr.ascast");
1611 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1612 ToBeDeleted.push_back(ZeroAddr);
1613 }
1614
1615 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1616 // associated arguments in the outlined function, so we delete them later.
1617 ToBeDeleted.push_back(TIDAddrAlloca);
1618 ToBeDeleted.push_back(ZeroAddrAlloca);
1619
1620 // Create an artificial insertion point that will also ensure the blocks we
1621 // are about to split are not degenerated.
1622 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1623
1624 BasicBlock *EntryBB = UI->getParent();
1625 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1626 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1627 BasicBlock *PRegPreFiniBB =
1628 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1629 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1630
1631 auto FiniCBWrapper = [&](InsertPointTy IP) {
1632 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1633 // target to the region exit block.
1634 if (IP.getBlock()->end() == IP.getPoint()) {
1635 IRBuilder<>::InsertPointGuard IPG(Builder);
1636 Builder.restoreIP(IP);
1637 Instruction *I = Builder.CreateBr(PRegExitBB);
1638 IP = InsertPointTy(I->getParent(), I->getIterator());
1639 }
1640 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1641 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1642 "Unexpected insertion point for finalization call!");
1643 return FiniCB(IP);
1644 };
1645
1646 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1647
1648 // Generate the privatization allocas in the block that will become the entry
1649 // of the outlined function.
1650 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1651 InsertPointTy InnerAllocaIP = Builder.saveIP();
1652
1653 AllocaInst *PrivTIDAddr =
1654 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1655 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1656
1657 // Add some fake uses for OpenMP provided arguments.
1658 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1659 Instruction *ZeroAddrUse =
1660 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1661 ToBeDeleted.push_back(ZeroAddrUse);
1662
1663 // EntryBB
1664 // |
1665 // V
1666 // PRegionEntryBB <- Privatization allocas are placed here.
1667 // |
1668 // V
1669 // PRegionBodyBB <- BodyGen is invoked here.
1670 // |
1671 // V
1672 // PRegPreFiniBB <- The block we will start finalization from.
1673 // |
1674 // V
1675 // PRegionExitBB <- A common exit to simplify block collection.
1676 //
1677
1678 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1679
1680 // Let the caller create the body.
1681 assert(BodyGenCB && "Expected body generation callback!");
1682 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1683 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1684 return Err;
1685
1686 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1687
1688 OutlineInfo OI;
1689 if (Config.isTargetDevice()) {
1690 // Generate OpenMP target specific runtime call
1691 OI.PostOutlineCB = [=, ToBeDeletedVec =
1692 std::move(ToBeDeleted)](Function &OutlinedFn) {
1693 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1694 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1695 ThreadID, ToBeDeletedVec);
1696 };
1697 } else {
1698 // Generate OpenMP host runtime call
1699 OI.PostOutlineCB = [=, ToBeDeletedVec =
1700 std::move(ToBeDeleted)](Function &OutlinedFn) {
1701 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1702 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1703 };
1704 }
1705
1706 OI.OuterAllocaBB = OuterAllocaBlock;
1707 OI.EntryBB = PRegEntryBB;
1708 OI.ExitBB = PRegExitBB;
1709
1710 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1711 SmallVector<BasicBlock *, 32> Blocks;
1712 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1713
1714 CodeExtractorAnalysisCache CEAC(*OuterFn);
1715 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1716 /* AggregateArgs */ false,
1717 /* BlockFrequencyInfo */ nullptr,
1718 /* BranchProbabilityInfo */ nullptr,
1719 /* AssumptionCache */ nullptr,
1720 /* AllowVarArgs */ true,
1721 /* AllowAlloca */ true,
1722 /* AllocationBlock */ OuterAllocaBlock,
1723 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1724
1725 // Find inputs to, outputs from the code region.
1726 BasicBlock *CommonExit = nullptr;
1727 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1728 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1729
1730 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1731 /*CollectGlobalInputs=*/true);
1732
1733 Inputs.remove_if([&](Value *I) {
1734 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1735 return GV->getValueType() == OpenMPIRBuilder::Ident;
1736
1737 return false;
1738 });
1739
1740 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1741
1742 FunctionCallee TIDRTLFn =
1743 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1744
1745 auto PrivHelper = [&](Value &V) -> Error {
1746 if (&V == TIDAddr || &V == ZeroAddr) {
1747 OI.ExcludeArgsFromAggregate.push_back(&V);
1748 return Error::success();
1749 }
1750
1751 SetVector<Use *> Uses;
1752 for (Use &U : V.uses())
1753 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1754 if (ParallelRegionBlockSet.count(UserI->getParent()))
1755 Uses.insert(&U);
1756
1757 // __kmpc_fork_call expects extra arguments as pointers. If the input
1758 // already has a pointer type, everything is fine. Otherwise, store the
1759 // value onto stack and load it back inside the to-be-outlined region. This
1760 // will ensure only the pointer will be passed to the function.
1761 // FIXME: if there are more than 15 trailing arguments, they must be
1762 // additionally packed in a struct.
1763 Value *Inner = &V;
1764 if (!V.getType()->isPointerTy()) {
1765 IRBuilder<>::InsertPointGuard Guard(Builder);
1766 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1767
1768 Builder.restoreIP(OuterAllocaIP);
1769 Value *Ptr =
1770 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1771
1772 // Store to stack at end of the block that currently branches to the entry
1773 // block of the to-be-outlined region.
1774 Builder.SetInsertPoint(InsertBB,
1775 InsertBB->getTerminator()->getIterator());
1776 Builder.CreateStore(&V, Ptr);
1777
1778 // Load back next to allocations in the to-be-outlined region.
1779 Builder.restoreIP(InnerAllocaIP);
1780 Inner = Builder.CreateLoad(V.getType(), Ptr);
1781 }
1782
1783 Value *ReplacementValue = nullptr;
1784 CallInst *CI = dyn_cast<CallInst>(&V);
1785 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1786 ReplacementValue = PrivTID;
1787 } else {
1788 InsertPointOrErrorTy AfterIP =
1789 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1790 if (!AfterIP)
1791 return AfterIP.takeError();
1792 Builder.restoreIP(*AfterIP);
1793 InnerAllocaIP = {
1794 InnerAllocaIP.getBlock(),
1795 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1796
1797 assert(ReplacementValue &&
1798 "Expected copy/create callback to set replacement value!");
1799 if (ReplacementValue == &V)
1800 return Error::success();
1801 }
1802
1803 for (Use *UPtr : Uses)
1804 UPtr->set(ReplacementValue);
1805
1806 return Error::success();
1807 };
1808
1809 // Reset the inner alloca insertion as it will be used for loading the values
1810 // wrapped into pointers before passing them into the to-be-outlined region.
1811 // Configure it to insert immediately after the fake use of zero address so
1812 // that they are available in the generated body and so that the
1813 // OpenMP-related values (thread ID and zero address pointers) remain leading
1814 // in the argument list.
1815 InnerAllocaIP = IRBuilder<>::InsertPoint(
1816 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1817
1818 // Reset the outer alloca insertion point to the entry of the relevant block
1819 // in case it was invalidated.
1820 OuterAllocaIP = IRBuilder<>::InsertPoint(
1821 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1822
1823 for (Value *Input : Inputs) {
1824 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1825 if (Error Err = PrivHelper(*Input))
1826 return Err;
1827 }
1828 LLVM_DEBUG({
1829 for (Value *Output : Outputs)
1830 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1831 });
1832 assert(Outputs.empty() &&
1833 "OpenMP outlining should not produce live-out values!");
1834
1835 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1836 LLVM_DEBUG({
1837 for (auto *BB : Blocks)
1838 dbgs() << " PBR: " << BB->getName() << "\n";
1839 });
1840
1841 // Adjust the finalization stack, verify the adjustment, and call the
1842 // finalize function a last time to finalize values between the pre-fini
1843 // block and the exit block if we left the parallel "the normal way".
1844 auto FiniInfo = FinalizationStack.pop_back_val();
1845 (void)FiniInfo;
1846 assert(FiniInfo.DK == OMPD_parallel &&
1847 "Unexpected finalization stack state!");
1848
1849 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1850
1851 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1852 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1853 if (!FiniBBOrErr)
1854 return FiniBBOrErr.takeError();
1855 {
1856 IRBuilderBase::InsertPointGuard Guard(Builder);
1857 Builder.restoreIP(PreFiniIP);
1858 Builder.CreateBr(*FiniBBOrErr);
1859 // There's currently a branch to omp.par.exit. Delete it; we will get there
1860 // via the fini block.
1861 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1862 Term->eraseFromParent();
1863 }
1864
1865 // Register the outlined info.
1866 addOutlineInfo(std::move(OI));
1867
1868 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1869 UI->eraseFromParent();
1870
1871 return AfterIP;
1872}
1873
1874void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1875 // Build call void __kmpc_flush(ident_t *loc)
1876 uint32_t SrcLocStrSize;
1877 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1878 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1879
1880 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
1881 Args);
1882}
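// A minimal sketch of the emitted call, assuming the ident_t setup above:
// \code{c}
// __kmpc_flush(&loc); /* #pragma omp flush lowers to this single call */
// \endcode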
1883
1884void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1885 if (!updateToLocation(Loc))
1886 return;
1887 emitFlush(Loc);
1888}
1889
1890void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1891 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1892 // global_tid);
1893 uint32_t SrcLocStrSize;
1894 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1895 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1896 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1897
1898 // Ignore return result until untied tasks are supported.
1899 createRuntimeFunctionCall(
1900 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1901}
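// A minimal sketch of the emitted call; the kmp_int32 result is dropped
// until untied tasks are supported:
// \code{c}
// (void)__kmpc_omp_taskwait(&loc, gtid); /* #pragma omp taskwait */
// \endcode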
1902
1903void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1904 if (!updateToLocation(Loc))
1905 return;
1906 emitTaskwaitImpl(Loc);
1907}
1908
1909void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1910 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1911 uint32_t SrcLocStrSize;
1912 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1913 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1914 Constant *I32Null = ConstantInt::getNullValue(Int32);
1915 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1916
1917 createRuntimeFunctionCall(
1918 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1919}
1920
1921void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1922 if (!updateToLocation(Loc))
1923 return;
1924 emitTaskyieldImpl(Loc);
1925}
1926
1927 // Processes the dependencies in Dependencies and does the following:
1928 // - Allocates stack space for an array of DependInfo objects.
1929 // - Populates each DependInfo object with the relevant information from
1930 // the corresponding dependence.
1931 // - All code is inserted in the entry block of the current function.
1932 static Value *emitTaskDependencies(
1933 OpenMPIRBuilder &OMPBuilder,
1934 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1935 // Early return if we have no dependencies to process
1936 if (Dependencies.empty())
1937 return nullptr;
1938
1939 // Given a vector of DependData objects, in this function we create an
1940 // array on the stack that holds kmp_depend_info objects corresponding
1941 // to each dependency. This is then passed to the OpenMP runtime.
1942 // For example, if there are 'n' dependencies then the following pseudo
1943 // code is generated. Assume the first dependence is on a variable 'a'.
1944 //
1945 // \code{c}
1946 // DepArray = alloc(n x sizeof(kmp_depend_info));
1947 // idx = 0;
1948 // DepArray[idx].base_addr = ptrtoint(&a);
1949 // DepArray[idx].len = 8;
1950 // DepArray[idx].flags = Dep.DepKind; /* (See OMPConstants.h for DepKind) */
1951 // ++idx;
1952 // DepArray[idx].base_addr = ...;
1953 // \endcode
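// The stores below populate the fields in this order; a sketch of the
// runtime record they fill in (layout assumed to match kmp_depend_info in
// the OpenMP runtime's kmp.h):
// \code{c}
// struct kmp_depend_info {
//   intptr_t base_addr;  /* address of the dependence variable */
//   size_t len;          /* store size of the variable in bytes */
//   unsigned char flags; /* dependence kind: in/out/inout/... */
// };
// \endcode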
1954
1955 IRBuilderBase &Builder = OMPBuilder.Builder;
1956 Type *DependInfo = OMPBuilder.DependInfo;
1957 Module &M = OMPBuilder.M;
1958
1959 Value *DepArray = nullptr;
1960 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1961 Builder.SetInsertPoint(
1962 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1963
1964 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1965 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
1966
1967 Builder.restoreIP(OldIP);
1968
1969 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
1970 Value *Base =
1971 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
1972 // Store the pointer to the variable
1973 Value *Addr = Builder.CreateStructGEP(
1974 DependInfo, Base,
1975 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
1976 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
1977 Builder.CreateStore(DepValPtr, Addr);
1978 // Store the size of the variable
1979 Value *Size = Builder.CreateStructGEP(
1980 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
1981 Builder.CreateStore(
1982 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1983 Size);
1984 // Store the dependency kind
1985 Value *Flags = Builder.CreateStructGEP(
1986 DependInfo, Base,
1987 static_cast<unsigned int>(RTLDependInfoFields::Flags));
1988 Builder.CreateStore(
1989 ConstantInt::get(Builder.getInt8Ty(),
1990 static_cast<unsigned int>(Dep.DepKind)),
1991 Flags);
1992 }
1993 return DepArray;
1994}
1995
1996OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
1997 const LocationDescription &Loc, InsertPointTy AllocaIP,
1998 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
1999 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
2000 Value *Priority) {
2001
2002 if (!updateToLocation(Loc))
2003 return InsertPointTy();
2004
2005 uint32_t SrcLocStrSize;
2006 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2007 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2008 // The current basic block is split into four basic blocks. After outlining,
2009 // they will be mapped as follows:
2010 // ```
2011 // def current_fn() {
2012 // current_basic_block:
2013 // br label %task.exit
2014 // task.exit:
2015 // ; instructions after task
2016 // }
2017 // def outlined_fn() {
2018 // task.alloca:
2019 // br label %task.body
2020 // task.body:
2021 // ret void
2022 // }
2023 // ```
2024 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2025 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2026 BasicBlock *TaskAllocaBB =
2027 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2028
2029 InsertPointTy TaskAllocaIP =
2030 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2031 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2032 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2033 return Err;
2034
2035 OutlineInfo OI;
2036 OI.EntryBB = TaskAllocaBB;
2037 OI.OuterAllocaBB = AllocaIP.getBlock();
2038 OI.ExitBB = TaskExitBB;
2039
2040 // Add the thread ID argument.
2041 SmallVector<Instruction *, 4> ToBeDeleted;
2042 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2043 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2044
2045 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2046 Mergeable, Priority, EventHandle, TaskAllocaBB,
2047 ToBeDeleted](Function &OutlinedFn) mutable {
2048 // Replace the stale CI with the appropriate RTL function call.
2049 assert(OutlinedFn.hasOneUse() &&
2050 "there must be a single user for the outlined function");
2051 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2052
2053 // HasShareds is true if any variables are captured in the outlined region,
2054 // false otherwise.
2055 bool HasShareds = StaleCI->arg_size() > 1;
2056 Builder.SetInsertPoint(StaleCI);
2057
2058 // Gather the arguments for emitting the runtime call for
2059 // @__kmpc_omp_task_alloc
2060 Function *TaskAllocFn =
2061 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2062
2063 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2064 // call.
2065 Value *ThreadID = getOrCreateThreadID(Ident);
2066
2067 // Argument - `flags`
2068 // Task is tied iff (Flags & 1) == 1.
2069 // Task is untied iff (Flags & 1) == 0.
2070 // Task is final iff (Flags & 2) == 2.
2071 // Task is not final iff (Flags & 2) == 0.
2072 // Task is mergeable iff (Flags & 4) == 4.
2073 // Task is not mergeable iff (Flags & 4) == 0.
2074 // Task is priority iff (Flags & 32) == 32.
2075 // Task is not priority iff (Flags & 32) == 0.
2076 // TODO: Handle the other flags.
2077 Value *Flags = Builder.getInt32(Tied);
2078 if (Final) {
2079 Value *FinalFlag =
2080 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2081 Flags = Builder.CreateOr(FinalFlag, Flags);
2082 }
2083
2084 if (Mergeable)
2085 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2086 if (Priority)
2087 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
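// For illustration, the flag word for a tied task that is also final,
// mergeable, and prioritized comes out as:
// \code{c}
// unsigned flags = /*tied*/ 1 | /*final*/ 2 | /*mergeable*/ 4 |
//                  /*priority*/ 32; /* == 39 */
// \endcode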
2088
2089 // Argument - `sizeof_kmp_task_t` (TaskSize)
2090 // TaskSize refers to the size in bytes of the kmp_task_t data structure,
2091 // including private vars accessed in the task.
2092 // TODO: add kmp_task_t_with_privates (privates)
2093 Value *TaskSize = Builder.getInt64(
2094 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2095
2096 // Argument - `sizeof_shareds` (SharedsSize)
2097 // SharedsSize refers to the shareds array size in the kmp_task_t data
2098 // structure.
2099 Value *SharedsSize = Builder.getInt64(0);
2100 if (HasShareds) {
2101 AllocaInst *ArgStructAlloca =
2102 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2103 assert(ArgStructAlloca &&
2104 "Unable to find the alloca instruction corresponding to arguments "
2105 "for extracted function");
2106 StructType *ArgStructType =
2107 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2108 assert(ArgStructType && "Unable to find struct type corresponding to "
2109 "arguments for extracted function");
2110 SharedsSize =
2111 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2112 }
2113 // Emit the @__kmpc_omp_task_alloc runtime call
2114 // The runtime call returns a pointer to an area where the task captured
2115 // variables must be copied before the task is run (TaskData)
2116 CallInst *TaskData = createRuntimeFunctionCall(
2117 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2118 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2119 /*task_func=*/&OutlinedFn});
2120
2121 // Emit detach clause initialization.
2122 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2123 // task_descriptor);
2124 if (EventHandle) {
2125 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2126 OMPRTL___kmpc_task_allow_completion_event);
2127 llvm::Value *EventVal =
2128 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2129 llvm::Value *EventHandleAddr =
2130 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2131 Builder.getPtrTy(0));
2132 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2133 Builder.CreateStore(EventVal, EventHandleAddr);
2134 }
2135 // Copy the arguments for the outlined function.
2136 if (HasShareds) {
2137 Value *Shareds = StaleCI->getArgOperand(1);
2138 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2139 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2140 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2141 SharedsSize);
2142 }
2143
2144 if (Priority) {
2145 //
2146 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *";
2147 // we populate the priority information into the "kmp_task_t" here.
2148 //
2149 // The struct "kmp_task_t" definition is available in kmp.h
2150 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2151 // data2 is used for priority
2152 //
2153 Type *Int32Ty = Builder.getInt32Ty();
2154 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2155 // kmp_task_t* => { ptr }
2156 Type *TaskPtr = StructType::get(VoidPtr);
2157 Value *TaskGEP =
2158 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2159 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2160 Type *TaskStructType = StructType::get(
2161 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2162 Value *PriorityData = Builder.CreateInBoundsGEP(
2163 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2164 // kmp_cmplrdata_t => { ptr, ptr }
2165 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2166 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2167 PriorityData, {Zero, Zero});
2168 Builder.CreateStore(Priority, CmplrData);
2169 }
2170
2171 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2172
2173 // In the presence of the `if` clause, the following IR is generated:
2174 // ...
2175 // %data = call @__kmpc_omp_task_alloc(...)
2176 // br i1 %if_condition, label %then, label %else
2177 // then:
2178 // call @__kmpc_omp_task(...)
2179 // br label %exit
2180 // else:
2181 // ;; Wait for resolution of dependencies, if any, before
2182 // ;; beginning the task
2183 // call @__kmpc_omp_wait_deps(...)
2184 // call @__kmpc_omp_task_begin_if0(...)
2185 // call @outlined_fn(...)
2186 // call @__kmpc_omp_task_complete_if0(...)
2187 // br label %exit
2188 // exit:
2189 // ...
2190 if (IfCondition) {
2191 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2192 // terminator.
2193 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2194 Instruction *IfTerminator =
2195 Builder.GetInsertPoint()->getParent()->getTerminator();
2196 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2197 Builder.SetInsertPoint(IfTerminator);
2198 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2199 &ElseTI);
2200 Builder.SetInsertPoint(ElseTI);
2201
2202 if (Dependencies.size()) {
2203 Function *TaskWaitFn =
2204 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2205 createRuntimeFunctionCall(
2206 TaskWaitFn,
2207 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2208 ConstantInt::get(Builder.getInt32Ty(), 0),
2209 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2210 }
2211 Function *TaskBeginFn =
2212 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2213 Function *TaskCompleteFn =
2214 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2215 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2216 CallInst *CI = nullptr;
2217 if (HasShareds)
2218 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2219 else
2220 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2221 CI->setDebugLoc(StaleCI->getDebugLoc());
2222 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2223 Builder.SetInsertPoint(ThenTI);
2224 }
2225
2226 if (Dependencies.size()) {
2227 Function *TaskFn =
2228 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2229 createRuntimeFunctionCall(
2230 TaskFn,
2231 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2232 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2233 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2234
2235 } else {
2236 // Emit the @__kmpc_omp_task runtime call to spawn the task
2237 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2238 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2239 }
2240
2241 StaleCI->eraseFromParent();
2242
2243 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2244 if (HasShareds) {
2245 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2246 OutlinedFn.getArg(1)->replaceUsesWithIf(
2247 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2248 }
2249
2250 for (Instruction *I : llvm::reverse(ToBeDeleted))
2251 I->eraseFromParent();
2252 };
2253
2254 addOutlineInfo(std::move(OI));
2255 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2256
2257 return Builder.saveIP();
2258}
2259
2260OpenMPIRBuilder::InsertPointOrErrorTy
2261OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2262 InsertPointTy AllocaIP,
2263 BodyGenCallbackTy BodyGenCB) {
2264 if (!updateToLocation(Loc))
2265 return InsertPointTy();
2266
2267 uint32_t SrcLocStrSize;
2268 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2269 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2270 Value *ThreadID = getOrCreateThreadID(Ident);
2271
2272 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2273 Function *TaskgroupFn =
2274 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2275 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2276
2277 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2278 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2279 return Err;
2280
2281 Builder.SetInsertPoint(TaskgroupExitBB);
2282 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2283 Function *EndTaskgroupFn =
2284 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2285 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2286
2287 return Builder.saveIP();
2288}
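// A sketch of the bracketing this produces around the taskgroup body:
// \code{c}
// __kmpc_taskgroup(&loc, gtid);
// /* ... taskgroup body, possibly spawning child tasks ... */
// __kmpc_end_taskgroup(&loc, gtid); /* waits for tasks created in region */
// \endcode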
2289
2290OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2291 const LocationDescription &Loc, InsertPointTy AllocaIP,
2292 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2293 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2294 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2295
2296 if (!updateToLocation(Loc))
2297 return Loc.IP;
2298
2299 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2300
2301 // Each section is emitted as a switch case
2302 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2303 // -> OMP.createSection() which generates the IR for each section
2304 // Iterate through all sections and emit a switch construct:
2305 // switch (IV) {
2306 // case 0:
2307 // <SectionStmt[0]>;
2308 // break;
2309 // ...
2310 // case <NumSection> - 1:
2311 // <SectionStmt[<NumSection> - 1]>;
2312 // break;
2313 // }
2314 // ...
2315 // section_loop.after:
2316 // <FiniCB>;
2317 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2318 Builder.restoreIP(CodeGenIP);
2319 BasicBlock *Continue =
2320 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2321 Function *CurFn = Continue->getParent();
2322 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2323
2324 unsigned CaseNumber = 0;
2325 for (auto SectionCB : SectionCBs) {
2326 BasicBlock *CaseBB = BasicBlock::Create(
2327 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2328 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2329 Builder.SetInsertPoint(CaseBB);
2330 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2331 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2332 CaseEndBr->getIterator()}))
2333 return Err;
2334 CaseNumber++;
2335 }
2336 // Remove the existing terminator from the body BB since there can be no
2337 // terminators after a switch/case.
2338 return Error::success();
2339 };
2340 // Loop body ends here
2341 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2342 Type *I32Ty = Type::getInt32Ty(M.getContext());
2343 Value *LB = ConstantInt::get(I32Ty, 0);
2344 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2345 Value *ST = ConstantInt::get(I32Ty, 1);
2346 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2347 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2348 if (!LoopInfo)
2349 return LoopInfo.takeError();
2350
2351 InsertPointOrErrorTy WsloopIP =
2352 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2353 WorksharingLoopType::ForStaticLoop, !IsNowait);
2354 if (!WsloopIP)
2355 return WsloopIP.takeError();
2356 InsertPointTy AfterIP = *WsloopIP;
2357
2358 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2359 assert(LoopFini && "Bad structure of static workshare loop finalization");
2360
2361 // Apply the finalization callback in LoopAfterBB
2362 auto FiniInfo = FinalizationStack.pop_back_val();
2363 assert(FiniInfo.DK == OMPD_sections &&
2364 "Unexpected finalization stack state!");
2365 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2366 return Err;
2367
2368 return AfterIP;
2369}
2370
2371OpenMPIRBuilder::InsertPointOrErrorTy
2372OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2373 BodyGenCallbackTy BodyGenCB,
2374 FinalizeCallbackTy FiniCB) {
2375 if (!updateToLocation(Loc))
2376 return Loc.IP;
2377
2378 auto FiniCBWrapper = [&](InsertPointTy IP) {
2379 if (IP.getBlock()->end() != IP.getPoint())
2380 return FiniCB(IP);
2381 // This must be done, otherwise any nested constructs using FinalizeOMPRegion
2382 // will fail because that function requires the Finalization Basic Block to
2383 // have a terminator, which is already removed by EmitOMPRegionBody.
2384 // IP is currently at the cancellation block.
2385 // We need to backtrack to the condition block to fetch
2386 // the exit block and create a branch from the cancellation
2387 // block to the exit block.
2388 IRBuilder<>::InsertPointGuard IPG(Builder);
2389 Builder.restoreIP(IP);
2390 auto *CaseBB = Loc.IP.getBlock();
2391 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2392 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2393 Instruction *I = Builder.CreateBr(ExitBB);
2394 IP = InsertPointTy(I->getParent(), I->getIterator());
2395 return FiniCB(IP);
2396 };
2397
2398 Directive OMPD = Directive::OMPD_sections;
2399 // Since we are using Finalization Callback here, HasFinalize
2400 // and IsCancellable have to be true
2401 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2402 /*Conditional*/ false, /*hasFinalize*/ true,
2403 /*IsCancellable*/ true);
2404}
2405
2406 static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I) {
2407 BasicBlock::iterator IT(I);
2408 IT++;
2409 return OpenMPIRBuilder::InsertPointTy(I->getParent(), IT);
2410}
2411
2412Value *OpenMPIRBuilder::getGPUThreadID() {
2413 return createRuntimeFunctionCall(
2414 getOrCreateRuntimeFunction(M,
2415 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2416 {});
2417}
2418
2419Value *OpenMPIRBuilder::getGPUWarpSize() {
2420 return createRuntimeFunctionCall(
2421 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2422}
2423
2424Value *OpenMPIRBuilder::getNVPTXWarpID() {
2425 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2426 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2427}
2428
2429Value *OpenMPIRBuilder::getNVPTXLaneID() {
2430 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2431 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2432 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2433 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2434 "nvptx_lane_id");
2435}
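// With the common warp size of 32, LaneIDBits is 5 and LaneIDMask is 0x1f,
// so the two helpers above amount to (sketch):
// \code{c}
// unsigned warp_id = thread_id >> 5;   /* getNVPTXWarpID */
// unsigned lane_id = thread_id & 0x1f; /* getNVPTXLaneID */
// \endcode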
2436
2437Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2438 Type *ToType) {
2439 Type *FromType = From->getType();
2440 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2441 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2442 assert(FromSize > 0 && "From size must be greater than zero");
2443 assert(ToSize > 0 && "To size must be greater than zero");
2444 if (FromType == ToType)
2445 return From;
2446 if (FromSize == ToSize)
2447 return Builder.CreateBitCast(From, ToType);
2448 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2449 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2450 InsertPointTy SaveIP = Builder.saveIP();
2451 Builder.restoreIP(AllocaIP);
2452 Value *CastItem = Builder.CreateAlloca(ToType);
2453 Builder.restoreIP(SaveIP);
2454
2455 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2456 CastItem, Builder.getPtrTy(0));
2457 Builder.CreateStore(From, ValCastItem);
2458 return Builder.CreateLoad(ToType, CastItem);
2459}
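// A C analogy of the cast paths above (memcpy models the bitcast; the helper
// names are illustrative only):
// \code{c}
// long widen(short s) { return (long)s; } /* int <-> int: CreateIntCast */
// unsigned as_bits(float f) {             /* equal store size: CreateBitCast */
//   unsigned u;
//   __builtin_memcpy(&u, &f, sizeof(u));
//   return u;
// }
// /* Remaining combinations go through a temporary alloca: store the source
//    value, then load it back with the destination type. */
// \endcode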
2460
2461Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2462 Value *Element,
2463 Type *ElementType,
2464 Value *Offset) {
2465 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2466 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2467
2468 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2469 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2470 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2471 Value *WarpSize =
2472 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2473 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2474 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2475 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2476 Value *WarpSizeCast =
2477 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2478 Value *ShuffleCall =
2479 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2480 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2481}
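// E.g. shuffling a double (store size 8) selects the 64-bit entry point, so
// the emitted call is, in sketch form (the i64 reinterpretation is performed
// by castValueToType above):
// \code{c}
// int64_t v = __kmpc_shuffle_int64(elem_as_i64, offset, warp_size);
// \endcode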
2482
2483void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2484 Value *DstAddr, Type *ElemType,
2485 Value *Offset, Type *ReductionArrayTy,
2486 bool IsByRefElem) {
2487 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2488 // Create the loop over the big sized data.
2489 // ptr = (void*)Elem;
2490 // ptrEnd = (void*) Elem + 1;
2491 // Step = 8;
2492 // while (ptr + Step < ptrEnd)
2493 // shuffle((int64_t)*ptr);
2494 // Step = 4;
2495 // while (ptr + Step < ptrEnd)
2496 // shuffle((int32_t)*ptr);
2497 // ...
2498 Type *IndexTy = Builder.getIndexTy(
2499 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2500 Value *ElemPtr = DstAddr;
2501 Value *Ptr = SrcAddr;
2502 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2503 if (Size < IntSize)
2504 continue;
2505 Type *IntType = Builder.getIntNTy(IntSize * 8);
2506 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2507 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2508 Value *SrcAddrGEP =
2509 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2510 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2511 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2512
2513 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2514 if ((Size / IntSize) > 1) {
2515 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2516 SrcAddrGEP, Builder.getPtrTy());
2517 BasicBlock *PreCondBB =
2518 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2519 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2520 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2521 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2522 emitBlock(PreCondBB, CurFunc);
2523 PHINode *PhiSrc =
2524 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2525 PhiSrc->addIncoming(Ptr, CurrentBB);
2526 PHINode *PhiDest =
2527 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2528 PhiDest->addIncoming(ElemPtr, CurrentBB);
2529 Ptr = PhiSrc;
2530 ElemPtr = PhiDest;
2531 Value *PtrDiff = Builder.CreatePtrDiff(
2532 Builder.getInt8Ty(), PtrEnd,
2533 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2534 Builder.CreateCondBr(
2535 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2536 ExitBB);
2537 emitBlock(ThenBB, CurFunc);
2538 Value *Res = createRuntimeShuffleFunction(
2539 AllocaIP,
2540 Builder.CreateAlignedLoad(
2541 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2542 IntType, Offset);
2543 Builder.CreateAlignedStore(Res, ElemPtr,
2544 M.getDataLayout().getPrefTypeAlign(ElemType));
2545 Value *LocalPtr =
2546 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2547 Value *LocalElemPtr =
2548 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2549 PhiSrc->addIncoming(LocalPtr, ThenBB);
2550 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2551 emitBranch(PreCondBB);
2552 emitBlock(ExitBB, CurFunc);
2553 } else {
2554 Value *Res = createRuntimeShuffleFunction(
2555 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2556 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2557 Res->getType()->getScalarSizeInBits())
2558 Res = Builder.CreateTrunc(Res, ElemType);
2559 Builder.CreateStore(Res, ElemPtr);
2560 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2561 ElemPtr =
2562 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2563 }
2564 Size = Size % IntSize;
2565 }
2566}
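// The width loop above decomposes odd-sized elements greedily; mirroring it
// in plain C for a 12-byte element (sketch):
// \code{c}
// size_t size = 12;
// for (unsigned w = 8; w >= 1; w /= 2) { /* same descent as IntSize */
//   if (size < w)
//     continue;
//   /* size / w chunks of width w are shuffled; for 12 bytes this is one
//      64-bit shuffle followed by one 32-bit shuffle. */
//   size %= w;
// }
// \endcode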
2567
2568Error OpenMPIRBuilder::emitReductionListCopy(
2569 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2570 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2571 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
2572 Type *IndexTy = Builder.getIndexTy(
2573 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2574 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2575
2576 // Iterates, element-by-element, through the source Reduce list and
2577 // makes a copy.
2578 for (auto En : enumerate(ReductionInfos)) {
2579 const ReductionInfo &RI = En.value();
2580 Value *SrcElementAddr = nullptr;
2581 AllocaInst *DestAlloca = nullptr;
2582 Value *DestElementAddr = nullptr;
2583 Value *DestElementPtrAddr = nullptr;
2584 // Should we shuffle in an element from a remote lane?
2585 bool ShuffleInElement = false;
2586 // Set to true to update the pointer in the dest Reduce list to a
2587 // newly created element.
2588 bool UpdateDestListPtr = false;
2589
2590 // Step 1.1: Get the address for the src element in the Reduce list.
2591 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
2592 ReductionArrayTy, SrcBase,
2593 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2594 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
2595
2596 // Step 1.2: Create a temporary to store the element in the destination
2597 // Reduce list.
2598 DestElementPtrAddr = Builder.CreateInBoundsGEP(
2599 ReductionArrayTy, DestBase,
2600 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
2601 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
2602 switch (Action) {
2603 case CopyAction::RemoteLaneToThread: {
2604 InsertPointTy CurIP = Builder.saveIP();
2605 Builder.restoreIP(AllocaIP);
2606
2607 Type *DestAllocaType =
2608 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
2609 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
2610 ".omp.reduction.element");
2611 DestAlloca->setAlignment(
2612 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
2613 DestElementAddr = DestAlloca;
2614 DestElementAddr =
2615 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
2616 DestElementAddr->getName() + ".ascast");
2617 Builder.restoreIP(CurIP);
2618 ShuffleInElement = true;
2619 UpdateDestListPtr = true;
2620 break;
2621 }
2622 case CopyAction::ThreadCopy: {
2623 DestElementAddr =
2624 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
2625 break;
2626 }
2627 }
2628
2629 // Now that all active lanes have read the element in the
2630 // Reduce list, shuffle over the value from the remote lane.
2631 if (ShuffleInElement) {
2632 Type *ShuffleType = RI.ElementType;
2633 Value *ShuffleSrcAddr = SrcElementAddr;
2634 Value *ShuffleDestAddr = DestElementAddr;
2635 AllocaInst *LocalStorage = nullptr;
2636
2637 if (IsByRefElem) {
2638 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
2639 assert(RI.ByRefAllocatedType &&
2640 "Expected by-ref allocated type to be set");
2641 // For by-ref reductions, we need to copy from the remote lane the
2642 // actual value of the partial reduction computed by that remote lane;
2643 // rather than, for example, a pointer to that data or, even worse, a
2644 // pointer to the descriptor of the by-ref reduction element.
2645 ShuffleType = RI.ByRefElementType;
2646
2647 InsertPointOrErrorTy GenResult =
2648 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
2649
2650 if (!GenResult)
2651 return GenResult.takeError();
2652
2653 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
2654
2655 {
2656 InsertPointTy OldIP = Builder.saveIP();
2657 Builder.restoreIP(AllocaIP);
2658
2659 LocalStorage = Builder.CreateAlloca(ShuffleType);
2660 Builder.restoreIP(OldIP);
2661 ShuffleDestAddr = LocalStorage;
2662 }
2663 }
2664
2665 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
2666 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
2667
2668 if (IsByRefElem) {
2669 Value *GEP;
2670 InsertPointOrErrorTy GenResult =
2671 RI.DataPtrPtrGen(Builder.saveIP(),
2672 Builder.CreatePointerBitCastOrAddrSpaceCast(
2673 DestAlloca, Builder.getPtrTy(), ".ascast"),
2674 GEP);
2675
2676 if (!GenResult)
2677 return GenResult.takeError();
2678
2679 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
2680 LocalStorage, Builder.getPtrTy(), ".ascast"),
2681 GEP);
2682 }
2683 } else {
2684 switch (RI.EvaluationKind) {
2685 case EvalKind::Scalar: {
2686 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
2687 // Store the source element value to the dest element address.
2688 Builder.CreateStore(Elem, DestElementAddr);
2689 break;
2690 }
2691 case EvalKind::Complex: {
2692 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
2693 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
2694 Value *SrcReal = Builder.CreateLoad(
2695 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
2696 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
2697 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
2698 Value *SrcImg = Builder.CreateLoad(
2699 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
2700
2701 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
2702 RI.ElementType, DestElementAddr, 0, 0, ".realp");
2703 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
2704 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
2705 Builder.CreateStore(SrcReal, DestRealPtr);
2706 Builder.CreateStore(SrcImg, DestImgPtr);
2707 break;
2708 }
2709 case EvalKind::Aggregate: {
2710 Value *SizeVal = Builder.getInt64(
2711 M.getDataLayout().getTypeStoreSize(RI.ElementType));
2712 Builder.CreateMemCpy(
2713 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2714 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
2715 SizeVal, false);
2716 break;
2717 }
2718 };
2719 }
2720
2721 // Step 3.1: Modify reference in dest Reduce list as needed.
2722 // Modifying the reference in Reduce list to point to the newly
2723 // created element. The element is live in the current function
2724 // scope and that of functions it invokes (i.e., reduce_function).
2725 // RemoteReduceData[i] = (void*)&RemoteElem
2726 if (UpdateDestListPtr) {
2727 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2728 DestElementAddr, Builder.getPtrTy(),
2729 DestElementAddr->getName() + ".ascast");
2730 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
2731 }
2732 }
2733
2734 return Error::success();
2735}
2736
2737Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
2738 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
2739 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
2740 InsertPointTy SavedIP = Builder.saveIP();
2741 LLVMContext &Ctx = M.getContext();
2742 FunctionType *FuncTy = FunctionType::get(
2743 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
2744 /* IsVarArg */ false);
2745 Function *WcFunc =
2746 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
2747 "_omp_reduction_inter_warp_copy_func", &M);
2748 WcFunc->setAttributes(FuncAttrs);
2749 WcFunc->addParamAttr(0, Attribute::NoUndef);
2750 WcFunc->addParamAttr(1, Attribute::NoUndef);
2751 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
2752 Builder.SetInsertPoint(EntryBB);
2753
2754 // ReduceList: thread local Reduce list.
2755 // At the stage of the computation when this function is called, partially
2756 // aggregated values reside in the first lane of every active warp.
2757 Argument *ReduceListArg = WcFunc->getArg(0);
2758 // NumWarps: number of warps active in the parallel region. This could
2759 // be smaller than 32 (max warps in a CTA) for partial block reduction.
2760 Argument *NumWarpsArg = WcFunc->getArg(1);
2761
2762 // This array is used as a medium to transfer, one reduce element at a time,
2763 // the data from the first lane of every warp to lanes in the first warp
2764 // in order to perform the final step of a reduction in a parallel region
2765 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2766 // for reduced latency, as well as to have a distinct copy for concurrently
2767 // executing target regions. The array is declared with weak linkage so
2768 // as to be shared across compilation units.
2769 StringRef TransferMediumName =
2770 "__openmp_nvptx_data_transfer_temporary_storage";
2771 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
2772 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
2773 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
2774 if (!TransferMedium) {
2775 TransferMedium = new GlobalVariable(
2776 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
2777 UndefValue::get(ArrayTy), TransferMediumName,
2778 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
2779 /*AddressSpace=*/3);
2780 }
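// A CUDA-style sketch of the storage this creates (address space 3 is the
// GPU's shared memory; the bound comes from the configured warp size):
// \code{c}
// __shared__ int
//     __openmp_nvptx_data_transfer_temporary_storage[/* warp size */ 32];
// \endcode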
2781
2782 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2783 Value *GPUThreadID = getGPUThreadID();
2784 // nvptx_lane_id = nvptx_id % warpsize
2785 Value *LaneID = getNVPTXLaneID();
2786 // nvptx_warp_id = nvptx_id / warpsize
2787 Value *WarpID = getNVPTXWarpID();
2788
2789 InsertPointTy AllocaIP =
2790 InsertPointTy(Builder.GetInsertBlock(),
2791 Builder.GetInsertBlock()->getFirstInsertionPt());
2792 Type *Arg0Type = ReduceListArg->getType();
2793 Type *Arg1Type = NumWarpsArg->getType();
2794 Builder.restoreIP(AllocaIP);
2795 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
2796 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
2797 AllocaInst *NumWarpsAlloca =
2798 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
2799 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2800 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
2801 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
2802 NumWarpsAlloca, Builder.getPtrTy(0),
2803 NumWarpsAlloca->getName() + ".ascast");
2804 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
2805 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
2806 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
2807 InsertPointTy CodeGenIP =
2808 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
2809 Builder.restoreIP(CodeGenIP);
2810
2811 Value *ReduceList =
2812 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
2813
2814 for (auto En : enumerate(ReductionInfos)) {
2815 //
2816 // Warp master copies reduce element to transfer medium in __shared__
2817 // memory.
2818 //
2819 const ReductionInfo &RI = En.value();
2820 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
2821 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
2822 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
2823 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
2824 Type *CType = Builder.getIntNTy(TySize * 8);
2825
2826 unsigned NumIters = RealTySize / TySize;
2827 if (NumIters == 0)
2828 continue;
2829 Value *Cnt = nullptr;
2830 Value *CntAddr = nullptr;
2831 BasicBlock *PrecondBB = nullptr;
2832 BasicBlock *ExitBB = nullptr;
2833 if (NumIters > 1) {
2834 CodeGenIP = Builder.saveIP();
2835 Builder.restoreIP(AllocaIP);
2836 CntAddr =
2837 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
2838
2839 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
2840 CntAddr->getName() + ".ascast");
2841 Builder.restoreIP(CodeGenIP);
2842 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
2843 CntAddr,
2844 /*Volatile=*/false);
2845 PrecondBB = BasicBlock::Create(Ctx, "precond");
2846 ExitBB = BasicBlock::Create(Ctx, "exit");
2847 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
2848 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
2849 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
2850 /*Volatile=*/false);
2851 Value *Cmp = Builder.CreateICmpULT(
2852 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
2853 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
2854 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
2855 }
2856
2857 // kmpc_barrier.
2858 InsertPointOrErrorTy BarrierIP1 =
2859 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2860 omp::Directive::OMPD_unknown,
2861 /* ForceSimpleCall */ false,
2862 /* CheckCancelFlag */ true);
2863 if (!BarrierIP1)
2864 return BarrierIP1.takeError();
2865 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
2866 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
2867 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
2868
2869 // if (lane_id == 0)
2870 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
2871 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2872 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
2873
2874 // Reduce element = LocalReduceList[i]
2875 auto *RedListArrayTy =
2876 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
2877 Type *IndexTy = Builder.getIndexTy(
2878 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2879 Value *ElemPtrPtr =
2880 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2881 {ConstantInt::get(IndexTy, 0),
2882 ConstantInt::get(IndexTy, En.index())});
2883 // elemptr = ((CopyType*)(elemptrptr)) + I
2884 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
2885
2886 if (IsByRefElem) {
2887 InsertPointOrErrorTy GenRes =
2888 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
2889
2890 if (!GenRes)
2891 return GenRes.takeError();
2892
2893 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
2894 }
2895
2896 if (NumIters > 1)
2897 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
2898
2899 // Get pointer to location in transfer medium.
2900 // MediumPtr = &medium[warp_id]
2901 Value *MediumPtr = Builder.CreateInBoundsGEP(
2902 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
2903 // elem = *elemptr
2904 //*MediumPtr = elem
2905 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
2906 // Store the source element value to the dest element address.
2907 Builder.CreateStore(Elem, MediumPtr,
2908 /*IsVolatile*/ true);
2909 Builder.CreateBr(MergeBB);
2910
2911 // else
2912 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
2913 Builder.CreateBr(MergeBB);
2914
2915 // endif
2916 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
2917 InsertPointOrErrorTy BarrierIP2 =
2918 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
2919 omp::Directive::OMPD_unknown,
2920 /* ForceSimpleCall */ false,
2921 /* CheckCancelFlag */ true);
2922 if (!BarrierIP2)
2923 return BarrierIP2.takeError();
2924
2925 // Warp 0 copies reduce element from transfer medium
2926 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
2927 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
2928 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
2929
2930 Value *NumWarpsVal =
2931 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
2932 // Up to 32 threads in warp 0 are active.
2933 Value *IsActiveThread =
2934 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
2935 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2936
2937 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
2938
2939    // SrcMediumPtr = &medium[tid]
2940    // SrcMediumVal = *SrcMediumPtr
2941 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
2942 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
2943 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
2944 Value *TargetElemPtrPtr =
2945 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
2946 {ConstantInt::get(IndexTy, 0),
2947 ConstantInt::get(IndexTy, En.index())});
2948 Value *TargetElemPtrVal =
2949 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
2950 Value *TargetElemPtr = TargetElemPtrVal;
2951
2952 if (IsByRefElem) {
2953 InsertPointOrErrorTy GenRes =
2954 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
2955
2956 if (!GenRes)
2957 return GenRes.takeError();
2958
2959 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
2960 }
2961
2962 if (NumIters > 1)
2963 TargetElemPtr =
2964 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
2965
2966 // *TargetElemPtr = SrcMediumVal;
2967 Value *SrcMediumValue =
2968 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
2969 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
2970 Builder.CreateBr(W0MergeBB);
2971
2972 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
2973 Builder.CreateBr(W0MergeBB);
2974
2975 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
2976
2977 if (NumIters > 1) {
2978 Cnt = Builder.CreateNSWAdd(
2979 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
2980 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
2981
2982 auto *CurFn = Builder.GetInsertBlock()->getParent();
2983 emitBranch(PrecondBB);
2984 emitBlock(ExitBB, CurFn);
2985 }
2986 RealTySize %= TySize;
2987 }
2988 }
2989
2990 Builder.CreateRetVoid();
2991 Builder.restoreIP(SavedIP);
2992
2993 return WcFunc;
2994}
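// Note on the inter-warp copy function emitted above: each chunk of each
// reduction element travels through the shared transfer medium in two steps.
// An illustrative pseudocode sketch (not the emitted IR; the names below are
// ours, not values from this function):
//
//   if (lane_id == 0)                       // every warp's master lane
//     medium[warp_id] = *elem_ptr;          // volatile store
//   __kmpc_barrier(...);
//   if (thread_id < num_warps)              // first num_warps lanes of warp 0
//     *target_elem_ptr = medium[thread_id]; // volatile load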
2995
2996Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
2997 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
2998 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
2999 LLVMContext &Ctx = M.getContext();
3000 FunctionType *FuncTy =
3001 FunctionType::get(Builder.getVoidTy(),
3002 {Builder.getPtrTy(), Builder.getInt16Ty(),
3003 Builder.getInt16Ty(), Builder.getInt16Ty()},
3004 /* IsVarArg */ false);
3005 Function *SarFunc =
3006      Function::Create(FuncTy, GlobalValue::InternalLinkage,
3007                       "_omp_reduction_shuffle_and_reduce_func", &M);
3008 SarFunc->setAttributes(FuncAttrs);
3009 SarFunc->addParamAttr(0, Attribute::NoUndef);
3010 SarFunc->addParamAttr(1, Attribute::NoUndef);
3011 SarFunc->addParamAttr(2, Attribute::NoUndef);
3012 SarFunc->addParamAttr(3, Attribute::NoUndef);
3013 SarFunc->addParamAttr(1, Attribute::SExt);
3014 SarFunc->addParamAttr(2, Attribute::SExt);
3015 SarFunc->addParamAttr(3, Attribute::SExt);
3016 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3017 Builder.SetInsertPoint(EntryBB);
3018
3019 // Thread local Reduce list used to host the values of data to be reduced.
3020 Argument *ReduceListArg = SarFunc->getArg(0);
3021 // Current lane id; could be logical.
3022 Argument *LaneIDArg = SarFunc->getArg(1);
3023 // Offset of the remote source lane relative to the current lane.
3024 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3025 // Algorithm version. This is expected to be known at compile time.
3026 Argument *AlgoVerArg = SarFunc->getArg(3);
3027
3028 Type *ReduceListArgType = ReduceListArg->getType();
3029 Type *LaneIDArgType = LaneIDArg->getType();
3030 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3031 Value *ReduceListAlloca = Builder.CreateAlloca(
3032 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3033 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3034 LaneIDArg->getName() + ".addr");
3035 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3036 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3037 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3038 AlgoVerArg->getName() + ".addr");
3039 ArrayType *RedListArrayTy =
3040 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3041
3042 // Create a local thread-private variable to host the Reduce list
3043 // from a remote lane.
3044 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3045 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3046
3047 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3048 ReduceListAlloca, ReduceListArgType,
3049 ReduceListAlloca->getName() + ".ascast");
3050 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3051 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3052 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3053 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3054 RemoteLaneOffsetAlloca->getName() + ".ascast");
3055 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3056 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3057 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3058 RemoteReductionListAlloca, Builder.getPtrTy(),
3059 RemoteReductionListAlloca->getName() + ".ascast");
3060
3061 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3062 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3063 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3064 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3065
3066 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3067 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3068 Value *RemoteLaneOffset =
3069 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3070 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3071
3072 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3073
3074 // This loop iterates through the list of reduce elements and copies,
3075 // element by element, from a remote lane in the warp to RemoteReduceList,
3076 // hosted on the thread's stack.
3077 Error EmitRedLsCpRes = emitReductionListCopy(
3078 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3079 ReduceList, RemoteListAddrCast, IsByRef,
3080 {RemoteLaneOffset, nullptr, nullptr});
3081
3082 if (EmitRedLsCpRes)
3083 return EmitRedLsCpRes;
3084
3085  // The actions to be performed on the Remote Reduce list are dependent
3086  // on the algorithm version.
3087 //
3088 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3089 // LaneId % 2 == 0 && Offset > 0):
3090 // do the reduction value aggregation
3091 //
3092 // The thread local variable Reduce list is mutated in place to host the
3093 // reduced data, which is the aggregated value produced from local and
3094 // remote lanes.
3095 //
3096 // Note that AlgoVer is expected to be a constant integer known at compile
3097 // time.
3098  // When AlgoVer==0, the first conjunction evaluates to true, making
3099  // the entire predicate true at compile time.
3100  // When AlgoVer==1, only the second part of the second conjunction needs
3101  // to be evaluated at runtime; the other conjunctions evaluate to false
3102  // at compile time.
3103  // When AlgoVer==2, only the second part of the third conjunction needs
3104  // to be evaluated at runtime; the other conjunctions evaluate to false
3105  // at compile time (see the sketch below).
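  // For illustration only: assuming the runtime drives AlgoVer==1 with a
  // halving RemoteLaneOffset (16, 8, 4, 2, 1 on a 32-lane warp), the
  // predicate degenerates to "LaneId < Offset" and the emitted code acts
  // like the classic tree reduction below (pseudocode; the names are ours):
  //
  //   for (offset = warp_size / 2; offset > 0; offset /= 2)
  //     if (lane_id < offset)
  //       local_val = reduce(local_val, remote_val_from(lane_id + offset));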
3106 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3107 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3108 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3109 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3110 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3111 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3112 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3113 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3114 Value *RemoteOffsetComp =
3115 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3116 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3117 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3118 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3119
3120 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3121 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3122 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3123
3124 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3125 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3126 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3127 ReduceList, Builder.getPtrTy());
3128 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3129 RemoteListAddrCast, Builder.getPtrTy());
3130 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3131 ->addFnAttr(Attribute::NoUnwind);
3132 Builder.CreateBr(MergeBB);
3133
3134 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3135 Builder.CreateBr(MergeBB);
3136
3137 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3138
3139 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3140 // Reduce list.
3141 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3142 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3143 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3144
3145 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3146 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3147 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3148 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3149
3150 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3151
3152 EmitRedLsCpRes = emitReductionListCopy(
3153 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3154 RemoteListAddrCast, ReduceList, IsByRef);
3155
3156 if (EmitRedLsCpRes)
3157 return EmitRedLsCpRes;
3158
3159 Builder.CreateBr(CpyMergeBB);
3160
3161 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3162 Builder.CreateBr(CpyMergeBB);
3163
3164 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3165
3166 Builder.CreateRetVoid();
3167
3168 return SarFunc;
3169}
3170
3171Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3172 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3173 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3174 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3175 LLVMContext &Ctx = M.getContext();
3176  auto *FuncTy = FunctionType::get(
3177      Builder.getVoidTy(),
3178 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3179 /* IsVarArg */ false);
3180 Function *LtGCFunc =
3181      Function::Create(FuncTy, GlobalValue::InternalLinkage,
3182                       "_omp_reduction_list_to_global_copy_func", &M);
3183 LtGCFunc->setAttributes(FuncAttrs);
3184 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3185 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3186 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3187
3188 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3189 Builder.SetInsertPoint(EntryBlock);
3190
3191 // Buffer: global reduction buffer.
3192 Argument *BufferArg = LtGCFunc->getArg(0);
3193 // Idx: index of the buffer.
3194 Argument *IdxArg = LtGCFunc->getArg(1);
3195 // ReduceList: thread local Reduce list.
3196 Argument *ReduceListArg = LtGCFunc->getArg(2);
3197
3198 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3199 BufferArg->getName() + ".addr");
3200 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3201 IdxArg->getName() + ".addr");
3202 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3203 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3204 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 BufferArgAlloca, Builder.getPtrTy(),
3206 BufferArgAlloca->getName() + ".ascast");
3207 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3208 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3209 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3210 ReduceListArgAlloca, Builder.getPtrTy(),
3211 ReduceListArgAlloca->getName() + ".ascast");
3212
3213 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3214 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3215 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3216
3217 Value *LocalReduceList =
3218 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3219 Value *BufferArgVal =
3220 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3221 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3222 Type *IndexTy = Builder.getIndexTy(
3223 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3224 for (auto En : enumerate(ReductionInfos)) {
3225 const ReductionInfo &RI = En.value();
3226 auto *RedListArrayTy =
3227 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3228 // Reduce element = LocalReduceList[i]
3229 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3230 RedListArrayTy, LocalReduceList,
3231 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3232 // elemptr = ((CopyType*)(elemptrptr)) + I
3233 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3234
3235 // Global = Buffer.VD[Idx];
3236 Value *BufferVD =
3237 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3238 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3239 ReductionsBufferTy, BufferVD, 0, En.index());
3240
3241 switch (RI.EvaluationKind) {
3242 case EvalKind::Scalar: {
3243 Value *TargetElement;
3244
3245 if (IsByRef.empty() || !IsByRef[En.index()]) {
3246 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3247 } else {
3248 InsertPointOrErrorTy GenResult =
3249 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3250
3251 if (!GenResult)
3252 return GenResult.takeError();
3253
3254 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3255 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3256 }
3257
3258 Builder.CreateStore(TargetElement, GlobVal);
3259 break;
3260 }
3261 case EvalKind::Complex: {
3262 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3263 RI.ElementType, ElemPtr, 0, 0, ".realp");
3264 Value *SrcReal = Builder.CreateLoad(
3265 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3266 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3267 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3268 Value *SrcImg = Builder.CreateLoad(
3269 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3270
3271 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3272 RI.ElementType, GlobVal, 0, 0, ".realp");
3273 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3274 RI.ElementType, GlobVal, 0, 1, ".imagp");
3275 Builder.CreateStore(SrcReal, DestRealPtr);
3276 Builder.CreateStore(SrcImg, DestImgPtr);
3277 break;
3278 }
3279 case EvalKind::Aggregate: {
3280 Value *SizeVal =
3281 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3282 Builder.CreateMemCpy(
3283 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3284 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3285 break;
3286 }
3287 }
3288 }
3289
3290 Builder.CreateRetVoid();
3291 Builder.restoreIP(OldIP);
3292 return LtGCFunc;
3293}
3294
3295Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3296 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3297 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3298 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3299 LLVMContext &Ctx = M.getContext();
3300  auto *FuncTy = FunctionType::get(
3301      Builder.getVoidTy(),
3302 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3303 /* IsVarArg */ false);
3304 Function *LtGRFunc =
3305      Function::Create(FuncTy, GlobalValue::InternalLinkage,
3306                       "_omp_reduction_list_to_global_reduce_func", &M);
3307 LtGRFunc->setAttributes(FuncAttrs);
3308 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3309 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3310 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3311
3312 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3313 Builder.SetInsertPoint(EntryBlock);
3314
3315 // Buffer: global reduction buffer.
3316 Argument *BufferArg = LtGRFunc->getArg(0);
3317 // Idx: index of the buffer.
3318 Argument *IdxArg = LtGRFunc->getArg(1);
3319 // ReduceList: thread local Reduce list.
3320 Argument *ReduceListArg = LtGRFunc->getArg(2);
3321
3322 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3323 BufferArg->getName() + ".addr");
3324 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3325 IdxArg->getName() + ".addr");
3326 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3327 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3328 auto *RedListArrayTy =
3329 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3330
3331 // 1. Build a list of reduction variables.
3332 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3333 Value *LocalReduceList =
3334 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3335
3336 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3337
3338 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3339 BufferArgAlloca, Builder.getPtrTy(),
3340 BufferArgAlloca->getName() + ".ascast");
3341 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3342 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3343 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3344 ReduceListArgAlloca, Builder.getPtrTy(),
3345 ReduceListArgAlloca->getName() + ".ascast");
3346 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3347 LocalReduceList, Builder.getPtrTy(),
3348 LocalReduceList->getName() + ".ascast");
3349
3350 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3351 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3352 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3353
3354 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3355 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3356 Type *IndexTy = Builder.getIndexTy(
3357 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3358 for (auto En : enumerate(ReductionInfos)) {
3359 const ReductionInfo &RI = En.value();
3360 Value *ByRefAlloc;
3361
3362 if (!IsByRef.empty() && IsByRef[En.index()]) {
3363 InsertPointTy OldIP = Builder.saveIP();
3364 Builder.restoreIP(AllocaIP);
3365
3366 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3367 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3368 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3369
3370 Builder.restoreIP(OldIP);
3371 }
3372
3373 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3374 RedListArrayTy, LocalReduceListAddrCast,
3375 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3376 Value *BufferVD =
3377 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3378 // Global = Buffer.VD[Idx];
3379 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3380 ReductionsBufferTy, BufferVD, 0, En.index());
3381
3382 if (!IsByRef.empty() && IsByRef[En.index()]) {
3383 Value *ByRefDataPtr;
3384
3385 InsertPointOrErrorTy GenResult =
3386 RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr);
3387
3388 if (!GenResult)
3389 return GenResult.takeError();
3390
3391 Builder.CreateStore(GlobValPtr, ByRefDataPtr);
3392 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3393 } else {
3394 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3395 }
3396 }
3397
3398 // Call reduce_function(GlobalReduceList, ReduceList)
3399 Value *ReduceList =
3400 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3401 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3402 ->addFnAttr(Attribute::NoUnwind);
3403 Builder.CreateRetVoid();
3404 Builder.restoreIP(OldIP);
3405 return LtGRFunc;
3406}
3407
3408Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3409 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3410 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3411 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3412 LLVMContext &Ctx = M.getContext();
3413  auto *FuncTy = FunctionType::get(
3414      Builder.getVoidTy(),
3415 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3416 /* IsVarArg */ false);
3417 Function *GtLCFunc =
3418      Function::Create(FuncTy, GlobalValue::InternalLinkage,
3419                       "_omp_reduction_global_to_list_copy_func", &M);
3420 GtLCFunc->setAttributes(FuncAttrs);
3421 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3422 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3423 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3424
3425 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3426 Builder.SetInsertPoint(EntryBlock);
3427
3428 // Buffer: global reduction buffer.
3429 Argument *BufferArg = GtLCFunc->getArg(0);
3430 // Idx: index of the buffer.
3431 Argument *IdxArg = GtLCFunc->getArg(1);
3432 // ReduceList: thread local Reduce list.
3433 Argument *ReduceListArg = GtLCFunc->getArg(2);
3434
3435 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3436 BufferArg->getName() + ".addr");
3437 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3438 IdxArg->getName() + ".addr");
3439 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3440 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3441 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3442 BufferArgAlloca, Builder.getPtrTy(),
3443 BufferArgAlloca->getName() + ".ascast");
3444 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3445 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3446 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3447 ReduceListArgAlloca, Builder.getPtrTy(),
3448 ReduceListArgAlloca->getName() + ".ascast");
3449 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3450 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3451 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3452
3453 Value *LocalReduceList =
3454 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3455 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3456 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3457 Type *IndexTy = Builder.getIndexTy(
3458 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3459 for (auto En : enumerate(ReductionInfos)) {
3460 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3461 auto *RedListArrayTy =
3462 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3463 // Reduce element = LocalReduceList[i]
3464 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3465 RedListArrayTy, LocalReduceList,
3466 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3467 // elemptr = ((CopyType*)(elemptrptr)) + I
3468 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3469 // Global = Buffer.VD[Idx];
3470 Value *BufferVD =
3471 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3472 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3473 ReductionsBufferTy, BufferVD, 0, En.index());
3474
3475 switch (RI.EvaluationKind) {
3476 case EvalKind::Scalar: {
3477 Type *ElemType = RI.ElementType;
3478
3479 if (!IsByRef.empty() && IsByRef[En.index()]) {
3480 ElemType = RI.ByRefElementType;
3481 InsertPointOrErrorTy GenResult =
3482 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3483
3484 if (!GenResult)
3485 return GenResult.takeError();
3486
3487 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3488 }
3489
3490 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3491 Builder.CreateStore(TargetElement, ElemPtr);
3492 break;
3493 }
3494 case EvalKind::Complex: {
3495 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3496 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3497 Value *SrcReal = Builder.CreateLoad(
3498 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3499 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3500 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3501 Value *SrcImg = Builder.CreateLoad(
3502 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3503
3504 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3505 RI.ElementType, ElemPtr, 0, 0, ".realp");
3506 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3507 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3508 Builder.CreateStore(SrcReal, DestRealPtr);
3509 Builder.CreateStore(SrcImg, DestImgPtr);
3510 break;
3511 }
3512 case EvalKind::Aggregate: {
3513 Value *SizeVal =
3514 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3515 Builder.CreateMemCpy(
3516 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3517 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3518 SizeVal, false);
3519 break;
3520 }
3521 }
3522 }
3523
3524 Builder.CreateRetVoid();
3525 Builder.restoreIP(OldIP);
3526 return GtLCFunc;
3527}
3528
3529Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
3530 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3531 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3532 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3533 LLVMContext &Ctx = M.getContext();
3534 auto *FuncTy = FunctionType::get(
3535 Builder.getVoidTy(),
3536 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3537 /* IsVarArg */ false);
3538 Function *GtLRFunc =
3539      Function::Create(FuncTy, GlobalValue::InternalLinkage,
3540                       "_omp_reduction_global_to_list_reduce_func", &M);
3541 GtLRFunc->setAttributes(FuncAttrs);
3542 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
3543 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
3544 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
3545
3546 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
3547 Builder.SetInsertPoint(EntryBlock);
3548
3549 // Buffer: global reduction buffer.
3550 Argument *BufferArg = GtLRFunc->getArg(0);
3551 // Idx: index of the buffer.
3552 Argument *IdxArg = GtLRFunc->getArg(1);
3553 // ReduceList: thread local Reduce list.
3554 Argument *ReduceListArg = GtLRFunc->getArg(2);
3555
3556 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3557 BufferArg->getName() + ".addr");
3558 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3559 IdxArg->getName() + ".addr");
3560 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3561 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3562 ArrayType *RedListArrayTy =
3563 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3564
3565 // 1. Build a list of reduction variables.
3566 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3567 Value *LocalReduceList =
3568 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3569
3570 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3571
3572 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3573 BufferArgAlloca, Builder.getPtrTy(),
3574 BufferArgAlloca->getName() + ".ascast");
3575 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3576 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3577 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3578 ReduceListArgAlloca, Builder.getPtrTy(),
3579 ReduceListArgAlloca->getName() + ".ascast");
3580 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3581 LocalReduceList, Builder.getPtrTy(),
3582 LocalReduceList->getName() + ".ascast");
3583
3584 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3585 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3586 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3587
3588 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3589 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3590 Type *IndexTy = Builder.getIndexTy(
3591 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3592 for (auto En : enumerate(ReductionInfos)) {
3593 const ReductionInfo &RI = En.value();
3594 Value *ByRefAlloc;
3595
3596 if (!IsByRef.empty() && IsByRef[En.index()]) {
3597 InsertPointTy OldIP = Builder.saveIP();
3598 Builder.restoreIP(AllocaIP);
3599
3600 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3601 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3602 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3603
3604 Builder.restoreIP(OldIP);
3605 }
3606
3607 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3608 RedListArrayTy, ReductionList,
3609 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3610 // Global = Buffer.VD[Idx];
3611 Value *BufferVD =
3612 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3613 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3614 ReductionsBufferTy, BufferVD, 0, En.index());
3615
3616 if (!IsByRef.empty() && IsByRef[En.index()]) {
3617 Value *ByRefDataPtr;
3618 InsertPointOrErrorTy GenResult =
3619 RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr);
3620 if (!GenResult)
3621 return GenResult.takeError();
3622
3623 Builder.CreateStore(GlobValPtr, ByRefDataPtr);
3624 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3625 } else {
3626 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3627 }
3628 }
3629
3630 // Call reduce_function(ReduceList, GlobalReduceList)
3631 Value *ReduceList =
3632 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3633 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
3634 ->addFnAttr(Attribute::NoUnwind);
3635 Builder.CreateRetVoid();
3636 Builder.restoreIP(OldIP);
3637 return GtLRFunc;
3638}
3639
3640std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
3641 std::string Suffix =
3642 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
3643 return (Name + Suffix).str();
3644}
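// For example, assuming the platform separator resolved by
// createPlatformSpecificName is ".", a reducer named "foo" would map to
// "foo.omp.reduction.reduction_func" (illustrative only; the separator is
// target-dependent).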
3645
3646Expected<Function *> OpenMPIRBuilder::createReductionFunction(
3647 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
3648 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
3649 AttributeList FuncAttrs) {
3650 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
3651 {Builder.getPtrTy(), Builder.getPtrTy()},
3652 /* IsVarArg */ false);
3653 std::string Name = getReductionFuncName(ReducerName);
3654 Function *ReductionFunc =
3655      Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
3656  ReductionFunc->setAttributes(FuncAttrs);
3657 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
3658 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
3659 BasicBlock *EntryBB =
3660 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
3661 Builder.SetInsertPoint(EntryBB);
3662
3663 // Need to alloca memory here and deal with the pointers before getting
3664 // LHS/RHS pointers out
3665 Value *LHSArrayPtr = nullptr;
3666 Value *RHSArrayPtr = nullptr;
3667 Argument *Arg0 = ReductionFunc->getArg(0);
3668 Argument *Arg1 = ReductionFunc->getArg(1);
3669 Type *Arg0Type = Arg0->getType();
3670 Type *Arg1Type = Arg1->getType();
3671
3672 Value *LHSAlloca =
3673 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
3674 Value *RHSAlloca =
3675 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
3676 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3677 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
3678 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3679 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
3680 Builder.CreateStore(Arg0, LHSAddrCast);
3681 Builder.CreateStore(Arg1, RHSAddrCast);
3682 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
3683 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
3684
3685 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3686 Type *IndexTy = Builder.getIndexTy(
3687 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3688 SmallVector<Value *> LHSPtrs, RHSPtrs;
3689 for (auto En : enumerate(ReductionInfos)) {
3690 const ReductionInfo &RI = En.value();
3691 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
3692 RedArrayTy, RHSArrayPtr,
3693 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3694 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
3695 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3696 RHSI8Ptr, RI.PrivateVariable->getType(),
3697 RHSI8Ptr->getName() + ".ascast");
3698
3699 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
3700 RedArrayTy, LHSArrayPtr,
3701 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3702 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
3703 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3704 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
3705
3706 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3707 LHSPtrs.emplace_back(LHSPtr);
3708 RHSPtrs.emplace_back(RHSPtr);
3709 } else {
3710 Value *LHS = LHSPtr;
3711 Value *RHS = RHSPtr;
3712
3713 if (!IsByRef.empty() && !IsByRef[En.index()]) {
3714 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
3715 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
3716 }
3717
3718 Value *Reduced;
3719 InsertPointOrErrorTy AfterIP =
3720 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
3721 if (!AfterIP)
3722 return AfterIP.takeError();
3723 if (!Builder.GetInsertBlock())
3724 return ReductionFunc;
3725
3726 Builder.restoreIP(*AfterIP);
3727
3728 if (!IsByRef.empty() && !IsByRef[En.index()])
3729 Builder.CreateStore(Reduced, LHSPtr);
3730 }
3731 }
3732
3733 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
3734 for (auto En : enumerate(ReductionInfos)) {
3735 unsigned Index = En.index();
3736 const ReductionInfo &RI = En.value();
3737 Value *LHSFixupPtr, *RHSFixupPtr;
3738 Builder.restoreIP(RI.ReductionGenClang(
3739 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
3740
3741      // Fix the callback code generated to use the correct Values for the
3742      // LHS and RHS.
3743 LHSFixupPtr->replaceUsesWithIf(
3744 LHSPtrs[Index], [ReductionFunc](const Use &U) {
3745 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3746 ReductionFunc;
3747 });
3748 RHSFixupPtr->replaceUsesWithIf(
3749 RHSPtrs[Index], [ReductionFunc](const Use &U) {
3750 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3751 ReductionFunc;
3752 });
3753 }
3754
3755 Builder.CreateRetVoid();
3756 return ReductionFunc;
3757}
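// For illustration, the reduction function built above behaves like the
// following C-style sketch for two by-value reductions (T0/T1 and the
// reduce operators are placeholders, not names from this file):
//
//   void reduction_func(void **lhs, void **rhs) {
//     *(T0 *)lhs[0] = reduce0(*(T0 *)lhs[0], *(T0 *)rhs[0]);
//     *(T1 *)lhs[1] = reduce1(*(T1 *)lhs[1], *(T1 *)rhs[1]);
//   }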
3758
3759static void
3760checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
3761                    bool IsGPU) {
3762 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
3763 (void)RI;
3764 assert(RI.Variable && "expected non-null variable");
3765 assert(RI.PrivateVariable && "expected non-null private variable");
3766 assert((RI.ReductionGen || RI.ReductionGenClang) &&
3767 "expected non-null reduction generator callback");
3768 if (!IsGPU) {
3769 assert(
3770 RI.Variable->getType() == RI.PrivateVariable->getType() &&
3771 "expected variables and their private equivalents to have the same "
3772 "type");
3773 }
3774 assert(RI.Variable->getType()->isPointerTy() &&
3775 "expected variables to be pointers");
3776 }
3777}
3778
3779OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
3780 const LocationDescription &Loc, InsertPointTy AllocaIP,
3781 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
3782 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
3783 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
3784 unsigned ReductionBufNum, Value *SrcLocInfo) {
3785 if (!updateToLocation(Loc))
3786 return InsertPointTy();
3787 Builder.restoreIP(CodeGenIP);
3788 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
3789 LLVMContext &Ctx = M.getContext();
3790
3791 // Source location for the ident struct
3792 if (!SrcLocInfo) {
3793 uint32_t SrcLocStrSize;
3794 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
3795 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
3796 }
3797
3798 if (ReductionInfos.size() == 0)
3799 return Builder.saveIP();
3800
3801 BasicBlock *ContinuationBlock = nullptr;
3802 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
3803 // Copied code from createReductions
3804 BasicBlock *InsertBlock = Loc.IP.getBlock();
3805 ContinuationBlock =
3806 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
3807 InsertBlock->getTerminator()->eraseFromParent();
3808 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
3809 }
3810
3811 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3812 AttributeList FuncAttrs;
3813 AttrBuilder AttrBldr(Ctx);
3814 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
3815 AttrBldr.addAttribute(Attr);
3816 AttrBldr.removeAttribute(Attribute::OptimizeNone);
3817 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
3818
3819 CodeGenIP = Builder.saveIP();
3820 Expected<Function *> ReductionResult = createReductionFunction(
3821 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
3822 ReductionGenCBKind, FuncAttrs);
3823 if (!ReductionResult)
3824 return ReductionResult.takeError();
3825 Function *ReductionFunc = *ReductionResult;
3826 Builder.restoreIP(CodeGenIP);
3827
3828 // Set the grid value in the config needed for lowering later on
3829 if (GridValue.has_value())
3830 Config.setGridValue(GridValue.value());
3831 else
3832 Config.setGridValue(getGridValue(T, ReductionFunc));
3833
3834 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3835 // RedList, shuffle_reduce_func, interwarp_copy_func);
3836 // or
3837 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
3838 Value *Res;
3839
3840 // 1. Build a list of reduction variables.
3841 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3842 auto Size = ReductionInfos.size();
3843 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
3844 Type *FuncPtrTy =
3845 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
3846 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
3847 CodeGenIP = Builder.saveIP();
3848 Builder.restoreIP(AllocaIP);
3849 Value *ReductionListAlloca =
3850 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
3851 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3852 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
3853 Builder.restoreIP(CodeGenIP);
3854 Type *IndexTy = Builder.getIndexTy(
3855 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3856 for (auto En : enumerate(ReductionInfos)) {
3857 const ReductionInfo &RI = En.value();
3858 Value *ElemPtr = Builder.CreateInBoundsGEP(
3859 RedArrayTy, ReductionList,
3860 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3861
3862 Value *PrivateVar = RI.PrivateVariable;
3863 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3864 if (IsByRefElem)
3865 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
3866
3867 Value *CastElem =
3868 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
3869 Builder.CreateStore(CastElem, ElemPtr);
3870 }
3871 CodeGenIP = Builder.saveIP();
3872 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
3873 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
3874
3875 if (!SarFunc)
3876 return SarFunc.takeError();
3877
3878 Expected<Function *> CopyResult =
3879 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
3880 if (!CopyResult)
3881 return CopyResult.takeError();
3882 Function *WcFunc = *CopyResult;
3883 Builder.restoreIP(CodeGenIP);
3884
3885 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
3886
3887 unsigned MaxDataSize = 0;
3888 SmallVector<Type *> ReductionTypeArgs;
3889 for (auto En : enumerate(ReductionInfos)) {
3890 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
3891 if (Size > MaxDataSize)
3892 MaxDataSize = Size;
3893 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
3894 ? En.value().ByRefElementType
3895 : En.value().ElementType;
3896 ReductionTypeArgs.emplace_back(RedTypeArg);
3897 }
3898 Value *ReductionDataSize =
3899 Builder.getInt64(MaxDataSize * ReductionInfos.size());
3900 if (!IsTeamsReduction) {
3901 Value *SarFuncCast =
3902 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
3903 Value *WcFuncCast =
3904 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
3905 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
3906 WcFuncCast};
3907 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
3908 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
3909 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
3910 } else {
3911 CodeGenIP = Builder.saveIP();
3912 StructType *ReductionsBufferTy = StructType::create(
3913 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
3914 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
3915 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
3916
3917 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
3918 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
3919 if (!LtGCFunc)
3920 return LtGCFunc.takeError();
3921
3922 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
3923 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
3924 if (!LtGRFunc)
3925 return LtGRFunc.takeError();
3926
3927 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
3928 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
3929 if (!GtLCFunc)
3930 return GtLCFunc.takeError();
3931
3932 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
3933 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
3934 if (!GtLRFunc)
3935 return GtLRFunc.takeError();
3936
3937 Builder.restoreIP(CodeGenIP);
3938
3939 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
3940 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
3941
3942 Value *Args3[] = {SrcLocInfo,
3943 KernelTeamsReductionPtr,
3944 Builder.getInt32(ReductionBufNum),
3945 ReductionDataSize,
3946 RL,
3947 *SarFunc,
3948 WcFunc,
3949 *LtGCFunc,
3950 *LtGRFunc,
3951 *GtLCFunc,
3952 *GtLRFunc};
3953
3954 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
3955 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
3956 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
3957 }
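  // For reference, the two runtime calls built above have roughly these
  // shapes (arguments abbreviated; see the Args and Args3 arrays above for
  // the exact order):
  //
  //   res = __kmpc_nvptx_parallel_reduce_nowait_v2(
  //       loc, size, red_list, shuffle_and_reduce, inter_warp_copy);
  //   res = __kmpc_nvptx_teams_reduce_nowait_v2(
  //       loc, buffer, num_buffers, size, red_list, shuffle_and_reduce,
  //       inter_warp_copy, list_to_global_copy, list_to_global_reduce,
  //       global_to_list_copy, global_to_list_reduce);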
3958
3959 // 5. Build if (res == 1)
3960 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
3961 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
3962 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
3963 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
3964
3965 // 6. Build then branch: where we have reduced values in the master
3966 // thread in each team.
3967 // __kmpc_end_reduce{_nowait}(<gtid>);
3968 // break;
3969 emitBlock(ThenBB, CurFunc);
3970
3971 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
3972 for (auto En : enumerate(ReductionInfos)) {
3973 const ReductionInfo &RI = En.value();
3974 Type *ValueType = RI.ElementType;
3975 Value *RedValue = RI.Variable;
3976 Value *RHS =
3977 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
3978
3979 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
3980 Value *LHSPtr, *RHSPtr;
3981 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
3982 &LHSPtr, &RHSPtr, CurFunc));
3983
3984      // Fix the callback code generated to use the correct Values for the
3985      // LHS and RHS.
3986 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
3987 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3988 ReductionFunc;
3989 });
3990 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
3991 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
3992 ReductionFunc;
3993 });
3994 } else {
3995 if (IsByRef.empty() || !IsByRef[En.index()]) {
3996 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
3997 "red.value." + Twine(En.index()));
3998 }
3999 Value *PrivateRedValue = Builder.CreateLoad(
4000 ValueType, RHS, "red.private.value" + Twine(En.index()));
4001 Value *Reduced;
4002 InsertPointOrErrorTy AfterIP =
4003 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4004 if (!AfterIP)
4005 return AfterIP.takeError();
4006 Builder.restoreIP(*AfterIP);
4007
4008 if (!IsByRef.empty() && !IsByRef[En.index()])
4009 Builder.CreateStore(Reduced, RI.Variable);
4010 }
4011 }
4012 emitBlock(ExitBB, CurFunc);
4013 if (ContinuationBlock) {
4014 Builder.CreateBr(ContinuationBlock);
4015 Builder.SetInsertPoint(ContinuationBlock);
4016 }
4017 Config.setEmitLLVMUsed();
4018
4019 return Builder.saveIP();
4020}
4021
4022static Function *getFreshReductionFunc(Module &M) {
4023  Type *VoidTy = Type::getVoidTy(M.getContext());
4024 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4025 auto *FuncTy =
4026 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4027  return Function::Create(FuncTy, GlobalValue::InternalLinkage,
4028                          ".omp.reduction.func", &M);
4029}
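// Note: LLVM uniques symbol names at module scope, so repeated calls yield
// ".omp.reduction.func", ".omp.reduction.func.1", and so on; hence "fresh".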
4030
4031static Error populateReductionFunction(
4032    Function *ReductionFunc,
4033    ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4034    IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4035 Module *Module = ReductionFunc->getParent();
4036 BasicBlock *ReductionFuncBlock =
4037 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4038 Builder.SetInsertPoint(ReductionFuncBlock);
4039 Value *LHSArrayPtr = nullptr;
4040 Value *RHSArrayPtr = nullptr;
4041 if (IsGPU) {
4042 // Need to alloca memory here and deal with the pointers before getting
4043 // LHS/RHS pointers out
4044 //
4045 Argument *Arg0 = ReductionFunc->getArg(0);
4046 Argument *Arg1 = ReductionFunc->getArg(1);
4047 Type *Arg0Type = Arg0->getType();
4048 Type *Arg1Type = Arg1->getType();
4049
4050 Value *LHSAlloca =
4051 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4052 Value *RHSAlloca =
4053 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4054 Value *LHSAddrCast =
4055 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4056 Value *RHSAddrCast =
4057 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4058 Builder.CreateStore(Arg0, LHSAddrCast);
4059 Builder.CreateStore(Arg1, RHSAddrCast);
4060 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4061 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4062 } else {
4063 LHSArrayPtr = ReductionFunc->getArg(0);
4064 RHSArrayPtr = ReductionFunc->getArg(1);
4065 }
4066
4067 unsigned NumReductions = ReductionInfos.size();
4068 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4069
4070 for (auto En : enumerate(ReductionInfos)) {
4071 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4072 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4073 RedArrayTy, LHSArrayPtr, 0, En.index());
4074 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4075 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4076 LHSI8Ptr, RI.Variable->getType());
4077 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4078 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4079 RedArrayTy, RHSArrayPtr, 0, En.index());
4080 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4081 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4082 RHSI8Ptr, RI.PrivateVariable->getType());
4083 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4084 Value *Reduced;
4085 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4086 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4087 if (!AfterIP)
4088 return AfterIP.takeError();
4089
4090 Builder.restoreIP(*AfterIP);
4091 // TODO: Consider flagging an error.
4092 if (!Builder.GetInsertBlock())
4093 return Error::success();
4094
4095    // The store is inside the reduction region when using by-ref.
4096 if (!IsByRef[En.index()])
4097 Builder.CreateStore(Reduced, LHSPtr);
4098 }
4099 Builder.CreateRetVoid();
4100 return Error::success();
4101}
4102
4103OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
4104 const LocationDescription &Loc, InsertPointTy AllocaIP,
4105 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4106 bool IsNoWait, bool IsTeamsReduction) {
4107 assert(ReductionInfos.size() == IsByRef.size());
4108 if (Config.isGPU())
4109 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4110 IsByRef, IsNoWait, IsTeamsReduction);
4111
4112 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4113
4114 if (!updateToLocation(Loc))
4115 return InsertPointTy();
4116
4117 if (ReductionInfos.size() == 0)
4118 return Builder.saveIP();
4119
4120 BasicBlock *InsertBlock = Loc.IP.getBlock();
4121 BasicBlock *ContinuationBlock =
4122 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4123 InsertBlock->getTerminator()->eraseFromParent();
4124
4125 // Create and populate array of type-erased pointers to private reduction
4126 // values.
4127 unsigned NumReductions = ReductionInfos.size();
4128 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4129 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4130 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4131
4132 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4133
4134 for (auto En : enumerate(ReductionInfos)) {
4135 unsigned Index = En.index();
4136 const ReductionInfo &RI = En.value();
4137 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4138 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4139 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4140 }
4141
4142 // Emit a call to the runtime function that orchestrates the reduction.
4143 // Declare the reduction function in the process.
4144 Type *IndexTy = Builder.getIndexTy(
4145 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4146 Function *Func = Builder.GetInsertBlock()->getParent();
4147 Module *Module = Func->getParent();
4148 uint32_t SrcLocStrSize;
4149 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4150 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4151 return RI.AtomicReductionGen;
4152 });
4153 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4154 CanGenerateAtomic
4155 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4156 : IdentFlag(0));
4157 Value *ThreadId = getOrCreateThreadID(Ident);
4158 Constant *NumVariables = Builder.getInt32(NumReductions);
4159 const DataLayout &DL = Module->getDataLayout();
4160 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4161 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4162 Function *ReductionFunc = getFreshReductionFunc(*Module);
4163 Value *Lock = getOMPCriticalRegionLock(".reduction");
4164 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4165 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4166 : RuntimeFunction::OMPRTL___kmpc_reduce);
4167 CallInst *ReduceCall =
4168 createRuntimeFunctionCall(ReduceFunc,
4169 {Ident, ThreadId, NumVariables, RedArraySize,
4170 RedArray, ReductionFunc, Lock},
4171 "reduce");
4172
4173 // Create final reduction entry blocks for the atomic and non-atomic case.
4174 // Emit IR that dispatches control flow to one of the blocks based on the
4175 // reduction supporting the atomic mode.
4176 BasicBlock *NonAtomicRedBlock =
4177 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4178 BasicBlock *AtomicRedBlock =
4179 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4180  SwitchInst *Switch =
4181      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4182 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4183 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4184
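  // For illustration, the dispatch emitted above has roughly this shape
  // (a sketch, not the exact IR):
  //
  //   %res = call i32 @__kmpc_reduce[_nowait](%ident, %gtid, <n>, <size>,
  //                                           %red.array, @red.func, %lock)
  //   switch i32 %res, label %reduce.finalize [
  //     i32 1, label %reduce.switch.nonatomic
  //     i32 2, label %reduce.switch.atomic
  //   ]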
4185 // Populate the non-atomic reduction using the elementwise reduction function.
4186 // This loads the elements from the global and private variables and reduces
4187 // them before storing back the result to the global variable.
4188 Builder.SetInsertPoint(NonAtomicRedBlock);
4189 for (auto En : enumerate(ReductionInfos)) {
4190 const ReductionInfo &RI = En.value();
4191 Type *ValueType = RI.ElementType;
4192    // We have one less load for the by-ref case because that load is now
4193    // inside the reduction region.
4194 Value *RedValue = RI.Variable;
4195 if (!IsByRef[En.index()]) {
4196 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4197 "red.value." + Twine(En.index()));
4198 }
4199 Value *PrivateRedValue =
4200 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4201 "red.private.value." + Twine(En.index()));
4202 Value *Reduced;
4203 InsertPointOrErrorTy AfterIP =
4204 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4205 if (!AfterIP)
4206 return AfterIP.takeError();
4207 Builder.restoreIP(*AfterIP);
4208
4209 if (!Builder.GetInsertBlock())
4210 return InsertPointTy();
4211    // For the by-ref case, the load is inside the reduction region.
4212 if (!IsByRef[En.index()])
4213 Builder.CreateStore(Reduced, RI.Variable);
4214 }
4215 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4216 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4217 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4218 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4219 Builder.CreateBr(ContinuationBlock);
4220
4221  // Populate the atomic reduction using the atomic elementwise reduction
4222  // function. There are no loads/stores here because they happen inside
4223  // the atomic elementwise reduction itself.
4224 Builder.SetInsertPoint(AtomicRedBlock);
4225 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4226 for (const ReductionInfo &RI : ReductionInfos) {
4227 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4228 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4229 if (!AfterIP)
4230 return AfterIP.takeError();
4231 Builder.restoreIP(*AfterIP);
4232 if (!Builder.GetInsertBlock())
4233 return InsertPointTy();
4234 }
4235 Builder.CreateBr(ContinuationBlock);
4236 } else {
4237 Builder.CreateUnreachable();
4238 }
4239
4240 // Populate the outlined reduction function using the elementwise reduction
4241 // function. Partial values are extracted from the type-erased array of
4242 // pointers to private variables.
4243 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4244 IsByRef, /*isGPU=*/false);
4245 if (Err)
4246 return Err;
4247
4248 if (!Builder.GetInsertBlock())
4249 return InsertPointTy();
4250
4251 Builder.SetInsertPoint(ContinuationBlock);
4252 return Builder.saveIP();
4253}
4254
4255OpenMPIRBuilder::InsertPointOrErrorTy
4256OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4257 BodyGenCallbackTy BodyGenCB,
4258 FinalizeCallbackTy FiniCB) {
4259 if (!updateToLocation(Loc))
4260 return Loc.IP;
4261
4262 Directive OMPD = Directive::OMPD_master;
4263 uint32_t SrcLocStrSize;
4264 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4265 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4266 Value *ThreadId = getOrCreateThreadID(Ident);
4267 Value *Args[] = {Ident, ThreadId};
4268
4269 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4270 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4271
4272 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4273 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4274
4275 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4276 /*Conditional*/ true, /*hasFinalize*/ true);
4277}
4278
4279OpenMPIRBuilder::InsertPointOrErrorTy
4280OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4281 BodyGenCallbackTy BodyGenCB,
4282 FinalizeCallbackTy FiniCB, Value *Filter) {
4283 if (!updateToLocation(Loc))
4284 return Loc.IP;
4285
4286 Directive OMPD = Directive::OMPD_masked;
4287 uint32_t SrcLocStrSize;
4288 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4289 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4290 Value *ThreadId = getOrCreateThreadID(Ident);
4291 Value *Args[] = {Ident, ThreadId, Filter};
4292 Value *ArgsEnd[] = {Ident, ThreadId};
4293
4294 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4295 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4296
4297 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4298 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4299
4300 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4301 /*Conditional*/ true, /*hasFinalize*/ true);
4302}
4303
4304static llvm::CallInst *emitNoUnwindRuntimeCall(llvm::IRBuilderBase &Builder,
4305 llvm::FunctionCallee Callee,
4306 ArrayRef<llvm::Value *> Args,
4307 const llvm::Twine &Name) {
4308 llvm::CallInst *Call = Builder.CreateCall(
4309 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4310 Call->setDoesNotThrow();
4311 return Call;
4312}
4313
4314// Expects the input basic block to be dominated by BeforeScanBB. Once the
4315// scan directive is encountered, the code after it must be dominated by
4316// AfterScanBB. The scan directive splits the code sequence into an input
4317// phase and a scan phase. Depending on whether the inclusive or exclusive
4318// clause is used in the scan directive, and on whether the input loop or the
4319// scan loop is being lowered, it adds jumps to the input and scan phases.
4320// The first scan loop is the input loop and the second is the scan loop. The
4321// generated code currently handles only inclusive scans.
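// For illustration (a sketch of the lowering, not emitted verbatim): an
// inclusive scan over a single reduction variable `red` becomes two loops
// that communicate through a temporary buffer:
//   for (i : 0..n) { <input phase>; buffer[i] = red; }  // input loop
//   <parallel prefix computation over buffer>           // emitScanReduction
//   for (i : 0..n) { red = buffer[i]; <scan phase>; }   // scan loop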
4322OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4323 const LocationDescription &Loc, InsertPointTy AllocaIP,
4324 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4325 bool IsInclusive, ScanInfo *ScanRedInfo) {
4326 if (ScanRedInfo->OMPFirstScanLoop) {
4327 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4328 ScanVarsType, ScanRedInfo);
4329 if (Err)
4330 return Err;
4331 }
4332 if (!updateToLocation(Loc))
4333 return Loc.IP;
4334
4335 llvm::Value *IV = ScanRedInfo->IV;
4336
4337 if (ScanRedInfo->OMPFirstScanLoop) {
4338 // Emit buffer[i] = red; at the end of the input phase.
4339 for (size_t i = 0; i < ScanVars.size(); i++) {
4340 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4341 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4342 Type *DestTy = ScanVarsType[i];
4343 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4344 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4345
4346 Builder.CreateStore(Src, Val);
4347 }
4348 }
4349 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4350 emitBlock(ScanRedInfo->OMPScanDispatch,
4351 Builder.GetInsertBlock()->getParent());
4352
4353 if (!ScanRedInfo->OMPFirstScanLoop) {
4354 IV = ScanRedInfo->IV;
4355 // Emit red = buffer[i]; at the entrance to the scan phase.
4356 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4357 for (size_t i = 0; i < ScanVars.size(); i++) {
4358 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4359 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4360 Type *DestTy = ScanVarsType[i];
4361 Value *SrcPtr =
4362 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4363 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4364 Builder.CreateStore(Src, ScanVars[i]);
4365 }
4366 }
4367
4368 // TODO: Update it to CreateBr and remove dead blocks
4369 llvm::Value *CmpI = Builder.getInt1(true);
4370 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4371 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4372 ScanRedInfo->OMPAfterScanBlock);
4373 } else {
4374 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4375 ScanRedInfo->OMPBeforeScanBlock);
4376 }
4377 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4378 Builder.GetInsertBlock()->getParent());
4379 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4380 return Builder.saveIP();
4381}
4382
4383Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4384 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4385 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4386
4387 Builder.restoreIP(AllocaIP);
4388 // Create the shared pointer at alloca IP.
4389 for (size_t i = 0; i < ScanVars.size(); i++) {
4390 llvm::Value *BuffPtr =
4391 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4392 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4393 }
4394
4395 // Allocate the temporary buffer on the master thread.
4396 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4397 InsertPointTy CodeGenIP) -> Error {
4398 Builder.restoreIP(CodeGenIP);
4399 Value *AllocSpan =
4400 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4401 for (size_t i = 0; i < ScanVars.size(); i++) {
4402 Type *IntPtrTy = Builder.getInt32Ty();
4403 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4404 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4405 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4406 AllocSpan, nullptr, "arr");
4407 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4408 }
4409 return Error::success();
4410 };
4411 // TODO: Perform finalization actions for variables. This has to be
4412 // called for variables which have destructors/finalizers.
4413 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4414
4415 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4416 llvm::Value *FilterVal = Builder.getInt32(0);
4417 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4418 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4419
4420 if (!AfterIP)
4421 return AfterIP.takeError();
4422 Builder.restoreIP(*AfterIP);
4423 BasicBlock *InputBB = Builder.GetInsertBlock();
4424 if (InputBB->getTerminator())
4425 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4426 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4427 if (!AfterIP)
4428 return AfterIP.takeError();
4429 Builder.restoreIP(*AfterIP);
4430
4431 return Error::success();
4432}
4433
4434Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4435 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4436 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4437 InsertPointTy CodeGenIP) -> Error {
4438 Builder.restoreIP(CodeGenIP);
4439 for (ReductionInfo RedInfo : ReductionInfos) {
4440 Value *PrivateVar = RedInfo.PrivateVariable;
4441 Value *OrigVar = RedInfo.Variable;
4442 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4443 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4444
4445 Type *SrcTy = RedInfo.ElementType;
4446 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4447 "arrayOffset");
4448 Value *Src = Builder.CreateLoad(SrcTy, Val);
4449
4450 Builder.CreateStore(Src, OrigVar);
4451 Builder.CreateFree(Buff);
4452 }
4453 return Error::success();
4454 };
4455 // TODO: Perform finalization actions for variables. This has to be
4456 // called for variables which have destructors/finalizers.
4457 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4458
4459 if (ScanRedInfo->OMPScanFinish->getTerminator())
4460 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4461 else
4462 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4463
4464 llvm::Value *FilterVal = Builder.getInt32(0);
4465 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4466 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4467
4468 if (!AfterIP)
4469 return AfterIP.takeError();
4470 Builder.restoreIP(*AfterIP);
4471 BasicBlock *InputBB = Builder.GetInsertBlock();
4472 if (InputBB->getTerminator())
4473 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4474 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4475 if (!AfterIP)
4476 return AfterIP.takeError();
4477 Builder.restoreIP(*AfterIP);
4478 return Error::success();
4479}
4480
4481OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4482 const LocationDescription &Loc,
4483 ArrayRef<ReductionInfo> ReductionInfos,
4484 ScanInfo *ScanRedInfo) {
4485
4486 if (!updateToLocation(Loc))
4487 return Loc.IP;
4488 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4489 InsertPointTy CodeGenIP) -> Error {
4490 Builder.restoreIP(CodeGenIP);
4491 Function *CurFn = Builder.GetInsertBlock()->getParent();
4492 // for (int k = 0; k <= ceil(log2(n)); ++k)
4493 llvm::BasicBlock *LoopBB =
4494 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4495 llvm::BasicBlock *ExitBB =
4496 splitBB(Builder, false, "omp.outer.log.scan.exit");
4497 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4498 Builder.GetInsertBlock()->getModule(),
4499 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4500 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4501 llvm::Value *Arg =
4502 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4503 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4504 F = llvm::Intrinsic::getOrInsertDeclaration(
4505 Builder.GetInsertBlock()->getModule(),
4506 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4507 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4508 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4509 llvm::Value *NMin1 = Builder.CreateNUWSub(
4510 ScanRedInfo->Span,
4511 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4512 Builder.SetInsertPoint(InputBB);
4513 Builder.CreateBr(LoopBB);
4514 emitBlock(LoopBB, CurFn);
4515 Builder.SetInsertPoint(LoopBB);
4516
4517 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4518 // size pow2k = 1;
4519 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4520 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4521 InputBB);
4522 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4523 InputBB);
4524 // for (size i = n - 1; i >= 2 ^ k; --i)
4525 // tmp[i] op= tmp[i-pow2k];
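 // Worked example (illustrative, op = +, n = 4, values a,b,c,d in tmp[0..3]):
 //   k = 0, pow2k = 1: tmp[3] += tmp[2]; tmp[2] += tmp[1]; tmp[1] += tmp[0]
 //   k = 1, pow2k = 2: tmp[3] += tmp[1]; tmp[2] += tmp[0]
 // After ceil(log2(4)) = 2 rounds each tmp[i] holds the inclusive prefix
 // result, e.g. tmp[2] = a + b + c. The inner loop runs downward so that
 // every read sees the value from the previous round.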
4526 llvm::BasicBlock *InnerLoopBB =
4527 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4528 llvm::BasicBlock *InnerExitBB =
4529 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4530 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4531 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4532 emitBlock(InnerLoopBB, CurFn);
4533 Builder.SetInsertPoint(InnerLoopBB);
4534 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4535 IVal->addIncoming(NMin1, LoopBB);
4536 for (ReductionInfo RedInfo : ReductionInfos) {
4537 Value *ReductionVal = RedInfo.PrivateVariable;
4538 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4539 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4540 Type *DestTy = RedInfo.ElementType;
4541 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4542 Value *LHSPtr =
4543 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4544 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4545 Value *RHSPtr =
4546 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4547 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4548 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4549 Value *Result;
4550 InsertPointOrErrorTy AfterIP =
4551 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4552 if (!AfterIP)
4553 return AfterIP.takeError();
4554 Builder.CreateStore(Result, LHSPtr);
4555 }
4556 llvm::Value *NextIVal = Builder.CreateNUWSub(
4557 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4558 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4559 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4560 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4561 emitBlock(InnerExitBB, CurFn);
4562 llvm::Value *Next = Builder.CreateNUWAdd(
4563 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4564 Counter->addIncoming(Next, Builder.GetInsertBlock());
4565 // pow2k <<= 1;
4566 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4567 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4568 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4569 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4570 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4571 return Error::success();
4572 };
4573
4574 // TODO: Perform finalization actions for variables. This has to be
4575 // called for variables which have destructors/finalizers.
4576 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4577
4578 llvm::Value *FilterVal = Builder.getInt32(0);
4579 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4580 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4581
4582 if (!AfterIP)
4583 return AfterIP.takeError();
4584 Builder.restoreIP(*AfterIP);
4585 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4586
4587 if (!AfterIP)
4588 return AfterIP.takeError();
4589 Builder.restoreIP(*AfterIP);
4590 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
4591 if (Err)
4592 return Err;
4593
4594 return AfterIP;
4595}
4596
4597Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
4598 llvm::function_ref<Error()> InputLoopGen,
4599 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
4600 ScanInfo *ScanRedInfo) {
4601
4602 {
4603 // Emit loop with input phase:
4604 // for (i: 0..<num_iters>) {
4605 // <input phase>;
4606 // buffer[i] = red;
4607 // }
4608 ScanRedInfo->OMPFirstScanLoop = true;
4609 Error Err = InputLoopGen();
4610 if (Err)
4611 return Err;
4612 }
4613 {
4614 // Emit loop with scan phase:
4615 // for (i: 0..<num_iters>) {
4616 // red = buffer[i];
4617 // <scan phase>;
4618 // }
4619 ScanRedInfo->OMPFirstScanLoop = false;
4620 Error Err = ScanLoopGen(Builder.saveIP());
4621 if (Err)
4622 return Err;
4623 }
4624 return Error::success();
4625}
4626
4627void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
4628 Function *Fun = Builder.GetInsertBlock()->getParent();
4629 ScanRedInfo->OMPScanDispatch =
4630 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
4631 ScanRedInfo->OMPAfterScanBlock =
4632 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
4633 ScanRedInfo->OMPBeforeScanBlock =
4634 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
4635 ScanRedInfo->OMPScanLoopExit =
4636 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
4637}
4638CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
4639 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
4640 BasicBlock *PostInsertBefore, const Twine &Name) {
4641 Module *M = F->getParent();
4642 LLVMContext &Ctx = M->getContext();
4643 Type *IndVarTy = TripCount->getType();
4644
4645 // Create the basic block structure.
4646 BasicBlock *Preheader =
4647 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
4648 BasicBlock *Header =
4649 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
4650 BasicBlock *Cond =
4651 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
4652 BasicBlock *Body =
4653 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
4654 BasicBlock *Latch =
4655 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
4656 BasicBlock *Exit =
4657 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
4658 BasicBlock *After =
4659 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
4660
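 // The skeleton built above has this shape (a sketch of the resulting CFG):
 //   preheader -> header -> cond --(iv < tripcount)--> body -> inc -> header
 //                               \--(otherwise)------> exit -> after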
4661 // Use specified DebugLoc for new instructions.
4662 Builder.SetCurrentDebugLocation(DL);
4663
4664 Builder.SetInsertPoint(Preheader);
4665 Builder.CreateBr(Header);
4666
4667 Builder.SetInsertPoint(Header);
4668 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
4669 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
4670 Builder.CreateBr(Cond);
4671
4672 Builder.SetInsertPoint(Cond);
4673 Value *Cmp =
4674 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
4675 Builder.CreateCondBr(Cmp, Body, Exit);
4676
4677 Builder.SetInsertPoint(Body);
4678 Builder.CreateBr(Latch);
4679
4680 Builder.SetInsertPoint(Latch);
4681 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
4682 "omp_" + Name + ".next", /*HasNUW=*/true);
4683 Builder.CreateBr(Header);
4684 IndVarPHI->addIncoming(Next, Latch);
4685
4686 Builder.SetInsertPoint(Exit);
4687 Builder.CreateBr(After);
4688
4689 // Remember and return the canonical control flow.
4690 LoopInfos.emplace_front();
4691 CanonicalLoopInfo *CL = &LoopInfos.front();
4692
4693 CL->Header = Header;
4694 CL->Cond = Cond;
4695 CL->Latch = Latch;
4696 CL->Exit = Exit;
4697
4698#ifndef NDEBUG
4699 CL->assertOK();
4700#endif
4701 return CL;
4702}
4703
4704Expected<CanonicalLoopInfo *>
4705OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
4706 LoopBodyGenCallbackTy BodyGenCB,
4707 Value *TripCount, const Twine &Name) {
4708 BasicBlock *BB = Loc.IP.getBlock();
4709 BasicBlock *NextBB = BB->getNextNode();
4710
4711 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
4712 NextBB, NextBB, Name);
4713 BasicBlock *After = CL->getAfter();
4714
4715 // If location is not set, don't connect the loop.
4716 if (updateToLocation(Loc)) {
4717 // Split the loop at the insertion point: branch to the preheader and move
4718 // every following instruction after the loop (into the After BB); the
4719 // loop's after block becomes the new successor of the insertion block.
4720 spliceBB(Builder, After, /*CreateBranch=*/false);
4721 Builder.CreateBr(CL->getPreheader());
4722 }
4723
4724 // Emit the body content. We do it after connecting the loop to the CFG so
4725 // that the callback does not encounter degenerate BBs.
4726 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
4727 return Err;
4728
4729#ifndef NDEBUG
4730 CL->assertOK();
4731#endif
4732 return CL;
4733}
4734
4735Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
4736 ScanInfos.emplace_front();
4737 ScanInfo *Result = &ScanInfos.front();
4738 return Result;
4739}
4740
4741Expected<SmallVector<llvm::CanonicalLoopInfo *>>
4742OpenMPIRBuilder::createCanonicalScanLoops(
4743 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4744 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4745 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
4746 LocationDescription ComputeLoc =
4747 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4748 updateToLocation(ComputeLoc);
4749
4750 SmallVector<llvm::CanonicalLoopInfo *> Result;
4751
4752 Value *TripCount = calculateCanonicalLoopTripCount(
4753 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4754 ScanRedInfo->Span = TripCount;
4755 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
4756 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
4757
4758 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4759 Builder.restoreIP(CodeGenIP);
4760 ScanRedInfo->IV = IV;
4761 createScanBBs(ScanRedInfo);
4762 BasicBlock *InputBlock = Builder.GetInsertBlock();
4763 Instruction *Terminator = InputBlock->getTerminator();
4764 assert(Terminator->getNumSuccessors() == 1);
4765 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
4766 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
4767 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
4768 Builder.GetInsertBlock()->getParent());
4769 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4770 emitBlock(ScanRedInfo->OMPScanLoopExit,
4771 Builder.GetInsertBlock()->getParent());
4772 Builder.CreateBr(ContinueBlock);
4773 Builder.SetInsertPoint(
4774 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
4775 return BodyGenCB(Builder.saveIP(), IV);
4776 };
4777
4778 const auto &&InputLoopGen = [&]() -> Error {
4779 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
4780 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
4781 ComputeIP, Name, true, ScanRedInfo);
4782 if (!LoopInfo)
4783 return LoopInfo.takeError();
4784 Result.push_back(*LoopInfo);
4785 Builder.restoreIP((*LoopInfo)->getAfterIP());
4786 return Error::success();
4787 };
4788 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
4789 Expected<CanonicalLoopInfo *> LoopInfo =
4790 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
4791 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
4792 if (!LoopInfo)
4793 return LoopInfo.takeError();
4794 Result.push_back(*LoopInfo);
4795 Builder.restoreIP((*LoopInfo)->getAfterIP());
4796 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
4797 return Error::success();
4798 };
4799 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
4800 if (Err)
4801 return Err;
4802 return Result;
4803}
4804
4805Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
4806 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
4807 bool IsSigned, bool InclusiveStop, const Twine &Name) {
4808
4809 // Consider the following difficulties (assuming 8-bit signed integers):
4810 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
4811 // DO I = 1, 100, 50
4812 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
4813 // DO I = 100, 0, -128
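 // Worked example (illustrative): for DO I = 1, 100, 50 with InclusiveStop,
 // Span = 99 and Incr = 50, so the inclusive formula below yields
 // 99 / 50 + 1 = 2 iterations (I = 1 and I = 51). Testing I <= 100 after the
 // increment 51 + 50 would be wrong for small integer types, since the
 // addition can overflow; hence the trip count is precomputed here instead.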
4814
4815 // Start, Stop and Step must be of the same integer type.
4816 auto *IndVarTy = cast<IntegerType>(Start->getType());
4817 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
4818 assert(IndVarTy == Step->getType() && "Step type mismatch");
4819
4820 updateToLocation(Loc);
4821
4822 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
4823 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
4824
4825 // Like Step, but always positive.
4826 Value *Incr = Step;
4827
4828 // Distance between Start and Stop; always positive.
4829 Value *Span;
4830
4831 // Condition checking whether no iterations are executed at all, e.g.
4832 // because UB < LB.
4833 Value *ZeroCmp;
4834
4835 if (IsSigned) {
4836 // Ensure that increment is positive. If not, negate and invert LB and UB.
4837 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
4838 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
4839 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
4840 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
4841 Span = Builder.CreateSub(UB, LB, "", false, true);
4842 ZeroCmp = Builder.CreateICmp(
4843 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
4844 } else {
4845 Span = Builder.CreateSub(Stop, Start, "", true);
4846 ZeroCmp = Builder.CreateICmp(
4847 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
4848 }
4849
4850 Value *CountIfLooping;
4851 if (InclusiveStop) {
4852 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
4853 } else {
4854 // Avoid incrementing past stop since it could overflow.
4855 Value *CountIfTwo = Builder.CreateAdd(
4856 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
4857 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
4858 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
4859 }
4860
4861 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
4862 "omp_" + Name + ".tripcount");
4863}
4864
4865Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
4866 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
4867 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
4868 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
4869 ScanInfo *ScanRedInfo) {
4870 LocationDescription ComputeLoc =
4871 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
4872
4873 Value *TripCount = calculateCanonicalLoopTripCount(
4874 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
4875
4876 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
4877 Builder.restoreIP(CodeGenIP);
4878 Value *Span = Builder.CreateMul(IV, Step);
4879 Value *IndVar = Builder.CreateAdd(Span, Start);
4880 if (InScan)
4881 ScanRedInfo->IV = IndVar;
4882 return BodyGenCB(Builder.saveIP(), IndVar);
4883 };
4884 LocationDescription LoopLoc =
4885 ComputeIP.isSet()
4886 ? Loc
4887 : LocationDescription(Builder.saveIP(),
4888 Builder.getCurrentDebugLocation());
4889 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
4890}
4891
4892// Returns an LLVM function to call for initializing loop bounds using OpenMP
4893// static scheduling for composite `distribute parallel for` depending on
4894// `type`. Only i32 and i64 are supported by the runtime. Always interpret
4895// integers as unsigned similarly to CanonicalLoopInfo.
4896static FunctionCallee
4897getKmpcDistForStaticInitForType(Type *Ty, Module &M,
4898 OpenMPIRBuilder &OMPBuilder) {
4899 unsigned Bitwidth = Ty->getIntegerBitWidth();
4900 if (Bitwidth == 32)
4901 return OMPBuilder.getOrCreateRuntimeFunction(
4902 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
4903 if (Bitwidth == 64)
4904 return OMPBuilder.getOrCreateRuntimeFunction(
4905 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
4906 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4907}
4908
4909// Returns an LLVM function to call for initializing loop bounds using OpenMP
4910// static scheduling depending on `type`. Only i32 and i64 are supported by the
4911// runtime. Always interpret integers as unsigned similarly to
4912// CanonicalLoopInfo.
4913static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
4914 OpenMPIRBuilder &OMPBuilder) {
4915 unsigned Bitwidth = Ty->getIntegerBitWidth();
4916 if (Bitwidth == 32)
4917 return OMPBuilder.getOrCreateRuntimeFunction(
4918 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
4919 if (Bitwidth == 64)
4920 return OMPBuilder.getOrCreateRuntimeFunction(
4921 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
4922 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
4923}
4924
4925OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
4926 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
4927 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
4928 OMPScheduleType DistScheduleSchedType) {
4929 assert(CLI->isValid() && "Requires a valid canonical loop");
4930 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
4931 "Require dedicated allocate IP");
4932
4933 // Set up the source location value for OpenMP runtime.
4934 Builder.restoreIP(CLI->getPreheaderIP());
4935 Builder.SetCurrentDebugLocation(DL);
4936
4937 uint32_t SrcLocStrSize;
4938 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
4939 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4940
4941 // Declare useful OpenMP runtime functions.
4942 Value *IV = CLI->getIndVar();
4943 Type *IVTy = IV->getType();
4944 FunctionCallee StaticInit =
4945 LoopType == WorksharingLoopType::DistributeForStaticLoop
4946 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
4947 : getKmpcForStaticInitForType(IVTy, M, *this);
4948 FunctionCallee StaticFini =
4949 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
4950
4951 // Allocate space for computed loop bounds as expected by the "init" function.
4952 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
4953
4954 Type *I32Type = Type::getInt32Ty(M.getContext());
4955 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
4956 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
4957 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
4958 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
4959 CLI->setLastIter(PLastIter);
4960
4961 // At the end of the preheader, prepare for calling the "init" function by
4962 // storing the current loop bounds into the allocated space. A canonical loop
4963 // always iterates from 0 to trip-count with step 1. Note that "init" expects
4964 // and produces an inclusive upper bound.
4965 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
4966 Constant *Zero = ConstantInt::get(IVTy, 0);
4967 Constant *One = ConstantInt::get(IVTy, 1);
4968 Builder.CreateStore(Zero, PLowerBound);
4969 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
4970 Builder.CreateStore(UpperBound, PUpperBound);
4971 Builder.CreateStore(One, PStride);
4972
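 // The code emitted below follows this runtime contract (a sketch of the
 // semantics, not the exact IR):
 //   lb = 0; ub = tripcount - 1; stride = 1;
 //   __kmpc_for_static_init_{4u,8u}(loc, tid, sched, &last, &lb, &ub,
 //                                  &stride, /*incr=*/1, /*chunk=*/0);
 //   for (iv = lb; iv <= ub; ++iv) body(iv); // thread-local [lb, ub] range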
4973 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
4974
4975 OMPScheduleType SchedType =
4976 (LoopType == WorksharingLoopType::DistributeStaticLoop)
4977 ? OMPScheduleType::OrderedDistribute
4978 : OMPScheduleType::UnorderedStatic;
4979 Constant *SchedulingType =
4980 ConstantInt::get(I32Type, static_cast<int>(SchedType));
4981
4982 // Call the "init" function and update the trip count of the loop with the
4983 // value it produced.
4984 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
4985 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
4986 this](Value *SchedulingType, auto &Builder) {
4987 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
4988 PLowerBound, PUpperBound});
4989 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
4990 Value *PDistUpperBound =
4991 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
4992 Args.push_back(PDistUpperBound);
4993 }
4994 Args.append({PStride, One, Zero});
4995 createRuntimeFunctionCall(StaticInit, Args);
4996 };
4997 BuildInitCall(SchedulingType, Builder);
4998 if (HasDistSchedule &&
4999 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5000 Constant *DistScheduleSchedType = ConstantInt::get(
5001 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5002 // We want to emit a second init function call for the dist_schedule clause
5003 // on the Distribute construct. However, this should only be done if a
5004 // worksharing loop is nested within a Distribute construct.
5005 BuildInitCall(DistScheduleSchedType, Builder);
5006 }
5007 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5008 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5009 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5010 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5011 CLI->setTripCount(TripCount);
5012
5013 // Update all uses of the induction variable except the one in the condition
5014 // block that compares it with the actual upper bound, and the increment in
5015 // the latch block.
5016
5017 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5018 Builder.SetInsertPoint(CLI->getBody(),
5019 CLI->getBody()->getFirstInsertionPt());
5020 Builder.SetCurrentDebugLocation(DL);
5021 return Builder.CreateAdd(OldIV, LowerBound);
5022 });
5023
5024 // In the "exit" block, call the "fini" function.
5025 Builder.SetInsertPoint(CLI->getExit(),
5026 CLI->getExit()->getTerminator()->getIterator());
5027 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5028
5029 // Add the barrier if requested.
5030 if (NeedsBarrier) {
5031 InsertPointOrErrorTy BarrierIP =
5032 createBarrier(LocationDescription(Builder.saveIP(), DL),
5033 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5034 /* CheckCancelFlag */ false);
5035 if (!BarrierIP)
5036 return BarrierIP.takeError();
5037 }
5038
5039 InsertPointTy AfterIP = CLI->getAfterIP();
5040 CLI->invalidate();
5041
5042 return AfterIP;
5043}
5044
5045static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5046 LoopInfo &LI);
5047static void addLoopMetadata(CanonicalLoopInfo *Loop,
5048 ArrayRef<Metadata *> Properties);
5049
5050static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
5051 LLVMContext &Ctx, Loop *Loop,
5052 LoopInfo &LoopInfo,
5053 SmallVector<Metadata *> &LoopMDList) {
5054 SmallSet<BasicBlock *, 8> Reachable;
5055
5056 // Get the basic blocks from the loop in which memref instructions
5057 // can be found.
5058 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
5059 // preferably without running any passes.
5060 for (BasicBlock *Block : Loop->getBlocks()) {
5061 if (Block == CLI->getCond() || Block == CLI->getHeader())
5062 continue;
5063 Reachable.insert(Block);
5064 }
5065
5066 // Add access group metadata to memory-access instructions.
5067 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5068 for (BasicBlock *BB : Reachable)
5069 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5070 // TODO: If the loop already has parallel access metadata, the two lists
5071 // have to be combined.
5072 LoopMDList.push_back(MDNode::get(
5073 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5074}
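// The attached metadata has roughly this shape in the printed IR (a sketch;
// the metadata slot numbers are arbitrary):
//   br i1 %cmp, label %body, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.parallel_accesses", !2}
//   !2 = distinct !{} ; the access group referenced by !llvm.access.group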
5075
5076OpenMPIRBuilder::InsertPointOrErrorTy
5077OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5078 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5079 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5080 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5081 assert(CLI->isValid() && "Requires a valid canonical loop");
5082 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5083
5084 LLVMContext &Ctx = CLI->getFunction()->getContext();
5085 Value *IV = CLI->getIndVar();
5086 Value *OrigTripCount = CLI->getTripCount();
5087 Type *IVTy = IV->getType();
5088 assert(IVTy->getIntegerBitWidth() <= 64 &&
5089 "Max supported tripcount bitwidth is 64 bits");
5090 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5091 : Type::getInt64Ty(Ctx);
5092 Type *I32Type = Type::getInt32Ty(M.getContext());
5093 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5094 Constant *One = ConstantInt::get(InternalIVTy, 1);
5095
5096 Function *F = CLI->getFunction();
5097 FunctionAnalysisManager FAM;
5098 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5099 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5100 LoopAnalysis LIA;
5101 LoopInfo &&LI = LIA.run(*F, FAM);
5102 Loop *L = LI.getLoopFor(CLI->getHeader());
5103 SmallVector<Metadata *> LoopMDList;
5104 if (ChunkSize || DistScheduleChunkSize)
5105 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5106 addLoopMetadata(CLI, LoopMDList);
5107
5108 // Declare useful OpenMP runtime functions.
5109 FunctionCallee StaticInit =
5110 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5111 FunctionCallee StaticFini =
5112 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5113
5114 // Allocate space for computed loop bounds as expected by the "init" function.
5115 Builder.restoreIP(AllocaIP);
5116 Builder.SetCurrentDebugLocation(DL);
5117 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5118 Value *PLowerBound =
5119 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5120 Value *PUpperBound =
5121 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5122 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5123 CLI->setLastIter(PLastIter);
5124
5125 // Set up the source location value for the OpenMP runtime.
5126 Builder.restoreIP(CLI->getPreheaderIP());
5127 Builder.SetCurrentDebugLocation(DL);
5128
5129 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5130 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5131 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5132 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5133 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5134 "distschedulechunksize");
5135 Value *CastedTripCount =
5136 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5137
5138 Constant *SchedulingType =
5139 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5140 Constant *DistSchedulingType =
5141 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5142 Builder.CreateStore(Zero, PLowerBound);
5143 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5144 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5145 Value *UpperBound =
5146 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5147 Builder.CreateStore(UpperBound, PUpperBound);
5148 Builder.CreateStore(One, PStride);
5149
5150 // Call the "init" function and update the trip count of the loop with the
5151 // value it produced.
5152 uint32_t SrcLocStrSize;
5153 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5154 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5155 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5156 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5157 PUpperBound, PStride, One,
5158 this](Value *SchedulingType, Value *ChunkSize,
5159 auto &Builder) {
5160 createRuntimeFunctionCall(
5161 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5162 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5163 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5164 /*pstride=*/PStride, /*incr=*/One,
5165 /*chunk=*/ChunkSize});
5166 };
5167 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5168 if (DistScheduleSchedType != OMPScheduleType::None &&
5169 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5170 SchedType != OMPScheduleType::OrderedDistribute) {
5171 // We want to emit a second init function call for the dist_schedule clause
5172 // on the Distribute construct. However, this should only be done if a
5173 // worksharing loop is nested within a Distribute construct.
5174 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5175 }
5176
5177 // Load values written by the "init" function.
5178 Value *FirstChunkStart =
5179 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5180 Value *FirstChunkStop =
5181 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5182 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5183 Value *ChunkRange =
5184 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5185 Value *NextChunkStride =
5186 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5187
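 // The transformation below builds, roughly, the following loop nest (an
 // illustrative sketch; the inner loop is the original canonical loop with a
 // rewritten trip count and a shifted induction variable):
 //   for (disp = firstchunk.lb; disp < tripcount; disp += dispatch.stride)
 //     for (iv = 0; iv < min(chunk.range, tripcount - disp); ++iv)
 //       body(disp + iv);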
5188 // Create outer "dispatch" loop for enumerating the chunks.
5189 BasicBlock *DispatchEnter = splitBB(Builder, true);
5190 Value *DispatchCounter;
5191
5192 // It is safe to assume this didn't return an error because the callback
5193 // passed into createCanonicalLoop is the only possible error source, and it
5194 // always returns success.
5195 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5196 {Builder.saveIP(), DL},
5197 [&](InsertPointTy BodyIP, Value *Counter) {
5198 DispatchCounter = Counter;
5199 return Error::success();
5200 },
5201 FirstChunkStart, CastedTripCount, NextChunkStride,
5202 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5203 "dispatch"));
5204
5205 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5206 // not have to preserve the canonical invariant.
5207 BasicBlock *DispatchBody = DispatchCLI->getBody();
5208 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5209 BasicBlock *DispatchExit = DispatchCLI->getExit();
5210 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5211 DispatchCLI->invalidate();
5212
5213 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5214 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5215 redirectTo(CLI->getExit(), DispatchLatch, DL);
5216 redirectTo(DispatchBody, DispatchEnter, DL);
5217
5218 // Prepare the prolog of the chunk loop.
5219 Builder.restoreIP(CLI->getPreheaderIP());
5220 Builder.SetCurrentDebugLocation(DL);
5221
5222 // Compute the number of iterations of the chunk loop.
5223 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5224 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5225 Value *IsLastChunk =
5226 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5227 Value *CountUntilOrigTripCount =
5228 Builder.CreateSub(CastedTripCount, DispatchCounter);
5229 Value *ChunkTripCount = Builder.CreateSelect(
5230 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5231 Value *BackcastedChunkTC =
5232 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5233 CLI->setTripCount(BackcastedChunkTC);
5234
5235 // Update all uses of the induction variable except the one in the condition
5236 // block that compares it with the actual upper bound, and the increment in
5237 // the latch block.
5238 Value *BackcastedDispatchCounter =
5239 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5240 CLI->mapIndVar([&](Instruction *) -> Value * {
5241 Builder.restoreIP(CLI->getBodyIP());
5242 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5243 });
5244
5245 // In the "exit" block, call the "fini" function.
5246 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5247 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5248
5249 // Add the barrier if requested.
5250 if (NeedsBarrier) {
5251 InsertPointOrErrorTy AfterIP =
5252 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5253 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5254 if (!AfterIP)
5255 return AfterIP.takeError();
5256 }
5257
5258#ifndef NDEBUG
5259 // Even though we currently do not support applying additional methods to it,
5260 // the chunk loop should remain a canonical loop.
5261 CLI->assertOK();
5262#endif
5263
5264 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5265}
5266
5267// Returns an LLVM function to call for executing an OpenMP static worksharing
5268// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5269// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5270static FunctionCallee
5271getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
5272 WorksharingLoopType LoopType) {
5273 unsigned Bitwidth = Ty->getIntegerBitWidth();
5274 Module &M = OMPBuilder->M;
5275 switch (LoopType) {
5276 case WorksharingLoopType::ForStaticLoop:
5277 if (Bitwidth == 32)
5278 return OMPBuilder->getOrCreateRuntimeFunction(
5279 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5280 if (Bitwidth == 64)
5281 return OMPBuilder->getOrCreateRuntimeFunction(
5282 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5283 break;
5284 case WorksharingLoopType::DistributeStaticLoop:
5285 if (Bitwidth == 32)
5286 return OMPBuilder->getOrCreateRuntimeFunction(
5287 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5288 if (Bitwidth == 64)
5289 return OMPBuilder->getOrCreateRuntimeFunction(
5290 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5291 break;
5292 case WorksharingLoopType::DistributeForStaticLoop:
5293 if (Bitwidth == 32)
5294 return OMPBuilder->getOrCreateRuntimeFunction(
5295 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5296 if (Bitwidth == 64)
5297 return OMPBuilder->getOrCreateRuntimeFunction(
5298 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5299 break;
5300 }
5301 if (Bitwidth != 32 && Bitwidth != 64) {
5302 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5303 }
5304 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5305}
5306
5307// Inserts a call to the proper OpenMP device RTL function which handles
5308// loop worksharing.
5309static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
5310 WorksharingLoopType LoopType,
5311 BasicBlock *InsertBlock, Value *Ident,
5312 Value *LoopBodyArg, Value *TripCount,
5313 Function &LoopBodyFn, bool NoLoop) {
5314 Type *TripCountTy = TripCount->getType();
5315 Module &M = OMPBuilder->M;
5316 IRBuilder<> &Builder = OMPBuilder->Builder;
5317 FunctionCallee RTLFn =
5318 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5319 SmallVector<Value *, 8> RealArgs;
5320 RealArgs.push_back(Ident);
5321 RealArgs.push_back(&LoopBodyFn);
5322 RealArgs.push_back(LoopBodyArg);
5323 RealArgs.push_back(TripCount);
5324 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5325 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5326 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5327 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5328 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5329 return;
5330 }
5331 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5332 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5333 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5334 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5335
5336 RealArgs.push_back(
5337 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5338 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5339 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5340 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5341 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5342 } else {
5343 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5344 }
5345
5346 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5347}
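// For a 32-bit `for` worksharing loop, the emitted call has roughly this
// shape (a sketch; the value names are descriptive, not taken from output):
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @outlined.body,
//                                        ptr %body.args, i32 %tripcount,
//                                        i32 %nt.cast, i32 0, i8 0)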
5348
5349static void workshareLoopTargetCallback(
5350 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5351 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5352 WorksharingLoopType LoopType, bool NoLoop) {
5353 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5354 BasicBlock *Preheader = CLI->getPreheader();
5355 Value *TripCount = CLI->getTripCount();
5356
5357 // After loop body outlining, the loop body contains only the setup of the
5358 // loop body argument structure and the call to the outlined loop body
5359 // function. First, we need to move the setup of the loop body arguments
5360 // into the loop preheader.
5361 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5362 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5363
5364 // The next step is to remove the whole loop; we do not need it anymore.
5365 // That's why we create an unconditional branch from the loop preheader to
5366 // the loop exit block.
5367 Builder.restoreIP({Preheader, Preheader->end()});
5368 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5369 Preheader->getTerminator()->eraseFromParent();
5370 Builder.CreateBr(CLI->getExit());
5371
5372 // Delete dead loop blocks
5373 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5374 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5375 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5376 CleanUpInfo.EntryBB = CLI->getHeader();
5377 CleanUpInfo.ExitBB = CLI->getExit();
5378 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5379 DeleteDeadBlocks(BlocksToBeRemoved);
5380
5381 // Find the instruction that corresponds to the loop body argument
5382 // structure and remove the call instruction to the loop body function.
5383 Value *LoopBodyArg;
5384 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5385 assert(OutlinedFnUser &&
5386 "Expected unique undroppable user of outlined function");
5387 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5388 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5389 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5390 "Expected outlined function call to be located in loop preheader");
5391 // Check in case no argument structure has been passed.
5392 if (OutlinedFnCallInstruction->arg_size() > 1)
5393 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5394 else
5395 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5396 OutlinedFnCallInstruction->eraseFromParent();
5397
5398 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5399 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5400
5401 for (auto &ToBeDeletedItem : ToBeDeleted)
5402 ToBeDeletedItem->eraseFromParent();
5403 CLI->invalidate();
5404}
5405
5406OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5407 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5408 WorksharingLoopType LoopType, bool NoLoop) {
5409 uint32_t SrcLocStrSize;
5410 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5411 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5412
5413 OutlineInfo OI;
5414 OI.OuterAllocaBB = CLI->getPreheader();
5415 Function *OuterFn = CLI->getPreheader()->getParent();
5416
5417 // Instructions which need to be deleted at the end of code generation.
5418 SmallVector<Instruction *, 4> ToBeDeleted;
5419
5420 OI.OuterAllocaBB = AllocaIP.getBlock();
5421
5422 // Mark the loop body as the region which needs to be extracted.
5423 OI.EntryBB = CLI->getBody();
5424 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5425 "omp.prelatch", true);
5426
5427 // Prepare loop body for extraction
5428 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5429
5430 // Insert a new loop counter variable which will be used only in the loop
5431 // body.
5432 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5433 Instruction *NewLoopCntLoad =
5434 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5435 // The new loop counter instructions are redundant in the loop preheader
5436 // once code generation for the workshare loop is finished. That's why we
5437 // mark them as ready for deletion.
5438 ToBeDeleted.push_back(NewLoopCntLoad);
5439 ToBeDeleted.push_back(NewLoopCnt);
5440
5441 // Analyse loop body region. Find all input variables which are used inside
5442 // loop body region.
5443 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5444 SmallVector<BasicBlock *, 32> Blocks;
5445 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5446
5447 CodeExtractorAnalysisCache CEAC(*OuterFn);
5448 CodeExtractor Extractor(Blocks,
5449 /* DominatorTree */ nullptr,
5450 /* AggregateArgs */ true,
5451 /* BlockFrequencyInfo */ nullptr,
5452 /* BranchProbabilityInfo */ nullptr,
5453 /* AssumptionCache */ nullptr,
5454 /* AllowVarArgs */ true,
5455 /* AllowAlloca */ true,
5456 /* AllocationBlock */ CLI->getPreheader(),
5457 /* Suffix */ ".omp_wsloop",
5458 /* AggrArgsIn0AddrSpace */ true);
5459
5460 BasicBlock *CommonExit = nullptr;
5461 SetVector<Value *> SinkingCands, HoistingCands;
5462
5463 // Find allocas outside the loop body region which are used inside the
5464 // loop body.
5465 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5466
5467 // We need to model the loop body region as the function f(cnt, loop_arg).
5468 // That's why we replace the loop induction variable with the new counter,
5469 // which will be one of the loop body function's arguments.
5470 SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5471 CLI->getIndVar()->user_end());
5472 for (auto Use : Users) {
5473 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5474 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5475 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5476 }
5477 }
5478 }
5479 // Make sure that the loop counter variable is not merged into the loop body
5480 // function's argument structure and that it is passed as a separate variable.
5481 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5482
5483 // The PostOutline CB is invoked once the loop body function has been
5484 // outlined and the loop body has been replaced by a call to the outlined
5485 // function. We need to add a call to the OpenMP device RTL in the loop
5486 // preheader. The OpenMP device RTL function will handle the loop control
5487 // logic.
5488 OI.PostOutlineCB = [=, ToBeDeletedVec =
5489 std::move(ToBeDeleted)](Function &OutlinedFn) {
5490 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5491 LoopType, NoLoop);
5492 };
5493 addOutlineInfo(std::move(OI));
5494 return CLI->getAfterIP();
5495}
5496
5497OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5498 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5499 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5500 bool HasSimdModifier, bool HasMonotonicModifier,
5501 bool HasNonmonotonicModifier, bool HasOrderedClause,
5502 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
5503 Value *DistScheduleChunkSize) {
5504 if (Config.isTargetDevice())
5505 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5506 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5507 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5508 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
5509
5510 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5511 OMPScheduleType::ModifierOrdered;
5512 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
5513 if (HasDistSchedule) {
5514 DistScheduleSchedType = DistScheduleChunkSize
5515 ? OMPScheduleType::OrderedDistributeChunked
5516 : OMPScheduleType::OrderedDistribute;
5517 }
5518 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5519 case OMPScheduleType::BaseStatic:
5520 case OMPScheduleType::BaseDistribute:
5521 assert((!ChunkSize || !DistScheduleChunkSize) &&
5522 "No chunk size with static-chunked schedule");
5523 if (IsOrdered && !HasDistSchedule)
5524 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5525 NeedsBarrier, ChunkSize);
5526 // FIXME: Monotonicity ignored?
5527 if (DistScheduleChunkSize)
5528 return applyStaticChunkedWorkshareLoop(
5529 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5530 DistScheduleChunkSize, DistScheduleSchedType);
5531 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
5532 HasDistSchedule);
5533
5534 case OMPScheduleType::BaseStaticChunked:
5535 case OMPScheduleType::BaseDistributeChunked:
5536 if (IsOrdered && !HasDistSchedule)
5537 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5538 NeedsBarrier, ChunkSize);
5539 // FIXME: Monotonicity ignored?
5540 return applyStaticChunkedWorkshareLoop(
5541 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5542 DistScheduleChunkSize, DistScheduleSchedType);
5543
5544 case OMPScheduleType::BaseRuntime:
5545 case OMPScheduleType::BaseAuto:
5546 case OMPScheduleType::BaseGreedy:
5547 case OMPScheduleType::BaseBalanced:
5548 case OMPScheduleType::BaseSteal:
5549 case OMPScheduleType::BaseGuidedSimd:
5550 case OMPScheduleType::BaseRuntimeSimd:
5551 assert(!ChunkSize &&
5552 "schedule type does not support user-defined chunk sizes");
5553 [[fallthrough]];
5554 case OMPScheduleType::BaseDynamicChunked:
5555 case OMPScheduleType::BaseGuidedChunked:
5556 case OMPScheduleType::BaseGuidedIterativeChunked:
5557 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5558 case OMPScheduleType::BaseStaticBalancedChunked:
5559 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5560 NeedsBarrier, ChunkSize);
5561
5562 default:
5563 llvm_unreachable("Unknown/unimplemented schedule kind");
5564 }
5565}
5566
5567/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5568/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5569/// the runtime. Always interpret integers as unsigned similarly to
5570/// CanonicalLoopInfo.
5571static FunctionCallee
5572getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5573 unsigned Bitwidth = Ty->getIntegerBitWidth();
5574 if (Bitwidth == 32)
5575 return OMPBuilder.getOrCreateRuntimeFunction(
5576 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5577 if (Bitwidth == 64)
5578 return OMPBuilder.getOrCreateRuntimeFunction(
5579 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5580 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5581}
5582
5583/// Returns an LLVM function to call for updating the next loop using OpenMP
5584/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5585/// the runtime. Always interpret integers as unsigned similarly to
5586/// CanonicalLoopInfo.
5587static FunctionCallee
5588getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5589 unsigned Bitwidth = Ty->getIntegerBitWidth();
5590 if (Bitwidth == 32)
5591 return OMPBuilder.getOrCreateRuntimeFunction(
5592 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
5593 if (Bitwidth == 64)
5594 return OMPBuilder.getOrCreateRuntimeFunction(
5595 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
5596 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5597}
5598
5599/// Returns an LLVM function to call for finalizing the dynamic loop,
5600/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
5601/// interpret integers as unsigned similarly to CanonicalLoopInfo.
5602static FunctionCallee
5603getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5604 unsigned Bitwidth = Ty->getIntegerBitWidth();
5605 if (Bitwidth == 32)
5606 return OMPBuilder.getOrCreateRuntimeFunction(
5607 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
5608 if (Bitwidth == 64)
5609 return OMPBuilder.getOrCreateRuntimeFunction(
5610 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
5611 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5612}
5613
5614OpenMPIRBuilder::InsertPointOrErrorTy
5615OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
5616 InsertPointTy AllocaIP,
5617 OMPScheduleType SchedType,
5618 bool NeedsBarrier, Value *Chunk) {
5619 assert(CLI->isValid() && "Requires a valid canonical loop");
5620 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5621 "Require dedicated allocate IP");
5623 "Require valid schedule type");
5624
5625 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
5626 OMPScheduleType::ModifierOrdered;
5627
5628 // Set up the source location value for OpenMP runtime.
5629 Builder.SetCurrentDebugLocation(DL);
5630
5631 uint32_t SrcLocStrSize;
5632 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5633 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5634
5635 // Declare useful OpenMP runtime functions.
5636 Value *IV = CLI->getIndVar();
5637 Type *IVTy = IV->getType();
5638 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
5639 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
5640
5641 // Allocate space for computed loop bounds as expected by the "init" function.
5642 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5643 Type *I32Type = Type::getInt32Ty(M.getContext());
5644 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5645 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5646 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5647 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5648 CLI->setLastIter(PLastIter);
5649
5650 // At the end of the preheader, prepare for calling the "init" function by
5651 // storing the current loop bounds into the allocated space. A canonical loop
5652 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5653 // and produces an inclusive upper bound.
5654 BasicBlock *PreHeader = CLI->getPreheader();
5655 Builder.SetInsertPoint(PreHeader->getTerminator());
5656 Constant *One = ConstantInt::get(IVTy, 1);
5657 Builder.CreateStore(One, PLowerBound);
5658 Value *UpperBound = CLI->getTripCount();
5659 Builder.CreateStore(UpperBound, PUpperBound);
5660 Builder.CreateStore(One, PStride);
5661
5662 BasicBlock *Header = CLI->getHeader();
5663 BasicBlock *Exit = CLI->getExit();
5664 BasicBlock *Cond = CLI->getCond();
5665 BasicBlock *Latch = CLI->getLatch();
5666 InsertPointTy AfterIP = CLI->getAfterIP();
5667
5668 // The CLI will be "broken" in the code below, as the loop is no longer
5669 // a valid canonical loop.
5670
5671 if (!Chunk)
5672 Chunk = One;
5673
5674 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5675
5676 Constant *SchedulingType =
5677 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5678
5679 // Call the "init" function.
5680 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
5681 /* LowerBound */ One, UpperBound,
5682 /* step */ One, Chunk});
5683
5684 // An outer loop around the existing one.
5685 BasicBlock *OuterCond = BasicBlock::Create(
5686 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
5687 PreHeader->getParent());
5688 // The flag returned by the "next" call is always i32, so the zero it is compared against below must be 32-bit and cannot reuse the IVTy constants above.
5689 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
5690 Value *Res = createRuntimeFunctionCall(
5691 DynamicNext,
5692 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
5693 Constant *Zero32 = ConstantInt::get(I32Type, 0);
5694 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
5695 Value *LowerBound =
5696 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
5697 Builder.CreateCondBr(MoreWork, Header, Exit);
5698
5699 // Change PHI-node in loop header to use outer cond rather than preheader,
5700 // and set IV to the LowerBound.
5701 Instruction *Phi = &Header->front();
5702 auto *PI = cast<PHINode>(Phi);
5703 PI->setIncomingBlock(0, OuterCond);
5704 PI->setIncomingValue(0, LowerBound);
5705
5706 // Then set the pre-header to jump to the OuterCond
5707 Instruction *Term = PreHeader->getTerminator();
5708 auto *Br = cast<BranchInst>(Term);
5709 Br->setSuccessor(0, OuterCond);
5710
5711 // Modify the inner condition:
5712 // * Use the UpperBound returned from the DynamicNext call.
5713 // * Jump to the outer loop when done with one of the inner loops.
5714 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
5715 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
5716 Instruction *Comp = &*Builder.GetInsertPoint();
5717 auto *CI = cast<CmpInst>(Comp);
5718 CI->setOperand(1, UpperBound);
5719 // Redirect the inner exit to branch to outer condition.
5720 Instruction *Branch = &Cond->back();
5721 auto *BI = cast<BranchInst>(Branch);
5722 assert(BI->getSuccessor(1) == Exit);
5723 BI->setSuccessor(1, OuterCond);
5724
5725 // Call the "fini" function if "ordered" is present in wsloop directive.
5726 if (Ordered) {
5727 Builder.SetInsertPoint(&Latch->back());
5728 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
5729 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
5730 }
5731
5732 // Add the barrier if requested.
5733 if (NeedsBarrier) {
5734 Builder.SetInsertPoint(&Exit->back());
5735 InsertPointOrErrorTy BarrierIP =
5736 createBarrier(LocationDescription(Builder.saveIP(), DL),
5737 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5738 /* CheckCancelFlag */ false);
5739 if (!BarrierIP)
5740 return BarrierIP.takeError();
5741 }
5742
5743 CLI->invalidate();
5744 return AfterIP;
5745}
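// For orientation, the control structure emitted above corresponds roughly to
// the following C-like sketch (illustrative only; the names are descriptive,
// not the actual IR values, and the _4u entry points stand in for the
// bitwidth-appropriate variants). The runtime hands out inclusive, 1-based
// chunk bounds via "next", and the induction variable is rebased to lb - 1:
//
//   __kmpc_dispatch_init_4u(loc, tid, sched, /*lb=*/1, tripcount, /*st=*/1,
//                           chunk);
//   while (__kmpc_dispatch_next_4u(loc, tid, &last, &lb, &ub, &st)) {
//     for (iv = lb - 1; iv < ub; ++iv) {
//       ...body...
//       // with the ordered modifier: __kmpc_dispatch_fini_4u(loc, tid);
//     }
//   }
//   // if NeedsBarrier: a barrier for OMPD_for is emitted in the exit block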
5746
5747/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
5748/// after this \p OldTarget will be orphaned.
5749static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
5750 BasicBlock *NewTarget, DebugLoc DL) {
5751 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
5752 redirectTo(Pred, NewTarget, DL);
5753}
5754
5755/// Determine which blocks in \p BBs are reachable from outside and remove the
5756/// ones that are not reachable from the function.
5757static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
5758 SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
5759 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
5760 for (Use &U : BB->uses()) {
5761 auto *UseInst = dyn_cast<Instruction>(U.getUser());
5762 if (!UseInst)
5763 continue;
5764 if (BBsToErase.count(UseInst->getParent()))
5765 continue;
5766 return true;
5767 }
5768 return false;
5769 };
5770
5771 while (BBsToErase.remove_if(HasRemainingUses)) {
5772 // Try again if anything was removed.
5773 }
5774
5775 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
5776 DeleteDeadBlocks(BBVec);
5777}
5778
5779CanonicalLoopInfo *
5780OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5781 InsertPointTy ComputeIP) {
5782 assert(Loops.size() >= 1 && "At least one loop required");
5783 size_t NumLoops = Loops.size();
5784
5785 // Nothing to do if there is already just one loop.
5786 if (NumLoops == 1)
5787 return Loops.front();
5788
5789 CanonicalLoopInfo *Outermost = Loops.front();
5790 CanonicalLoopInfo *Innermost = Loops.back();
5791 BasicBlock *OrigPreheader = Outermost->getPreheader();
5792 BasicBlock *OrigAfter = Outermost->getAfter();
5793 Function *F = OrigPreheader->getParent();
5794
5795 // Loop control blocks that may become orphaned later.
5796 SmallVector<BasicBlock *, 12> OldControlBBs;
5797 OldControlBBs.reserve(6 * Loops.size());
5798 for (CanonicalLoopInfo *Loop : Loops)
5799 Loop->collectControlBlocks(OldControlBBs);
5800
5801 // Setup the IRBuilder for inserting the trip count computation.
5802 Builder.SetCurrentDebugLocation(DL);
5803 if (ComputeIP.isSet())
5804 Builder.restoreIP(ComputeIP);
5805 else
5806 Builder.restoreIP(Outermost->getPreheaderIP());
5807
5808 // Derive the collapsed loop's trip count.
5809 // TODO: Find common/largest indvar type.
5810 Value *CollapsedTripCount = nullptr;
5811 for (CanonicalLoopInfo *L : Loops) {
5812 assert(L->isValid() &&
5813 "All loops to collapse must be valid canonical loops");
5814 Value *OrigTripCount = L->getTripCount();
5815 if (!CollapsedTripCount) {
5816 CollapsedTripCount = OrigTripCount;
5817 continue;
5818 }
5819
5820 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
5821 CollapsedTripCount =
5822 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
5823 }
5824
5825 // Create the collapsed loop control flow.
5826 CanonicalLoopInfo *Result =
5827 createLoopSkeleton(DL, CollapsedTripCount, F,
5828 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
5829
5830 // Build the collapsed loop body code.
5831 // Start with deriving the input loop induction variables from the collapsed
5832 // one, using a divmod scheme. To preserve the original loops' order, the
5833 // innermost loop uses the least significant bits.
5834 Builder.restoreIP(Result->getBodyIP());
5835
5836 Value *Leftover = Result->getIndVar();
5837 SmallVector<Value *> NewIndVars;
5838 NewIndVars.resize(NumLoops);
5839 for (int i = NumLoops - 1; i >= 1; --i) {
5840 Value *OrigTripCount = Loops[i]->getTripCount();
5841
5842 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
5843 NewIndVars[i] = NewIndVar;
5844
5845 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
5846 }
5847 // Outermost loop gets all the remaining bits.
5848 NewIndVars[0] = Leftover;
5849
5850 // Construct the loop body control flow.
5851 // We progressively construct the branch structure following the direction of
5852 // the control flow, from the leading in-between code, the loop nest body, the
5853 // trailing in-between code, and rejoining the collapsed loop's latch.
5854 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
5855 // the ContinueBlock is set, continue with that block. If ContinuePred, use
5856 // its predecessors as sources.
5857 BasicBlock *ContinueBlock = Result->getBody();
5858 BasicBlock *ContinuePred = nullptr;
5859 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
5860 BasicBlock *NextSrc) {
5861 if (ContinueBlock)
5862 redirectTo(ContinueBlock, Dest, DL);
5863 else
5864 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
5865
5866 ContinueBlock = nullptr;
5867 ContinuePred = NextSrc;
5868 };
5869
5870 // The code before the nested loop of each level.
5871 // Because we are sinking it into the nest, it will be executed more often
5872 // than in the original loop. More sophisticated schemes could keep track of what
5873 // the in-between code is and instantiate it only once per thread.
5874 for (size_t i = 0; i < NumLoops - 1; ++i)
5875 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
5876
5877 // Connect the loop nest body.
5878 ContinueWith(Innermost->getBody(), Innermost->getLatch());
5879
5880 // The code after the nested loop at each level.
5881 for (size_t i = NumLoops - 1; i > 0; --i)
5882 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
5883
5884 // Connect the finished loop to the collapsed loop latch.
5885 ContinueWith(Result->getLatch(), nullptr);
5886
5887 // Replace the input loops with the new collapsed loop.
5888 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
5889 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
5890
5891 // Replace the input loop indvars with the derived ones.
5892 for (size_t i = 0; i < NumLoops; ++i)
5893 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
5894
5895 // Remove unused parts of the input loops.
5896 removeUnusedBlocksFromParent(OldControlBBs);
5897
5898 for (CanonicalLoopInfo *L : Loops)
5899 L->invalidate();
5900
5901#ifndef NDEBUG
5902 Result->assertOK();
5903#endif
5904 return Result;
5905}
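// Illustrative sketch of the divmod scheme above for a triple nest with trip
// counts TC0, TC1, TC2 (outermost first): the collapsed loop runs
// TC0 * TC1 * TC2 times, and for a collapsed induction variable cv the
// original induction variables are recovered with the innermost one in the
// least significant position:
//
//   i2 = cv % TC2;  t  = cv / TC2;
//   i1 = t  % TC1;  i0 = t  / TC1;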
5906
5907std::vector<CanonicalLoopInfo *>
5908OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
5909 ArrayRef<Value *> TileSizes) {
5910 assert(TileSizes.size() == Loops.size() &&
5911 "Must pass as many tile sizes as there are loops");
5912 int NumLoops = Loops.size();
5913 assert(NumLoops >= 1 && "At least one loop to tile required");
5914
5915 CanonicalLoopInfo *OutermostLoop = Loops.front();
5916 CanonicalLoopInfo *InnermostLoop = Loops.back();
5917 Function *F = OutermostLoop->getBody()->getParent();
5918 BasicBlock *InnerEnter = InnermostLoop->getBody();
5919 BasicBlock *InnerLatch = InnermostLoop->getLatch();
5920
5921 // Loop control blocks that may become orphaned later.
5922 SmallVector<BasicBlock *, 12> OldControlBBs;
5923 OldControlBBs.reserve(6 * Loops.size());
5924 for (CanonicalLoopInfo *Loop : Loops)
5925 Loop->collectControlBlocks(OldControlBBs);
5926
5927 // Collect original trip counts and induction variables to be accessible by
5928 // index. Also, the structure of the original loops is not preserved during
5929 // the construction of the tiled loops, so do it before we scavenge the BBs of
5930 // any original CanonicalLoopInfo.
5931 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
5932 for (CanonicalLoopInfo *L : Loops) {
5933 assert(L->isValid() && "All input loops must be valid canonical loops");
5934 OrigTripCounts.push_back(L->getTripCount());
5935 OrigIndVars.push_back(L->getIndVar());
5936 }
5937
5938 // Collect the code between loop headers. These may contain SSA definitions
5939 // that are used in the loop nest body. To be usable within the innermost
5940 // body, these BasicBlocks will be sunk into the loop nest body. That is,
5941 // these instructions may be executed more often than before the tiling.
5942 // TODO: It would be sufficient to only sink them into body of the
5943 // corresponding tile loop.
5944 SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
5945 for (int i = 0; i < NumLoops - 1; ++i) {
5946 CanonicalLoopInfo *Surrounding = Loops[i];
5947 CanonicalLoopInfo *Nested = Loops[i + 1];
5948
5949 BasicBlock *EnterBB = Surrounding->getBody();
5950 BasicBlock *ExitBB = Nested->getHeader();
5951 InbetweenCode.emplace_back(EnterBB, ExitBB);
5952 }
5953
5954 // Compute the trip counts of the floor loops.
5955 Builder.SetCurrentDebugLocation(DL);
5956 Builder.restoreIP(OutermostLoop->getPreheaderIP());
5957 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
5958 for (int i = 0; i < NumLoops; ++i) {
5959 Value *TileSize = TileSizes[i];
5960 Value *OrigTripCount = OrigTripCounts[i];
5961 Type *IVType = OrigTripCount->getType();
5962
5963 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
5964 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
5965
5966 // 0 if tripcount divides the tilesize, 1 otherwise.
5967 // 1 means we need an additional iteration for a partial tile.
5968 //
5969 // Unfortunately we cannot just use the roundup-formula
5970 // (tripcount + tilesize - 1)/tilesize
5971 // because the summation might overflow. We do not want to introduce undefined
5972 // behavior when the untiled loop nest did not.
5973 Value *FloorTripOverflow =
5974 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
5975
5976 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
5977 Value *FloorTripCount =
5978 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
5979 "omp_floor" + Twine(i) + ".tripcount", true);
5980
5981 // Remember some values for later use.
5982 FloorCompleteCount.push_back(FloorCompleteTripCount);
5983 FloorCount.push_back(FloorTripCount);
5984 FloorRems.push_back(FloorTripRem);
5985 }
5986
5987 // Generate the new loop nest, from the outermost to the innermost.
5988 std::vector<CanonicalLoopInfo *> Result;
5989 Result.reserve(NumLoops * 2);
5990
5991 // The basic block of the surrounding loop that enters the generated
5992 // loop nest.
5993 BasicBlock *Enter = OutermostLoop->getPreheader();
5994
5995 // The basic block of the surrounding loop where the inner code should
5996 // continue.
5997 BasicBlock *Continue = OutermostLoop->getAfter();
5998
5999 // Where the next loop basic block should be inserted.
6000 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6001
6002 auto EmbeddNewLoop =
6003 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6004 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6005 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6006 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6007 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6008 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6009
6010 // Setup the position where the next embedded loop connects to this loop.
6011 Enter = EmbeddedLoop->getBody();
6012 Continue = EmbeddedLoop->getLatch();
6013 OutroInsertBefore = EmbeddedLoop->getLatch();
6014 return EmbeddedLoop;
6015 };
6016
6017 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6018 const Twine &NameBase) {
6019 for (auto P : enumerate(TripCounts)) {
6020 CanonicalLoopInfo *EmbeddedLoop =
6021 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6022 Result.push_back(EmbeddedLoop);
6023 }
6024 };
6025
6026 EmbeddNewLoops(FloorCount, "floor");
6027
6028 // Within the innermost floor loop, emit the code that computes the tile
6029 // sizes.
6030 Builder.SetInsertPoint(Enter->getTerminator());
6031 SmallVector<Value *, 4> TileCounts;
6032 for (int i = 0; i < NumLoops; ++i) {
6033 CanonicalLoopInfo *FloorLoop = Result[i];
6034 Value *TileSize = TileSizes[i];
6035
6036 Value *FloorIsEpilogue =
6037 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6038 Value *TileTripCount =
6039 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6040
6041 TileCounts.push_back(TileTripCount);
6042 }
6043
6044 // Create the tile loops.
6045 EmbeddNewLoops(TileCounts, "tile");
6046
6047 // Insert the inbetween code into the body.
6048 BasicBlock *BodyEnter = Enter;
6049 BasicBlock *BodyEntered = nullptr;
6050 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6051 BasicBlock *EnterBB = P.first;
6052 BasicBlock *ExitBB = P.second;
6053
6054 if (BodyEnter)
6055 redirectTo(BodyEnter, EnterBB, DL);
6056 else
6057 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6058
6059 BodyEnter = nullptr;
6060 BodyEntered = ExitBB;
6061 }
6062
6063 // Append the original loop nest body into the generated loop nest body.
6064 if (BodyEnter)
6065 redirectTo(BodyEnter, InnerEnter, DL);
6066 else
6067 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6069
6070 // Replace the original induction variable with an induction variable computed
6071 // from the tile and floor induction variables.
6072 Builder.restoreIP(Result.back()->getBodyIP());
6073 for (int i = 0; i < NumLoops; ++i) {
6074 CanonicalLoopInfo *FloorLoop = Result[i];
6075 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6076 Value *OrigIndVar = OrigIndVars[i];
6077 Value *Size = TileSizes[i];
6078
6079 Value *Scale =
6080 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6081 Value *Shift =
6082 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6083 OrigIndVar->replaceAllUsesWith(Shift);
6084 }
6085
6086 // Remove unused parts of the original loops.
6087 removeUnusedBlocksFromParent(OldControlBBs);
6088
6089 for (CanonicalLoopInfo *L : Loops)
6090 L->invalidate();
6091
6092#ifndef NDEBUG
6093 for (CanonicalLoopInfo *GenL : Result)
6094 GenL->assertOK();
6095#endif
6096 return Result;
6097}
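// Conceptually, tiling a single canonical loop `for (i = 0; i < TC; ++i)`
// with tile size TS yields the following structure (illustrative pseudocode
// of the floor/tile skeletons generated above):
//
//   FloorTC = TC / TS + (TC % TS != 0);       // overflow-safe round-up
//   for (f = 0; f < FloorTC; ++f) {
//     TileTC = (f == TC / TS) ? TC % TS : TS; // partial epilogue tile
//     for (t = 0; t < TileTC; ++t)
//       body(f * TS + t);                     // reconstructed indvar
//   }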
6098
6099/// Attach metadata \p Properties to the basic block described by \p BB. If the
6100/// basic block already has metadata, the basic block properties are appended.
6101static void addBasicBlockMetadata(BasicBlock *BB,
6102 ArrayRef<Metadata *> Properties) {
6103 // Nothing to do if no property to attach.
6104 if (Properties.empty())
6105 return;
6106
6107 LLVMContext &Ctx = BB->getContext();
6108 SmallVector<Metadata *> NewProperties;
6109 NewProperties.push_back(nullptr);
6110
6111 // If the basic block already has metadata, prepend it to the new metadata.
6112 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6113 if (Existing)
6114 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6115
6116 append_range(NewProperties, Properties);
6117 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6118 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6119
6120 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6121}
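// The node built here is distinct and self-referential, as llvm.loop
// metadata requires; the terminator ends up looking like this (illustrative
// IR):
//
//   br label %header, !llvm.loop !0
//   !0 = distinct !{!0, <existing properties...>, <new properties...>}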
6122
6123/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6124/// loop already has metadata, the loop properties are appended.
6125static void addLoopMetadata(CanonicalLoopInfo *Loop,
6126 ArrayRef<Metadata *> Properties) {
6127 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6128
6129 // Attach metadata to the loop's latch
6130 BasicBlock *Latch = Loop->getLatch();
6131 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6132 addBasicBlockMetadata(Latch, Properties);
6133}
6134
6135/// Attach llvm.access.group metadata to the memref instructions of \p Block
6136static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
6137 LoopInfo &LI) {
6138 for (Instruction &I : *Block) {
6139 if (I.mayReadOrWriteMemory()) {
6140 // TODO: This instruction may already have an access group from
6141 // other pragmas e.g. #pragma clang loop vectorize. Append
6142 // so that the existing metadata is not overwritten.
6143 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6144 }
6145 }
6146}
6147
6148void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
6149 LLVMContext &Ctx = Builder.getContext();
6150 addLoopMetadata(
6151 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6152 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6153}
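// For example, after unrollLoopFull the loop latch carries metadata along
// these lines (illustrative IR); the actual unrolling is performed later by
// the LoopUnrollPass:
//
//   !0 = distinct !{!0, !1, !2}
//   !1 = !{!"llvm.loop.unroll.enable"}
//   !2 = !{!"llvm.loop.unroll.full"}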
6154
6155void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
6156 LLVMContext &Ctx = Builder.getContext();
6157 addLoopMetadata(
6158 Loop, {
6159 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6160 });
6161}
6162
6163void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6164 Value *IfCond, ValueToValueMapTy &VMap,
6165 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6166 const Twine &NamePrefix) {
6167 Function *F = CanonicalLoop->getFunction();
6168
6169 // We can't do
6170 // if (cond) {
6171 // simd_loop;
6172 // } else {
6173 // non_simd_loop;
6174 // }
6175 // because then the CanonicalLoopInfo would only point to one of the loops:
6176 // leading to other constructs operating on the same loop to malfunction.
6177 // Instead generate
6178 // while (...) {
6179 // if (cond) {
6180 // simd_body;
6181 // } else {
6182 // not_simd_body;
6183 // }
6184 // }
6185 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6186 // body at -O3
6187
6188 // Define where if branch should be inserted
6189 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6190
6191 // Create additional blocks for the if statement
6192 BasicBlock *Cond = SplitBeforeIt->getParent();
6193 llvm::LLVMContext &C = Cond->getContext();
6194 BasicBlock *ThenBlock = BasicBlock::Create(
6195 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6196 BasicBlock *ElseBlock = BasicBlock::Create(
6197 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6198
6199 // Create if condition branch.
6200 Builder.SetInsertPoint(SplitBeforeIt);
6201 Instruction *BrInstr =
6202 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6203 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6204 // Then block contains branch to omp loop body which needs to be vectorized
6205 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6206 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6207
6208 Builder.SetInsertPoint(ElseBlock);
6209
6210 // Clone loop for the else branch
6211 SmallVector<BasicBlock *, 8> NewBlocks;
6212
6213 SmallVector<BasicBlock *, 8> ExistingBlocks;
6214 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6215 ExistingBlocks.push_back(ThenBlock);
6216 ExistingBlocks.append(L->block_begin(), L->block_end());
6217 // Cond is the block that has the if clause condition
6218 // LoopCond is omp_loop.cond
6219 // LoopHeader is omp_loop.header
6220 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6221 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6222 assert(LoopCond && LoopHeader && "Invalid loop structure");
6223 for (BasicBlock *Block : ExistingBlocks) {
6224 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6225 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6226 continue;
6227 }
6228 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6229
6230 // Fix the name so it is not omp.if.then.
6231 if (Block == ThenBlock)
6232 NewBB->setName(NamePrefix + ".if.else");
6233
6234 NewBB->moveBefore(CanonicalLoop->getExit());
6235 VMap[Block] = NewBB;
6236 NewBlocks.push_back(NewBB);
6237 }
6238 remapInstructionsInBlocks(NewBlocks, VMap);
6239 Builder.CreateBr(NewBlocks.front());
6240
6241 // The loop latch must have only one predecessor. Currently it is branched to
6242 // from both the 'then' and 'else' branches.
6243 L->getLoopLatch()->splitBasicBlock(
6244 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
6245
6246 // Ensure that the then block is added to the loop so we add the attributes in
6247 // the next step
6248 L->addBasicBlockToLoop(ThenBlock, LI);
6249}
6250
6251unsigned
6252OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
6253 const StringMap<bool> &Features) {
6254 if (TargetTriple.isX86()) {
6255 if (Features.lookup("avx512f"))
6256 return 512;
6257 else if (Features.lookup("avx"))
6258 return 256;
6259 return 128;
6260 }
6261 if (TargetTriple.isPPC())
6262 return 128;
6263 if (TargetTriple.isWasm())
6264 return 128;
6265 return 0;
6266}
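// For example, an x86 function whose "target-features" attribute contains
// "+avx" but not "+avx512f" gets a default simd alignment of 256 bits;
// targets not listed above report 0, meaning no known preference.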
6267
6268void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
6269 MapVector<Value *, Value *> AlignedVars,
6270 Value *IfCond, OrderKind Order,
6271 ConstantInt *Simdlen, ConstantInt *Safelen) {
6272 LLVMContext &Ctx = Builder.getContext();
6273
6274 Function *F = CanonicalLoop->getFunction();
6275
6276 // TODO: We should not rely on pass manager. Currently we use pass manager
6277 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6278 // object. We should have a method which returns all blocks between
6279 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6280 FunctionAnalysisManager FAM;
6281 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6282 FAM.registerPass([]() { return LoopAnalysis(); });
6283 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6284
6285 LoopAnalysis LIA;
6286 LoopInfo &&LI = LIA.run(*F, FAM);
6287
6288 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6289 if (AlignedVars.size()) {
6290 InsertPointTy IP = Builder.saveIP();
6291 for (auto &AlignedItem : AlignedVars) {
6292 Value *AlignedPtr = AlignedItem.first;
6293 Value *Alignment = AlignedItem.second;
6294 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6295 Builder.SetInsertPoint(loadInst->getNextNode());
6296 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6297 Alignment);
6298 }
6299 Builder.restoreIP(IP);
6300 }
6301
6302 if (IfCond) {
6303 ValueToValueMapTy VMap;
6304 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6305 }
6306
6306
6307 SmallSet<BasicBlock *, 8> Reachable;
6308
6309 // Get the basic blocks from the loop in which memref instructions
6310 // can be found.
6311 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
6312 // preferably without running any passes.
6313 for (BasicBlock *Block : L->getBlocks()) {
6314 if (Block == CanonicalLoop->getCond() ||
6315 Block == CanonicalLoop->getHeader())
6316 continue;
6317 Reachable.insert(Block);
6318 }
6319
6320 SmallVector<Metadata *> LoopMDList;
6321
6322 // In presence of finite 'safelen', it may be unsafe to mark all
6323 // the memory instructions parallel, because loop-carried
6324 // dependences of 'safelen' iterations are possible.
6325 // If the order(concurrent) clause is specified, the memory instructions
6326 // are marked parallel even if 'safelen' is finite.
6327 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6328 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6329
6330 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6331 // versions so we can't add the loop attributes in that case.
6332 if (IfCond) {
6333 // we can still add llvm.loop.parallel_access
6334 addLoopMetadata(CanonicalLoop, LoopMDList);
6335 return;
6336 }
6337
6338 // Use the above access group metadata to create loop level
6339 // metadata, which should be distinct for each loop.
6340 ConstantAsMetadata *BoolConst =
6341 ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6342 LoopMDList.push_back(MDNode::get(
6343 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6344
6345 if (Simdlen || Safelen) {
6346 // If both simdlen and safelen clauses are specified, the value of the
6347 // simdlen parameter must be less than or equal to the value of the safelen
6348 // parameter. Therefore, use safelen only in the absence of simdlen.
6349 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6350 LoopMDList.push_back(
6351 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6352 ConstantAsMetadata::get(VectorizeWidth)}));
6353 }
6354
6355 addLoopMetadata(CanonicalLoop, LoopMDList);
6356}
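// E.g. for `#pragma omp simd simdlen(8)` with no if clause and no finite
// safelen, the loop ends up with metadata along these lines (illustrative
// IR; !4 stands for the access group attached to the memory instructions):
//
//   !0 = distinct !{!0, !1, !2, !3}
//   !1 = !{!"llvm.loop.parallel_accesses", !4}
//   !2 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !3 = !{!"llvm.loop.vectorize.width", i32 8}
//
// With a finite safelen and no order(concurrent), the parallel_accesses
// entry is omitted, and safelen supplies the width when simdlen is absent.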
6357
6358/// Create the TargetMachine object to query the backend for optimization
6359/// preferences.
6360///
6361/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6362/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6363/// needed for the LLVM pass pipeline. We use some default options to avoid
6364/// having to pass too many settings from the frontend that probably do not
6365/// matter.
6366///
6367/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6368/// method. If we are going to use TargetMachine for more purposes, especially
6369/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6370/// might be worth requiring front-ends to pass on their TargetMachine,
6371/// or at least cache it between methods. Note that while frontends such as Clang
6372/// have just a single main TargetMachine per translation unit, "target-cpu" and
6373/// "target-features" that determine the TargetMachine are per-function and can
6374/// be overridden using __attribute__((target("OPTIONS"))).
6375static std::unique_ptr<TargetMachine>
6376createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6377 Module *M = F->getParent();
6378
6379 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6380 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6381 const llvm::Triple &Triple = M->getTargetTriple();
6382
6383 std::string Error;
6384 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6385 if (!TheTarget)
6386 return {};
6387
6388 TargetOptions Options;
6389 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6390 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6391 /*CodeModel=*/std::nullopt, OptLevel));
6392}
6393
6394/// Heuristically determine the best-performant unroll factor for \p CLI. This
6395/// depends on the target processor. We are re-using the same heuristics as the
6396/// LoopUnrollPass.
6397static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6398 Function *F = CLI->getFunction();
6399
6400 // Assume the user requests the most aggressive unrolling, even if the rest of
6401 // the code is optimized using a lower setting.
6402 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6403 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6404
6405 FunctionAnalysisManager FAM;
6406 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6407 FAM.registerPass([]() { return AssumptionAnalysis(); });
6408 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6409 FAM.registerPass([]() { return LoopAnalysis(); });
6410 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6411 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6412 TargetIRAnalysis TIRA;
6413 if (TM)
6414 TIRA = TargetIRAnalysis(
6415 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6416 FAM.registerPass([&]() { return TIRA; });
6417
6418 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6419 ScalarEvolutionAnalysis SEA;
6420 ScalarEvolution &&SE = SEA.run(*F, FAM);
6421 DominatorTreeAnalysis DTA;
6422 DominatorTree &&DT = DTA.run(*F, FAM);
6423 LoopAnalysis LIA;
6424 LoopInfo &&LI = LIA.run(*F, FAM);
6425 AssumptionAnalysis ACT;
6426 AssumptionCache &&AC = ACT.run(*F, FAM);
6427 OptimizationRemarkEmitter ORE{F};
6428
6429 Loop *L = LI.getLoopFor(CLI->getHeader());
6430 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6431
6432 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6433 L, SE, TTI,
6434 /*BlockFrequencyInfo=*/nullptr,
6435 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6436 /*UserThreshold=*/std::nullopt,
6437 /*UserCount=*/std::nullopt,
6438 /*UserAllowPartial=*/true,
6439 /*UserAllowRuntime=*/true,
6440 /*UserUpperBound=*/std::nullopt,
6441 /*UserFullUnrollMaxCount=*/std::nullopt);
6442
6443 UP.Force = true;
6444
6445 // Account for additional optimizations taking place before the LoopUnrollPass
6446 // would unroll the loop.
6447 UP.Threshold *= UnrollThresholdFactor;
6448 UP.PartialThreshold *= UnrollThresholdFactor;
6449
6450 // Use normal unroll factors even if the rest of the code is optimized for
6451 // size.
6452 UP.OptSizeThreshold = UP.Threshold;
6453 UP.PartialOptSizeThreshold = UP.PartialThreshold;
6454
6455 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6456 << " Threshold=" << UP.Threshold << "\n"
6457 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6458 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6459 << " PartialOptSizeThreshold="
6460 << UP.PartialOptSizeThreshold << "\n");
6461
6462 // Disable peeling.
6463 TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
6464 L, SE, TTI,
6465 /*UserAllowPeeling=*/false,
6466 /*UserAllowProfileBasedPeeling=*/false,
6467 /*UnrollingSpecficValues=*/false);
6469
6470 SmallPtrSet<const Value *, 32> EphValues;
6470 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6471
6472 // Assume that reads and writes to stack variables can be eliminated by
6473 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6474 // size.
6475 for (BasicBlock *BB : L->blocks()) {
6476 for (Instruction &I : *BB) {
6477 Value *Ptr;
6478 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6479 Ptr = Load->getPointerOperand();
6480 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6481 Ptr = Store->getPointerOperand();
6482 } else
6483 continue;
6484
6485 Ptr = Ptr->stripPointerCasts();
6486
6487 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6488 if (Alloca->getParent() == &F->getEntryBlock())
6489 EphValues.insert(&I);
6490 }
6491 }
6492 }
6493
6494 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6495
6496 // Loop is not unrollable if the loop contains certain instructions.
6497 if (!UCE.canUnroll()) {
6498 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6499 return 1;
6500 }
6501
6502 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6503 << "\n");
6504
6505 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6506 // be able to use it.
6507 int TripCount = 0;
6508 int MaxTripCount = 0;
6509 bool MaxOrZero = false;
6510 unsigned TripMultiple = 0;
6511
6512 bool UseUpperBound = false;
6513 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6514 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6515 UseUpperBound);
6516 unsigned Factor = UP.Count;
6517 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6518
6519 // This function returns 1 to signal that the loop should not be unrolled.
6520 if (Factor == 0)
6521 return 1;
6522 return Factor;
6523}
6524
6525void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6526 int32_t Factor,
6527 CanonicalLoopInfo **UnrolledCLI) {
6528 assert(Factor >= 0 && "Unroll factor must not be negative");
6529
6530 Function *F = Loop->getFunction();
6531 LLVMContext &Ctx = F->getContext();
6532
6533 // If the unrolled loop is not used for another loop-associated directive, it
6534 // is sufficient to add metadata for the LoopUnrollPass.
6535 if (!UnrolledCLI) {
6536 SmallVector<Metadata *, 2> LoopMetadata;
6537 LoopMetadata.push_back(
6538 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6539
6540 if (Factor >= 1) {
6541 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6542 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6543 LoopMetadata.push_back(MDNode::get(
6544 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6545 }
6546
6547 addLoopMetadata(Loop, LoopMetadata);
6548 return;
6549 }
6550
6551 // Heuristically determine the unroll factor.
6552 if (Factor == 0)
6553 Factor = computeHeuristicUnrollFactor(Loop);
6554
6555 // No change required with unroll factor 1.
6556 if (Factor == 1) {
6557 *UnrolledCLI = Loop;
6558 return;
6559 }
6560
6561 assert(Factor >= 2 &&
6562 "unrolling only makes sense with a factor of 2 or larger");
6563
6564 Type *IndVarTy = Loop->getIndVarType();
6565
6566 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6567 // unroll the inner loop.
6568 Value *FactorVal =
6569 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6570 /*isSigned=*/false));
6571 std::vector<CanonicalLoopInfo *> LoopNest =
6572 tileLoops(DL, {Loop}, {FactorVal});
6573 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6574 *UnrolledCLI = LoopNest[0];
6575 CanonicalLoopInfo *InnerLoop = LoopNest[1];
6576
6577 // LoopUnrollPass can only fully unroll loops with constant trip count.
6578 // Unroll by the unroll factor with a fallback epilog for the remainder
6579 // iterations if necessary.
6580 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6581 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6582 addLoopMetadata(
6583 InnerLoop,
6584 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6585 MDNode::get(
6586 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6587
6588#ifndef NDEBUG
6589 (*UnrolledCLI)->assertOK();
6590#endif
6591}
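// Illustrative use (hypothetical variables): partially unroll a canonical
// loop by 4 while keeping a handle for further loop-associated directives:
//
//   CanonicalLoopInfo *Unrolled = nullptr;
//   OMPBuilder.unrollLoopPartial(DL, CLI, /*Factor=*/4, &Unrolled);
//   // Unrolled is the outer floor loop; the inner tile loop carries
//   // llvm.loop.unroll.count 4 for the LoopUnrollPass to consume.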
6592
6593OpenMPIRBuilder::InsertPointTy
6594OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
6595 llvm::Value *BufSize, llvm::Value *CpyBuf,
6596 llvm::Value *CpyFn, llvm::Value *DidIt) {
6597 if (!updateToLocation(Loc))
6598 return Loc.IP;
6599
6600 uint32_t SrcLocStrSize;
6601 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6602 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6603 Value *ThreadId = getOrCreateThreadID(Ident);
6604
6605 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
6606
6607 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
6608
6609 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
6610 createRuntimeFunctionCall(Fn, Args);
6611
6612 return Builder.saveIP();
6613}
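// Sketch of the emitted call (illustrative IR): CpyFn receives destination
// and source data pointers and copies the copyprivate variables, and the
// loaded DidIt value tells the runtime whether the caller was the thread
// that executed the single region:
//
//   %did.it = load i32, ptr %p.didit
//   call void @__kmpc_copyprivate(ptr %ident, i32 %tid, i64 %buf.size,
//                                 ptr %cpy.buf, ptr %cpy.fn, i32 %did.it)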
6614
6615OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
6616 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6617 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
6618 ArrayRef<llvm::Function *> CPFuncs) {
6619
6620 if (!updateToLocation(Loc))
6621 return Loc.IP;
6622
6623 // If needed allocate and initialize `DidIt` with 0.
6624 // DidIt: flag variable: 1=single thread; 0=not single thread.
6625 llvm::Value *DidIt = nullptr;
6626 if (!CPVars.empty()) {
6627 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
6628 Builder.CreateStore(Builder.getInt32(0), DidIt);
6629 }
6630
6631 Directive OMPD = Directive::OMPD_single;
6632 uint32_t SrcLocStrSize;
6633 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6634 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6635 Value *ThreadId = getOrCreateThreadID(Ident);
6636 Value *Args[] = {Ident, ThreadId};
6637
6638 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
6639 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
6640
6641 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
6642 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6643
6644 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
6645 if (Error Err = FiniCB(IP))
6646 return Err;
6647
6648 // The thread that executes the single region must set `DidIt` to 1.
6649 // This is used by __kmpc_copyprivate, to know if the caller is the
6650 // single thread or not.
6651 if (DidIt)
6652 Builder.CreateStore(Builder.getInt32(1), DidIt);
6653
6654 return Error::success();
6655 };
6656
6657 // generates the following:
6658 // if (__kmpc_single()) {
6659 // .... single region ...
6660 // __kmpc_end_single
6661 // }
6662 // __kmpc_copyprivate
6663 // __kmpc_barrier
6664
6665 InsertPointOrErrorTy AfterIP =
6666 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
6667 /*Conditional*/ true,
6668 /*hasFinalize*/ true);
6669 if (!AfterIP)
6670 return AfterIP.takeError();
6671
6672 if (DidIt) {
6673 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
6674 // NOTE BufSize is currently unused, so just pass 0.
6675 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
6676 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
6677 CPFuncs[I], DidIt);
6678 // NOTE __kmpc_copyprivate already inserts a barrier
6679 } else if (!IsNowait) {
6680 InsertPointOrErrorTy AfterIP =
6681 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
6682 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
6683 /* CheckCancelFlag */ false);
6684 if (!AfterIP)
6685 return AfterIP.takeError();
6686 }
6687 return Builder.saveIP();
6688}
6689
6690OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
6691 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6692 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
6693
6694 if (!updateToLocation(Loc))
6695 return Loc.IP;
6696
6697 Directive OMPD = Directive::OMPD_critical;
6698 uint32_t SrcLocStrSize;
6699 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6700 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6701 Value *ThreadId = getOrCreateThreadID(Ident);
6702 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
6703 Value *Args[] = {Ident, ThreadId, LockVar};
6704
6705 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
6706 Function *RTFn = nullptr;
6707 if (HintInst) {
6708 // Add Hint to entry Args and create call
6709 EnterArgs.push_back(HintInst);
6710 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
6711 } else {
6712 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
6713 }
6714 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
6715
6716 Function *ExitRTLFn =
6717 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
6718 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6719
6720 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6721 /*Conditional*/ false, /*hasFinalize*/ true);
6722}
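// Sketch of the emitted region (illustrative IR; the lock is a module-level
// variable derived from CriticalName by getOMPCriticalRegionLock):
//
//   call void @__kmpc_critical(ptr %ident, i32 %tid, ptr @lock)
//   ...region body...
//   call void @__kmpc_end_critical(ptr %ident, i32 %tid, ptr @lock)
//
// With a hint expression, entry instead goes through
// __kmpc_critical_with_hint with the hint as an extra trailing argument.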
6723
6724OpenMPIRBuilder::InsertPointTy
6725OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
6726 InsertPointTy AllocaIP, unsigned NumLoops,
6727 ArrayRef<llvm::Value *> StoreValues,
6728 const Twine &Name, bool IsDependSource) {
6729 assert(
6730 llvm::all_of(StoreValues,
6731 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
6732 "OpenMP runtime requires depend vec with i64 type");
6733
6734 if (!updateToLocation(Loc))
6735 return Loc.IP;
6736
6737 // Allocate space for vector and generate alloc instruction.
6738 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
6739 Builder.restoreIP(AllocaIP);
6740 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
6741 ArgsBase->setAlignment(Align(8));
6742 updateToLocation(Loc);
6743
6744 // Store the index value with offset in depend vector.
6745 for (unsigned I = 0; I < NumLoops; ++I) {
6746 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
6747 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
6748 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
6749 STInst->setAlignment(Align(8));
6750 }
6751
6752 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
6753 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
6754
6755 uint32_t SrcLocStrSize;
6756 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6757 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6758 Value *ThreadId = getOrCreateThreadID(Ident);
6759 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
6760
6761 Function *RTLFn = nullptr;
6762 if (IsDependSource)
6763 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
6764 else
6765 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
6766 createRuntimeFunctionCall(RTLFn, Args);
6767
6768 return Builder.saveIP();
6769}
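// For example, `ordered depend(source)` in a doacross nest with two loop
// dimensions emits roughly (illustrative IR; the stores target the two
// elements of the depend vector):
//
//   %vec = alloca [2 x i64], align 8
//   store i64 %iv0, ptr %vec.gep.0
//   store i64 %iv1, ptr %vec.gep.1
//   call void @__kmpc_doacross_post(ptr %ident, i32 %tid, ptr %vec)
//
// A depend(sink: ...) clause stores the sink iteration vector instead and
// calls __kmpc_doacross_wait.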
6770
6771OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
6772 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
6773 FinalizeCallbackTy FiniCB, bool IsThreads) {
6774 if (!updateToLocation(Loc))
6775 return Loc.IP;
6776
6777 Directive OMPD = Directive::OMPD_ordered;
6778 Instruction *EntryCall = nullptr;
6779 Instruction *ExitCall = nullptr;
6780
6781 if (IsThreads) {
6782 uint32_t SrcLocStrSize;
6783 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6784 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6785 Value *ThreadId = getOrCreateThreadID(Ident);
6786 Value *Args[] = {Ident, ThreadId};
6787
6788 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
6789 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
6790
6791 Function *ExitRTLFn =
6792 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
6793 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
6794 }
6795
6796 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
6797 /*Conditional*/ false, /*hasFinalize*/ true);
6798}
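// For the threads variant the region is bracketed by runtime calls
// (illustrative IR); the simd variant emits the region inline with no
// runtime calls at all:
//
//   call void @__kmpc_ordered(ptr %ident, i32 %tid)
//   ...region body...
//   call void @__kmpc_end_ordered(ptr %ident, i32 %tid)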
6799
6800OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
6801 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
6802 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
6803 bool HasFinalize, bool IsCancellable) {
6804
6805 if (HasFinalize)
6806 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
6807
6808 // Create inlined region's entry and body blocks, in preparation
6809 // for conditional creation
6810 BasicBlock *EntryBB = Builder.GetInsertBlock();
6811 Instruction *SplitPos = EntryBB->getTerminator();
6812 if (!isa_and_nonnull<BranchInst>(SplitPos))
6813 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
6814 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
6815 BasicBlock *FiniBB =
6816 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
6817
6818 Builder.SetInsertPoint(EntryBB->getTerminator());
6819 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
6820
6821 // generate body
6822 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
6823 /* CodeGenIP */ Builder.saveIP()))
6824 return Err;
6825
6826 // emit exit call and do any needed finalization.
6827 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
6828 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
6829 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
6830 "Unexpected control flow graph state!!");
6831 InsertPointOrErrorTy AfterIP =
6832 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
6833 if (!AfterIP)
6834 return AfterIP.takeError();
6835
6836 // If we are skipping the region of a non conditional, remove the exit
6837 // block, and clear the builder's insertion point.
6838 assert(SplitPos->getParent() == ExitBB &&
6839 "Unexpected Insertion point location!");
6840 auto merged = MergeBlockIntoPredecessor(ExitBB);
6841 BasicBlock *ExitPredBB = SplitPos->getParent();
6842 auto InsertBB = merged ? ExitPredBB : ExitBB;
6843 if (!isa_and_nonnull<BranchInst>(SplitPos))
6844 SplitPos->eraseFromParent();
6845 Builder.SetInsertPoint(InsertBB);
6846
6847 return Builder.saveIP();
6848}
6849
6850OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
6851 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
6852 // If there is nothing to do, return the current insertion point.
6853 if (!Conditional || !EntryCall)
6854 return Builder.saveIP();
6855
6856 BasicBlock *EntryBB = Builder.GetInsertBlock();
6857 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
6858 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
6859 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
6860
6861 // Emit thenBB and set the Builder's insertion point there for
6862 // body generation next. Place the block after the current block.
6863 Function *CurFn = EntryBB->getParent();
6864 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
6865
6866 // Move Entry branch to end of ThenBB, and replace with conditional
6867 // branch (If-stmt)
6868 Instruction *EntryBBTI = EntryBB->getTerminator();
6869 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
6870 EntryBBTI->removeFromParent();
6871 Builder.SetInsertPoint(UI);
6872 Builder.Insert(EntryBBTI);
6873 UI->eraseFromParent();
6874 Builder.SetInsertPoint(ThenBB->getTerminator());
6875
6876 // return an insertion point to ExitBB.
6877 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
6878}
6879
6880OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
6881 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
6882 bool HasFinalize) {
6883
6884 Builder.restoreIP(FinIP);
6885
6886 // If there is finalization to do, emit it before the exit call
6887 if (HasFinalize) {
6888 assert(!FinalizationStack.empty() &&
6889 "Unexpected finalization stack state!");
6890
6891 FinalizationInfo Fi = FinalizationStack.pop_back_val();
6892 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
6893
6894 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
6895 return std::move(Err);
6896
6897 // Exit condition: insertion point is before the terminator of the new Fini
6898 // block
6899 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
6900 }
6901
6902 if (!ExitCall)
6903 return Builder.saveIP();
6904
6905 // Place the exit call as the last instruction before the finalization block terminator.
6906 ExitCall->removeFromParent();
6907 Builder.Insert(ExitCall);
6908
6909 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
6910 ExitCall->getIterator());
6911}
6912
6913OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
6914 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
6915 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
6916 if (!IP.isSet())
6917 return IP;
6918
6919 IRBuilder<>::InsertPointGuard IPG(Builder);
6920
6921 // creates the following CFG structure
6922 // OMP_Entry : (MasterAddr != PrivateAddr)?
6923 // F T
6924 // | \
6925 // | copyin.not.master
6926 // | /
6927 // v /
6928 // copyin.not.master.end
6929 // |
6930 // v
6931 // OMP.Entry.Next
6932
6933 BasicBlock *OMP_Entry = IP.getBlock();
6934 Function *CurFn = OMP_Entry->getParent();
6935 BasicBlock *CopyBegin =
6936 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
6937 BasicBlock *CopyEnd = nullptr;
6938
6939 // If the entry block is terminated, split it to preserve the branch to the
6940 // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
6941 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
6942 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
6943 "copyin.not.master.end");
6944 OMP_Entry->getTerminator()->eraseFromParent();
6945 } else {
6946 CopyEnd =
6947 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
6948 }
6949
6950 Builder.SetInsertPoint(OMP_Entry);
6951 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
6952 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
6953 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
6954 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
6955
6956 Builder.SetInsertPoint(CopyBegin);
6957 if (BranchtoEnd)
6958 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
6959
6960 return Builder.saveIP();
6961}
6962
6963CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
6964 Value *Size, Value *Allocator,
6965 std::string Name) {
6966 IRBuilder<>::InsertPointGuard IPG(Builder);
6967 updateToLocation(Loc);
6968
6969 uint32_t SrcLocStrSize;
6970 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6971 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6972 Value *ThreadId = getOrCreateThreadID(Ident);
6973 Value *Args[] = {ThreadId, Size, Allocator};
6974
6975 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
6976
6977 return createRuntimeFunctionCall(Fn, Args, Name);
6978}
6979
6980CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
6981 Value *Addr, Value *Allocator,
6982 std::string Name) {
6983 IRBuilder<>::InsertPointGuard IPG(Builder);
6984 updateToLocation(Loc);
6985
6986 uint32_t SrcLocStrSize;
6987 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
6988 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6989 Value *ThreadId = getOrCreateThreadID(Ident);
6990 Value *Args[] = {ThreadId, Addr, Allocator};
6991 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
6992 return createRuntimeFunctionCall(Fn, Args, Name);
6993}
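// Typical pairing of the two helpers above (illustrative IR):
//
//   %ptr = call ptr @__kmpc_alloc(i32 %tid, i64 %size, ptr %allocator)
//   ...use %ptr...
//   call void @__kmpc_free(i32 %tid, ptr %ptr, ptr %allocator)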
6994
6995CallInst *OpenMPIRBuilder::createOMPInteropInit(
6996 const LocationDescription &Loc, Value *InteropVar,
6997 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
6998 Value *DependenceAddress, bool HaveNowaitClause) {
6999 IRBuilder<>::InsertPointGuard IPG(Builder);
7000 updateToLocation(Loc);
7001
7002 uint32_t SrcLocStrSize;
7003 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7004 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7005 Value *ThreadId = getOrCreateThreadID(Ident);
7006 if (Device == nullptr)
7007 Device = ConstantInt::get(Int32, -1);
7008 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7009 if (NumDependences == nullptr) {
7010 NumDependences = ConstantInt::get(Int32, 0);
7011 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7012 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7013 }
7014 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7015 Value *Args[] = {
7016 Ident, ThreadId, InteropVar, InteropTypeVal,
7017 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7018
7019 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7020
7021 return createRuntimeFunctionCall(Fn, Args);
7022}
7023
7024CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
7025 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7026 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7027 IRBuilder<>::InsertPointGuard IPG(Builder);
7028 updateToLocation(Loc);
7029
7030 uint32_t SrcLocStrSize;
7031 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7032 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7033 Value *ThreadId = getOrCreateThreadID(Ident);
7034 if (Device == nullptr)
7035 Device = ConstantInt::get(Int32, -1);
7036 if (NumDependences == nullptr) {
7037 NumDependences = ConstantInt::get(Int32, 0);
7038 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7039 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7040 }
7041 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7042 Value *Args[] = {
7043 Ident, ThreadId, InteropVar, Device,
7044 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7045
7046 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7047
7048 return createRuntimeFunctionCall(Fn, Args);
7049}
7050
7051CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
7052 Value *InteropVar, Value *Device,
7053 Value *NumDependences,
7054 Value *DependenceAddress,
7055 bool HaveNowaitClause) {
7056 IRBuilder<>::InsertPointGuard IPG(Builder);
7057 updateToLocation(Loc);
7058 uint32_t SrcLocStrSize;
7059 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7060 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7061 Value *ThreadId = getOrCreateThreadID(Ident);
7062 if (Device == nullptr)
7063 Device = ConstantInt::get(Int32, -1);
7064 if (NumDependences == nullptr) {
7065 NumDependences = ConstantInt::get(Int32, 0);
7066 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7067 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7068 }
7069 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7070 Value *Args[] = {
7071 Ident, ThreadId, InteropVar, Device,
7072 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7073
7074 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7075
7076 return createRuntimeFunctionCall(Fn, Args);
7077}
7078
7079CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
7080 const LocationDescription &Loc, llvm::Value *Pointer,
7081 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7082 IRBuilder<>::InsertPointGuard IPG(Builder);
7083 updateToLocation(Loc);
7084
7085 uint32_t SrcLocStrSize;
7086 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7087 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7088 Value *ThreadId = getOrCreateThreadID(Ident);
7089 Constant *ThreadPrivateCache =
7090 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7091 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7092
7093 Function *Fn =
7094 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7095
7096 return createRuntimeFunctionCall(Fn, Args);
7097}
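// Sketch of the emitted call (illustrative IR): the cache is an internal
// module-level variable keyed on Name, and the runtime returns the address
// of the calling thread's private copy of the variable:
//
//   %copy = call ptr @__kmpc_threadprivate_cached(ptr %ident, i32 %tid,
//                                                 ptr %var, i64 %size,
//                                                 ptr @name.cache)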
7098
7099OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
7100 const LocationDescription &Loc,
7101 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
7102 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7103 "expected num_threads and num_teams to be specified");
7104
7105 if (!updateToLocation(Loc))
7106 return Loc.IP;
7107
7108 uint32_t SrcLocStrSize;
7109 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7110 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7111 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
7112 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7113 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7114 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7115 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7116
7117 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7118 Function *Kernel = DebugKernelWrapper;
7119
7120 // We need to strip the debug prefix to get the correct kernel name.
7121 StringRef KernelName = Kernel->getName();
7122 const std::string DebugPrefix = "_debug__";
7123 if (KernelName.ends_with(DebugPrefix)) {
7124 KernelName = KernelName.drop_back(DebugPrefix.length());
7125 Kernel = M.getFunction(KernelName);
7126 assert(Kernel && "Expected the real kernel to exist");
7127 }
7128
7129 // Manifest the launch configuration in the metadata matching the kernel
7130 // environment.
7131 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7132 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7133
7134  // If MaxThreads is not set, select the maximum between the default workgroup
7135 // size and the MinThreads value.
7136 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7137 if (MaxThreadsVal < 0)
7138 MaxThreadsVal = std::max(
7139 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7140
7141 if (MaxThreadsVal > 0)
7142 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7143
7144 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7145 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7146 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7147 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7148 Constant *ReductionDataSize =
7149 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7150 Constant *ReductionBufferLength =
7151 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7152
7153 Function *Fn = getOrCreateRuntimeFunctionPtr(
7154 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7155 const DataLayout &DL = Fn->getDataLayout();
7156
7157 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7158 Constant *DynamicEnvironmentInitializer =
7159 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7160 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7161 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7162 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7163 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7164 DL.getDefaultGlobalsAddressSpace());
7165 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7166
7167 Constant *DynamicEnvironment =
7168 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7169 ? DynamicEnvironmentGV
7170 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7171 DynamicEnvironmentPtr);
7172
7173 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7174 ConfigurationEnvironment, {
7175 UseGenericStateMachineVal,
7176 MayUseNestedParallelismVal,
7177 IsSPMDVal,
7178 MinThreads,
7179 MaxThreads,
7180 MinTeams,
7181 MaxTeams,
7182 ReductionDataSize,
7183 ReductionBufferLength,
7184 });
7185 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7186 KernelEnvironment, {
7187 ConfigurationEnvironmentInitializer,
7188 Ident,
7189 DynamicEnvironment,
7190 });
7191 std::string KernelEnvironmentName =
7192 (KernelName + "_kernel_environment").str();
7193 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7194 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7195 KernelEnvironmentInitializer, KernelEnvironmentName,
7196 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7197 DL.getDefaultGlobalsAddressSpace());
7198 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7199
7200 Constant *KernelEnvironment =
7201 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7202 ? KernelEnvironmentGV
7203 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7204 KernelEnvironmentPtr);
7205 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
7206 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7207 KernelLaunchEnvironment =
7208 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7209 ? KernelLaunchEnvironment
7210 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7211 KernelLaunchEnvParamTy);
7212 CallInst *ThreadKind = createRuntimeFunctionCall(
7213 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7214
7215 Value *ExecUserCode = Builder.CreateICmpEQ(
7216 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7217 "exec_user_code");
7218
7219 // ThreadKind = __kmpc_target_init(...)
7220 // if (ThreadKind == -1)
7221 // user_code
7222 // else
7223 // return;
7224
7225 auto *UI = Builder.CreateUnreachable();
7226 BasicBlock *CheckBB = UI->getParent();
7227 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7228
7229 BasicBlock *WorkerExitBB = BasicBlock::Create(
7230 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7231 Builder.SetInsertPoint(WorkerExitBB);
7232 Builder.CreateRetVoid();
7233
7234 auto *CheckBBTI = CheckBB->getTerminator();
7235 Builder.SetInsertPoint(CheckBBTI);
7236 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7237
7238 CheckBBTI->eraseFromParent();
7239 UI->eraseFromParent();
7240
7241 // Continue in the "user_code" block, see diagram above and in
7242 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7243 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7244}
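// For a kernel named `foo`, the sequence above materializes globals and a
// guarded entry of roughly this shape (illustrative IR; struct names and
// field values depend on the module and on Attrs):
//
//   @foo_dynamic_environment = weak_odr protected global
//       %struct.DynamicEnvironmentTy ...
//   @foo_kernel_environment = weak_odr protected constant
//       %struct.KernelEnvironmentTy { %struct.ConfigurationEnvironmentTy
//       { ... launch configuration ... }, ptr @ident,
//       ptr @foo_dynamic_environment }
//
//   %thread_kind = call i32 @__kmpc_target_init(
//       ptr @foo_kernel_environment, ptr %dyn_ptr)
//   %exec_user_code = icmp eq i32 %thread_kind, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit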
7245
7246void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
7247 int32_t TeamsReductionDataSize,
7248 int32_t TeamsReductionBufferLength) {
7249 if (!updateToLocation(Loc))
7250 return;
7251
7252 Function *Fn = getOrCreateRuntimeFunctionPtr(
7253 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7254
7255 createRuntimeFunctionCall(Fn, {});
7256
7257 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7258 return;
7259
7260 Function *Kernel = Builder.GetInsertBlock()->getParent();
7261 // We need to strip the debug prefix to get the correct kernel name.
7262 StringRef KernelName = Kernel->getName();
7263 const std::string DebugPrefix = "_debug__";
7264 if (KernelName.ends_with(DebugPrefix))
7265 KernelName = KernelName.drop_back(DebugPrefix.length());
7266 auto *KernelEnvironmentGV =
7267 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7268 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7269 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
7270 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7271 KernelEnvironmentInitializer,
7272 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7273 NewInitializer = ConstantFoldInsertValueInstruction(
7274 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7275 {0, 8});
7276 KernelEnvironmentGV->setInitializer(NewInitializer);
7277}
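// The insertvalue indices above address the nested configuration struct:
// {0, N} selects member N of the ConfigurationEnvironment, which is the first
// member of the kernel environment. Given the initializer layout built in
// createTargetInit, index {0, 7} is ReductionDataSize and {0, 8} is
// ReductionBufferLength; e.g., TeamsReductionDataSize = 8 and
// TeamsReductionBufferLength = 1024 rewrite only those two fields.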
7278
7279static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7280 bool Min) {
7281 if (Kernel.hasFnAttribute(Name)) {
7282 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7283 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7284 }
7285 Kernel.addFnAttr(Name, llvm::utostr(Value));
7286}
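// E.g., if the kernel already carries "nvvm.maxntid"="128", a call with
// Value = 256 and Min == true keeps "128", whereas Min == false would raise
// the attribute to "256".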
7287
7288std::pair<int32_t, int32_t>
7289OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
7290 int32_t ThreadLimit =
7291 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7292
7293 if (T.isAMDGPU()) {
7294 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7295 if (!Attr.isValid() || !Attr.isStringAttribute())
7296 return {0, ThreadLimit};
7297 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7298 int32_t LB, UB;
7299 if (!llvm::to_integer(UBStr, UB, 10))
7300 return {0, ThreadLimit};
7301 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7302 if (!llvm::to_integer(LBStr, LB, 10))
7303 return {0, UB};
7304 return {LB, UB};
7305 }
7306
7307 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7308 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7309 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7310 }
7311 return {0, ThreadLimit};
7312}
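// E.g., an AMDGPU kernel with "amdgpu-flat-work-group-size"="1,256" and
// "omp_target_thread_limit"="128" yields {1, 128}: the upper bound is clamped
// by the thread limit while the lower bound is preserved.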
7313
7314void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
7315 Function &Kernel, int32_t LB,
7316 int32_t UB) {
7317 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7318
7319 if (T.isAMDGPU()) {
7320 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7321 llvm::utostr(LB) + "," + llvm::utostr(UB));
7322 return;
7323 }
7324
7325 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7326}
7327
7328std::pair<int32_t, int32_t>
7329OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
7330 // TODO: Read from backend annotations if available.
7331 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7332}
7333
7334void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7335 int32_t LB, int32_t UB) {
7336 if (T.isNVPTX())
7337 if (UB > 0)
7338 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7339 if (T.isAMDGPU())
7340 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7341
7342 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7343}
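// E.g., writeTeamsForKernel(T, K, /*LB=*/4, /*UB=*/16) always records
// "omp_target_num_teams"="4" and additionally emits
// "nvvm.maxclusterrank"="16" on NVPTX or "amdgpu-max-num-workgroups"="4,1,1"
// on AMDGPU.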
7344
7345void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7346 Function *OutlinedFn) {
7347  if (Config.isTargetDevice()) {
7348    OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7349    // TODO: Determine if DSO local can be set to true.
7350    OutlinedFn->setDSOLocal(false);
7351    OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7352    if (T.isAMDGCN())
7353      OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7354    else if (T.isNVPTX())
7355      OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7356    else if (T.isSPIRV())
7357      OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7358  }
7359}
7360
7361Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7362 StringRef EntryFnIDName) {
7363 if (Config.isTargetDevice()) {
7364 assert(OutlinedFn && "The outlined function must exist if embedded");
7365 return OutlinedFn;
7366 }
7367
7368 return new GlobalVariable(
7369 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7370 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7371}
7372
7373Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7374 StringRef EntryFnName) {
7375 if (OutlinedFn)
7376 return OutlinedFn;
7377
7378 assert(!M.getGlobalVariable(EntryFnName, true) &&
7379 "Named kernel already exists?");
7380 return new GlobalVariable(
7381 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7382 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7383}
7384
7385Error OpenMPIRBuilder::emitTargetRegionFunction(
7386 TargetRegionEntryInfo &EntryInfo,
7387 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7388 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7389
7390 SmallString<64> EntryFnName;
7391 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7392
7393 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7394 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7395 if (!CBResult)
7396 return CBResult.takeError();
7397 OutlinedFn = *CBResult;
7398 } else {
7399 OutlinedFn = nullptr;
7400 }
7401
7402 // If this target outline function is not an offload entry, we don't need to
7403  // register it. This may be the case with a false if clause, or if there are
7404 // no OpenMP targets.
7405 if (!IsOffloadEntry)
7406 return Error::success();
7407
7408 std::string EntryFnIDName =
7409 Config.isTargetDevice()
7410 ? std::string(EntryFnName)
7411 : createPlatformSpecificName({EntryFnName, "region_id"});
7412
7413 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7414 EntryFnName, EntryFnIDName);
7415 return Error::success();
7416}
7417
7418Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7419 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7420 StringRef EntryFnName, StringRef EntryFnIDName) {
7421 if (OutlinedFn)
7422 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7423 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7424 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7425 OffloadInfoManager.registerTargetRegionEntryInfo(
7426 EntryInfo, EntryAddr, OutlinedFnID,
7427 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7428 return OutlinedFnID;
7429}
7430
7431OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7432 const LocationDescription &Loc, InsertPointTy AllocaIP,
7433 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7434 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7435 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7436 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7437 BodyGenTy BodyGenType)>
7438 BodyGenCB,
7439 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7440 if (!updateToLocation(Loc))
7441 return InsertPointTy();
7442
7443 Builder.restoreIP(CodeGenIP);
7444 // Disable TargetData CodeGen on Device pass.
7445 if (Config.IsTargetDevice.value_or(false)) {
7446 if (BodyGenCB) {
7447 InsertPointOrErrorTy AfterIP =
7448 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7449 if (!AfterIP)
7450 return AfterIP.takeError();
7451 Builder.restoreIP(*AfterIP);
7452 }
7453 return Builder.saveIP();
7454 }
7455
7456 bool IsStandAlone = !BodyGenCB;
7457 MapInfosTy *MapInfo;
7458 // Generate the code for the opening of the data environment. Capture all the
7459 // arguments of the runtime call by reference because they are used in the
7460 // closing of the region.
7461 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7462 InsertPointTy CodeGenIP) -> Error {
7463 MapInfo = &GenMapInfoCB(Builder.saveIP());
7464 if (Error Err = emitOffloadingArrays(
7465 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7466 /*IsNonContiguous=*/true, DeviceAddrCB))
7467 return Err;
7468
7469 TargetDataRTArgs RTArgs;
7470 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7471
7472 // Emit the number of elements in the offloading arrays.
7473 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7474
7475 // Source location for the ident struct
7476 if (!SrcLocInfo) {
7477 uint32_t SrcLocStrSize;
7478 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7479 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7480 }
7481
7482 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7483 SrcLocInfo, DeviceID,
7484 PointerNum, RTArgs.BasePointersArray,
7485 RTArgs.PointersArray, RTArgs.SizesArray,
7486 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7487 RTArgs.MappersArray};
7488
7489 if (IsStandAlone) {
7490 assert(MapperFunc && "MapperFunc missing for standalone target data");
7491
7492      auto TaskBodyCB = [&](Value *, Value *,
7493                            IRBuilderBase::InsertPoint) -> Error {
7494        if (Info.HasNoWait) {
7495          OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7496                                 llvm::Constant::getNullValue(VoidPtr),
7497                                 llvm::Constant::getNullValue(Int32),
7498                                 llvm::Constant::getNullValue(VoidPtr)});
7499        }
7500
7501 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7502 OffloadingArgs);
7503
7504 if (Info.HasNoWait) {
7505 BasicBlock *OffloadContBlock =
7506 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7507 Function *CurFn = Builder.GetInsertBlock()->getParent();
7508 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7509 Builder.restoreIP(Builder.saveIP());
7510 }
7511 return Error::success();
7512 };
7513
7514 bool RequiresOuterTargetTask = Info.HasNoWait;
7515 if (!RequiresOuterTargetTask)
7516 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7517 /*TargetTaskAllocaIP=*/{}));
7518 else
7519 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7520 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7521 } else {
7522 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7523 omp::OMPRTL___tgt_target_data_begin_mapper);
7524
7525 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
7526
7527 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7528 if (isa<AllocaInst>(DeviceMap.second.second)) {
7529 auto *LI =
7530 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7531 Builder.CreateStore(LI, DeviceMap.second.second);
7532 }
7533 }
7534
7535 // If device pointer privatization is required, emit the body of the
7536 // region here. It will have to be duplicated: with and without
7537 // privatization.
7538 InsertPointOrErrorTy AfterIP =
7539 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7540 if (!AfterIP)
7541 return AfterIP.takeError();
7542 Builder.restoreIP(*AfterIP);
7543 }
7544 return Error::success();
7545 };
7546
7547 // If we need device pointer privatization, we need to emit the body of the
7548 // region with no privatization in the 'else' branch of the conditional.
7549 // Otherwise, we don't have to do anything.
7550 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7551 InsertPointTy CodeGenIP) -> Error {
7552 InsertPointOrErrorTy AfterIP =
7553 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7554 if (!AfterIP)
7555 return AfterIP.takeError();
7556 Builder.restoreIP(*AfterIP);
7557 return Error::success();
7558 };
7559
7560 // Generate code for the closing of the data region.
7561 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7562 TargetDataRTArgs RTArgs;
7563 Info.EmitDebug = !MapInfo->Names.empty();
7564 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7565
7566 // Emit the number of elements in the offloading arrays.
7567 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7568
7569 // Source location for the ident struct
7570 if (!SrcLocInfo) {
7571 uint32_t SrcLocStrSize;
7572 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7573 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7574 }
7575
7576 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7577 PointerNum, RTArgs.BasePointersArray,
7578 RTArgs.PointersArray, RTArgs.SizesArray,
7579 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7580 RTArgs.MappersArray};
7581 Function *EndMapperFunc =
7582 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7583
7584 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
7585 return Error::success();
7586 };
7587
7588 // We don't have to do anything to close the region if the if clause evaluates
7589 // to false.
7590 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7591 return Error::success();
7592 };
7593
7594 Error Err = [&]() -> Error {
7595 if (BodyGenCB) {
7596 Error Err = [&]() {
7597 if (IfCond)
7598 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
7599 return BeginThenGen(AllocaIP, Builder.saveIP());
7600 }();
7601
7602 if (Err)
7603 return Err;
7604
7605 // If we don't require privatization of device pointers, we emit the body
7606 // in between the runtime calls. This avoids duplicating the body code.
7607 InsertPointOrErrorTy AfterIP =
7608 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7609 if (!AfterIP)
7610 return AfterIP.takeError();
7611 restoreIPandDebugLoc(Builder, *AfterIP);
7612
7613 if (IfCond)
7614 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
7615 return EndThenGen(AllocaIP, Builder.saveIP());
7616 }
7617 if (IfCond)
7618 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
7619 return BeginThenGen(AllocaIP, Builder.saveIP());
7620 }();
7621
7622 if (Err)
7623 return Err;
7624
7625 return Builder.saveIP();
7626}
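// Usage shape (illustrative): a standalone construct such as `target enter
// data` passes BodyGenCB == nullptr together with MapperFunc ==
// &omp::OMPRTL___tgt_target_data_begin_mapper, so only the mapper call (or a
// wrapping target task for nowait) is emitted. A `target data` region passes
// a BodyGenCB instead; the begin/end mapper calls are then emitted around the
// (possibly duplicated) body as sketched in the lambdas above.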
7627
7628FunctionCallee
7629OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
7630 bool IsGPUDistribute) {
7631 assert((IVSize == 32 || IVSize == 64) &&
7632 "IV size is not compatible with the omp runtime");
7633  RuntimeFunction Name;
7634  if (IsGPUDistribute)
7635 Name = IVSize == 32
7636 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
7637 : omp::OMPRTL___kmpc_distribute_static_init_4u)
7638 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
7639 : omp::OMPRTL___kmpc_distribute_static_init_8u);
7640 else
7641 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
7642 : omp::OMPRTL___kmpc_for_static_init_4u)
7643 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
7644 : omp::OMPRTL___kmpc_for_static_init_8u);
7645
7646 return getOrCreateRuntimeFunction(M, Name);
7647}
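// E.g., IVSize == 32 with IVSigned == true selects __kmpc_for_static_init_4
// (or __kmpc_distribute_static_init_4 when IsGPUDistribute is set), while
// IVSize == 64 with IVSigned == false selects __kmpc_for_static_init_8u.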
7648
7649FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
7650 bool IVSigned) {
7651 assert((IVSize == 32 || IVSize == 64) &&
7652 "IV size is not compatible with the omp runtime");
7653 RuntimeFunction Name = IVSize == 32
7654 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
7655 : omp::OMPRTL___kmpc_dispatch_init_4u)
7656 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
7657 : omp::OMPRTL___kmpc_dispatch_init_8u);
7658
7659 return getOrCreateRuntimeFunction(M, Name);
7660}
7661
7662FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
7663 bool IVSigned) {
7664 assert((IVSize == 32 || IVSize == 64) &&
7665 "IV size is not compatible with the omp runtime");
7666 RuntimeFunction Name = IVSize == 32
7667 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
7668 : omp::OMPRTL___kmpc_dispatch_next_4u)
7669 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
7670 : omp::OMPRTL___kmpc_dispatch_next_8u);
7671
7672 return getOrCreateRuntimeFunction(M, Name);
7673}
7674
7675FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
7676 bool IVSigned) {
7677 assert((IVSize == 32 || IVSize == 64) &&
7678 "IV size is not compatible with the omp runtime");
7679 RuntimeFunction Name = IVSize == 32
7680 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
7681 : omp::OMPRTL___kmpc_dispatch_fini_4u)
7682 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
7683 : omp::OMPRTL___kmpc_dispatch_fini_8u);
7684
7685 return getOrCreateRuntimeFunction(M, Name);
7686}
7687
7688FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
7689 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
7690}
7691
7692static void FixupDebugInfoForOutlinedFunction(
7693    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
7694 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
7695
7696 DISubprogram *NewSP = Func->getSubprogram();
7697 if (!NewSP)
7698 return;
7699
7700  DenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
7701
7702 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
7703 DILocalVariable *&NewVar = RemappedVariables[OldVar];
7704 // Only use cached variable if the arg number matches. This is important
7705    // so that the DIVariable created for privatized variables is not discarded.
7706 if (NewVar && (arg == NewVar->getArg()))
7707 return NewVar;
7708
7709    NewVar = DILocalVariable::get(
7710        Builder.getContext(), OldVar->getScope(), OldVar->getName(),
7711 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
7712 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
7713 return NewVar;
7714 };
7715
7716 auto UpdateDebugRecord = [&](auto *DR) {
7717 DILocalVariable *OldVar = DR->getVariable();
7718 unsigned ArgNo = 0;
7719 for (auto Loc : DR->location_ops()) {
7720 auto Iter = ValueReplacementMap.find(Loc);
7721 if (Iter != ValueReplacementMap.end()) {
7722 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
7723 ArgNo = std::get<1>(Iter->second) + 1;
7724 }
7725 }
7726 if (ArgNo != 0)
7727 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
7728 };
7729
7730 // The location and scope of variable intrinsics and records still point to
7731 // the parent function of the target region. Update them.
7732 for (Instruction &I : instructions(Func)) {
7733    assert(!isa<DbgInfoIntrinsic>(&I) &&
7734           "Unexpected debug intrinsic");
7735 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
7736 UpdateDebugRecord(&DVR);
7737 }
7738 // An extra argument is passed to the device. Create the debug data for it.
7739 if (OMPBuilder.Config.isTargetDevice()) {
7740 DICompileUnit *CU = NewSP->getUnit();
7741 Module *M = Func->getParent();
7742 DIBuilder DB(*M, true, CU);
7743 DIType *VoidPtrTy =
7744 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
7745 DILocalVariable *Var = DB.createParameterVariable(
7746 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
7747 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
7748 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
7749 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
7750 &(*Func->begin()));
7751 }
7752}
7753
7754static Value *removeASCastIfPresent(Value *V) {
7755  if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
7756 return cast<Operator>(V)->getOperand(0);
7757 return V;
7758}
7759
7760static Expected<Function *> createOutlinedFunction(
7761    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
7762 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
7763 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
7764 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
7765 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
7766 SmallVector<Type *> ParameterTypes;
7767 if (OMPBuilder.Config.isTargetDevice()) {
7768 // Add the "implicit" runtime argument we use to provide launch specific
7769 // information for target devices.
7770 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
7771 ParameterTypes.push_back(Int8PtrTy);
7772
7773 // All parameters to target devices are passed as pointers
7774 // or i64. This assumes 64-bit address spaces/pointers.
7775 for (auto &Arg : Inputs)
7776 ParameterTypes.push_back(Arg->getType()->isPointerTy()
7777 ? Arg->getType()
7778 : Type::getInt64Ty(Builder.getContext()));
7779 } else {
7780 for (auto &Arg : Inputs)
7781 ParameterTypes.push_back(Arg->getType());
7782 }
7783
7784 auto BB = Builder.GetInsertBlock();
7785 auto M = BB->getModule();
7786 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
7787 /*isVarArg*/ false);
7788 auto Func =
7789 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
7790
7791 // Forward target-cpu and target-features function attributes from the
7792 // original function to the new outlined function.
7793 Function *ParentFn = Builder.GetInsertBlock()->getParent();
7794
7795 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
7796 if (TargetCpuAttr.isStringAttribute())
7797 Func->addFnAttr(TargetCpuAttr);
7798
7799 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
7800 if (TargetFeaturesAttr.isStringAttribute())
7801 Func->addFnAttr(TargetFeaturesAttr);
7802
7803 if (OMPBuilder.Config.isTargetDevice()) {
7804 Value *ExecMode =
7805 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
7806 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
7807 }
7808
7809 // Save insert point.
7810 IRBuilder<>::InsertPointGuard IPG(Builder);
7811 // We will generate the entries in the outlined function but the debug
7812 // location may still be pointing to the parent function. Reset it now.
7813 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
7814
7815 // Generate the region into the function.
7816 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
7817 Builder.SetInsertPoint(EntryBB);
7818
7819 // Insert target init call in the device compilation pass.
7820 if (OMPBuilder.Config.isTargetDevice())
7821 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
7822
7823 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
7824
7825 // As we embed the user code in the middle of our target region after we
7826 // generate entry code, we must move what allocas we can into the entry
7827  // block to avoid possibly breaking optimisations for the device.
7828 if (OMPBuilder.Config.isTargetDevice())
7829 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
7830
7831 // Insert target deinit call in the device compilation pass.
7832 BasicBlock *OutlinedBodyBB =
7833 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
7834 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
7835 Builder.saveIP(),
7836 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
7837 if (!AfterIP)
7838 return AfterIP.takeError();
7839 Builder.restoreIP(*AfterIP);
7840 if (OMPBuilder.Config.isTargetDevice())
7841 OMPBuilder.createTargetDeinit(Builder);
7842
7843 // Insert return instruction.
7844 Builder.CreateRetVoid();
7845
7846 // New Alloca IP at entry point of created device function.
7847 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
7848 auto AllocaIP = Builder.saveIP();
7849
7850 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
7851
7852 // Skip the artificial dyn_ptr on the device.
7853 const auto &ArgRange =
7854 OMPBuilder.Config.isTargetDevice()
7855 ? make_range(Func->arg_begin() + 1, Func->arg_end())
7856 : Func->args();
7857
7857
7858  DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
7859
7860 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
7861 // Things like GEP's can come in the form of Constants. Constants and
7862 // ConstantExpr's do not have access to the knowledge of what they're
7863 // contained in, so we must dig a little to find an instruction so we
7864 // can tell if they're used inside of the function we're outlining. We
7865 // also replace the original constant expression with a new instruction
7866    // equivalent, as an instruction allows easy modification in the
7867    // following loop: we now know the constant (instruction) is
7868    // owned by our target function and replaceUsesOfWith can be invoked
7869    // on it (this cannot be done with constants). A brand new one also
7870    // allows us to be cautious, as it is perhaps possible the old expression
7871    // was used inside of the function but also exists and is used externally
7872    // (unlikely by the nature of a Constant, but still).
7873 // NOTE: We cannot remove dead constants that have been rewritten to
7874 // instructions at this stage, we run the risk of breaking later lowering
7875 // by doing so as we could still be in the process of lowering the module
7876 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
7877 // constants we have created rewritten versions of.
7878 if (auto *Const = dyn_cast<Constant>(Input))
7879 convertUsersOfConstantsToInstructions(Const, Func, false);
7880
7881 // Collect users before iterating over them to avoid invalidating the
7882 // iteration in case a user uses Input more than once (e.g. a call
7883 // instruction).
7884 SetVector<User *> Users(Input->users().begin(), Input->users().end());
7885 // Collect all the instructions
7886    for (User *User : make_early_inc_range(Users))
7887      if (auto *Instr = dyn_cast<Instruction>(User))
7888 if (Instr->getFunction() == Func)
7889 Instr->replaceUsesOfWith(Input, InputCopy);
7890 };
7891
7892 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
7893
7894  // Rewrite uses of input values to parameters.
7895 for (auto InArg : zip(Inputs, ArgRange)) {
7896 Value *Input = std::get<0>(InArg);
7897 Argument &Arg = std::get<1>(InArg);
7898 Value *InputCopy = nullptr;
7899
7900 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
7901 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
7902 if (!AfterIP)
7903 return AfterIP.takeError();
7904 Builder.restoreIP(*AfterIP);
7905 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
7906
7907    // In certain cases a Global may be set up for replacement, however, this
7908    // Global may be used in multiple arguments to the kernel, just segmented
7909    // apart, for example, if we have a global array that is sectioned into
7910    // multiple mappings (technically not legal in OpenMP, but there is a case
7911    // in Fortran for Common Blocks where this is necessary), we will end up
7912    // with GEP's into this array inside the kernel that refer to the Global
7913    // but are technically separate arguments to the kernel for all intents and
7914    // purposes. If we have mapped a segment that requires a GEP into the 0-th
7915    // index, it will fold into a reference to the Global; if we then encounter
7916    // this folded GEP during replacement, all of the references to the
7917    // Global in the kernel will be replaced with the argument we have generated
7918    // that corresponds to it, including any other GEP's that refer to the
7919    // Global that may be other arguments. This will invalidate all of the other
7920    // preceding mapped arguments that refer to the same global that may be
7921    // separate segments. To prevent this, we defer global processing until all
7922    // other processing has been performed.
7923    if (isa<GlobalValue>(removeASCastIfPresent(Input)) ||
7924        isa<GlobalVariable>(removeASCastIfPresent(Input))) {
7925      DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
7926 continue;
7927 }
7928
7929    if (isa<ConstantData>(Input))
7930      continue;
7931
7932 ReplaceValue(Input, InputCopy, Func);
7933 }
7934
7935 // Replace all of our deferred Input values, currently just Globals.
7936 for (auto Deferred : DeferredReplacement)
7937 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
7938
7939 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
7940 ValueReplacementMap);
7941 return Func;
7942}
7943/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
7944/// of pointers containing shared data between the parent task and the created
7945/// task.
7946static LoadInst *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
7947 IRBuilderBase &Builder,
7948 Value *TaskWithPrivates,
7949 Type *TaskWithPrivatesTy) {
7950
7951 Type *TaskTy = OMPIRBuilder.Task;
7952 LLVMContext &Ctx = Builder.getContext();
7953 Value *TaskT =
7954 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
7955 Value *Shareds = TaskT;
7956 // TaskWithPrivatesTy can be one of the following
7957 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
7958 // %struct.privates }
7959 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
7960 //
7961 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
7962 // its first member has to be the task descriptor. TaskTy is the type of the
7963 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
7964  // first member of TaskT gives us the pointer to shared data.
7965 if (TaskWithPrivatesTy != TaskTy)
7966 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
7967 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
7968}
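// For the wrapper case this emits roughly (illustrative IR, value names
// assumed):
//
//   %task    = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %gep     = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task, i32 0, i32 0
//   %shareds = load ptr, ptr %gep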
7969/// Create an entry point for a target task. It'll have the following
7970/// signature:
7971/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
7972/// This function is called from emitTargetTask once the
7973/// code to launch the target kernel has been outlined already.
7974/// NumOffloadingArrays is the number of offloading arrays that we need to copy
7975/// into the task structure so that the deferred target task can access this
7976/// data even after the stack frame of the generating task has been rolled
7977/// back. Offloading arrays contain base pointers, pointers, sizes etc
7978/// of the data that the target kernel will access. These in effect are the
7979/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
7980static Function *emitTargetTaskProxyFunction(
7981    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
7982 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
7983 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
7984
7985 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
7986 // This is because PrivatesTy is the type of the structure in which
7987 // we pass the offloading arrays to the deferred target task.
7988 assert((!NumOffloadingArrays || PrivatesTy) &&
7989 "PrivatesTy cannot be nullptr when there are offloadingArrays"
7990 "to privatize");
7991
7992 Module &M = OMPBuilder.M;
7993 // KernelLaunchFunction is the target launch function, i.e.
7994 // the function that sets up kernel arguments and calls
7995 // __tgt_target_kernel to launch the kernel on the device.
7996 //
7997 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
7998
7999 // StaleCI is the CallInst which is the call to the outlined
8000 // target kernel launch function. If there are local live-in values
8001 // that the outlined function uses then these are aggregated into a structure
8002 // which is passed as the second argument. If there are no local live-in
8003 // values or if all values used by the outlined kernel are global variables,
8004 // then there's only one argument, the threadID. So, StaleCI can be
8005 //
8006 // %structArg = alloca { ptr, ptr }, align 8
8007 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8008 // store ptr %20, ptr %gep_, align 8
8009 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8010 // store ptr %21, ptr %gep_8, align 8
8011 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8012 //
8013 // OR
8014 //
8015 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8016 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
8017 StaleCI->getIterator());
8018
8019 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8020
8021 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8022 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8023 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8024
8025 auto ProxyFnTy =
8026 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8027 /* isVarArg */ false);
8028 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8029 ".omp_target_task_proxy_func",
8030 Builder.GetInsertBlock()->getModule());
8031 Value *ThreadId = ProxyFn->getArg(0);
8032 Value *TaskWithPrivates = ProxyFn->getArg(1);
8033 ThreadId->setName("thread.id");
8034 TaskWithPrivates->setName("task");
8035
8036 bool HasShareds = SharedArgsOperandNo > 0;
8037 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8038 BasicBlock *EntryBB =
8039 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8040 Builder.SetInsertPoint(EntryBB);
8041
8042 SmallVector<Value *> KernelLaunchArgs;
8043 KernelLaunchArgs.reserve(StaleCI->arg_size());
8044 KernelLaunchArgs.push_back(ThreadId);
8045
8046 if (HasOffloadingArrays) {
8047 assert(TaskTy != TaskWithPrivatesTy &&
8048 "If there are offloading arrays to pass to the target"
8049 "TaskTy cannot be the same as TaskWithPrivatesTy");
8050 (void)TaskTy;
8051 Value *Privates =
8052 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8053 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8054 KernelLaunchArgs.push_back(
8055 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8056 }
8057
8058 if (HasShareds) {
8059 auto *ArgStructAlloca =
8060 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8061 assert(ArgStructAlloca &&
8062 "Unable to find the alloca instruction corresponding to arguments "
8063 "for extracted function");
8064 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8065
8066 AllocaInst *NewArgStructAlloca =
8067 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8068
8069 Value *SharedsSize =
8070 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8071
8072    LoadInst *LoadShared = loadSharedDataFromTaskDescriptor(
8073        OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8074
8075 Builder.CreateMemCpy(
8076 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8077 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8078 KernelLaunchArgs.push_back(NewArgStructAlloca);
8079 }
8080 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8081 Builder.CreateRetVoid();
8082 return ProxyFn;
8083}
8084static Type *getOffloadingArrayType(Value *V) {
8085
8086 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8087 return GEP->getSourceElementType();
8088 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8089 return Alloca->getAllocatedType();
8090
8091 llvm_unreachable("Unhandled Instruction type");
8092 return nullptr;
8093}
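// E.g., `%.offload_sizes = alloca [3 x i64]` yields [3 x i64], and a GEP
// whose source element type is [3 x ptr] yields [3 x ptr]; any other defining
// instruction is unexpected here.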
8094// This function returns a struct that has at most two members.
8095// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8096// descriptor. The second member, if needed, is a struct containing arrays
8097// that need to be passed to the offloaded target kernel. For example,
8098// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8099// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8100// respectively, then the types created by this function are
8101//
8102// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8103// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8104// %struct.privates }
8105// %struct.task_with_privates is returned by this function.
8106// If there aren't any offloading arrays to pass to the target kernel,
8107// %struct.kmp_task_ompbuilder_t is returned.
8108static StructType *
8109createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
8110 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8111
8112 if (OffloadingArraysToPrivatize.empty())
8113 return OMPIRBuilder.Task;
8114
8115 SmallVector<Type *, 4> StructFieldTypes;
8116 for (Value *V : OffloadingArraysToPrivatize) {
8117 assert(V->getType()->isPointerTy() &&
8118 "Expected pointer to array to privatize. Got a non-pointer value "
8119 "instead");
8120 Type *ArrayTy = getOffloadingArrayType(V);
8121 assert(ArrayTy && "ArrayType cannot be nullptr");
8122 StructFieldTypes.push_back(ArrayTy);
8123 }
8124 StructType *PrivatesStructTy =
8125 StructType::create(StructFieldTypes, "struct.privates");
8126 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8127 "struct.task_with_privates");
8128}
8129static Error emitTargetOutlinedFunction(
8130    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8131 TargetRegionEntryInfo &EntryInfo,
8132 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8133 Function *&OutlinedFn, Constant *&OutlinedFnID,
8134    SmallVectorImpl<Value *> &Inputs,
8135    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8136 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8137
8138 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8139 [&](StringRef EntryFnName) {
8140 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8141 EntryFnName, Inputs, CBFunc,
8142 ArgAccessorFuncCB);
8143 };
8144
8145 return OMPBuilder.emitTargetRegionFunction(
8146 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8147 OutlinedFnID);
8148}
8149
8150OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
8151 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8152 OpenMPIRBuilder::InsertPointTy AllocaIP,
8153    const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8154    const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8155
8156 // The following explains the code-gen scenario for the `target` directive. A
8157  // similar scenario is followed for other device-related directives (e.g.
8158  // `target enter data`), but in a simpler fashion, since we only need to
8159  // emit a task that encapsulates the proper runtime call.
8160 //
8161 // When we arrive at this function, the target region itself has been
8162 // outlined into the function OutlinedFn.
8163  // So at this point, for
8164 // --------------------------------------------------------------
8165 // void user_code_that_offloads(...) {
8166 // omp target depend(..) map(from:a) map(to:b) private(i)
8167 // do i = 1, 10
8168 // a(i) = b(i) + n
8169 // }
8170 //
8171 // --------------------------------------------------------------
8172 //
8173 // we have
8174 //
8175 // --------------------------------------------------------------
8176 //
8177 // void user_code_that_offloads(...) {
8178 // %.offload_baseptrs = alloca [2 x ptr], align 8
8179 // %.offload_ptrs = alloca [2 x ptr], align 8
8180 // %.offload_mappers = alloca [2 x ptr], align 8
8181 // ;; target region has been outlined and now we need to
8182 // ;; offload to it via a target task.
8183 // }
8184 // void outlined_device_function(ptr a, ptr b, ptr n) {
8185 // n = *n_ptr;
8186 // do i = 1, 10
8187 // a(i) = b(i) + n
8188 // }
8189 //
8190 // We have to now do the following
8191 // (i) Make an offloading call to outlined_device_function using the OpenMP
8192 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8193 // emitted by emitKernelLaunch
8194 // (ii) Create a task entry point function that calls kernel_launch_function
8195 // and is the entry point for the target task. See
8196 // '@.omp_target_task_proxy_func in the pseudocode below.
8197 // (iii) Create a task with the task entry point created in (ii)
8198 //
8199 // That is we create the following
8200 // struct task_with_privates {
8201 // struct kmp_task_ompbuilder_t task_struct;
8202 // struct privates {
8203 // [2 x ptr] ; baseptrs
8204 // [2 x ptr] ; ptrs
8205 // [2 x i64] ; sizes
8206 // }
8207 // }
8208 // void user_code_that_offloads(...) {
8209 // %.offload_baseptrs = alloca [2 x ptr], align 8
8210 // %.offload_ptrs = alloca [2 x ptr], align 8
8211 // %.offload_sizes = alloca [2 x i64], align 8
8212 //
8213 // %structArg = alloca { ptr, ptr, ptr }, align 8
8214  // %structArg[0] = a
8215  // %structArg[1] = b
8216  // %structArg[2] = &n
8217 //
8218 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8219 // sizeof(kmp_task_ompbuilder_t),
8220 // sizeof(structArg),
8221 // @.omp_target_task_proxy_func,
8222 // ...)
8223 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8224 // sizeof(structArg))
8225 // memcpy(target_task_with_privates->privates->baseptrs,
8226 // offload_baseptrs, sizeof(offload_baseptrs)
8227 // memcpy(target_task_with_privates->privates->ptrs,
8228 // offload_ptrs, sizeof(offload_ptrs)
8229 // memcpy(target_task_with_privates->privates->sizes,
8230 // offload_sizes, sizeof(offload_sizes)
8231 // dependencies_array = ...
8232 // ;; if nowait not present
8233 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8234 // call @__kmpc_omp_task_begin_if0(...)
8235 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8236 // %target_task_with_privates)
8237 // call @__kmpc_omp_task_complete_if0(...)
8238 // }
8239 //
8240 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8241 // ptr %task) {
8242 // %structArg = alloca {ptr, ptr, ptr}
8243 // %task_ptr = getelementptr(%task, 0, 0)
8244 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8245  // memcpy(%structArg, %shared_data, sizeof(%structArg))
8246 //
8247 // %offloading_arrays = getelementptr(%task, 0, 1)
8248 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8249 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8250 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8251 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8252 // %offload_sizes, %structArg)
8253 // }
8254 //
8255 // We need the proxy function because the signature of the task entry point
8256 // expected by kmpc_omp_task is always the same and will be different from
8257 // that of the kernel_launch function.
8258 //
8259 // kernel_launch_function is generated by emitKernelLaunch and has the
8260 // always_inline attribute. For this example, it'll look like so:
8261 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8262 // %offload_sizes, %structArg) alwaysinline {
8263 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8264 // ; load aggregated data from %structArg
8265 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8266 // ; offload_sizes
8267 // call i32 @__tgt_target_kernel(...,
8268 // outlined_device_function,
8269 // ptr %kernel_args)
8270 // }
8271 // void outlined_device_function(ptr a, ptr b, ptr n) {
8272 // n = *n_ptr;
8273 // do i = 1, 10
8274 // a(i) = b(i) + n
8275 // }
8276 //
8277 BasicBlock *TargetTaskBodyBB =
8278 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8279 BasicBlock *TargetTaskAllocaBB =
8280 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8281
8282 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8283 TargetTaskAllocaBB->begin());
8284 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8285
8286 OutlineInfo OI;
8287 OI.EntryBB = TargetTaskAllocaBB;
8288 OI.OuterAllocaBB = AllocaIP.getBlock();
8289
8290 // Add the thread ID argument.
8291  SmallVector<Instruction *, 4> ToBeDeleted;
8292  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
8293 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8294
8295 // Generate the task body which will subsequently be outlined.
8296 Builder.restoreIP(TargetTaskBodyIP);
8297 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8298 return Err;
8299
8300  // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8301 // it is given. These blocks are enumerated by
8302 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8303 // to be outside the region. In other words, OI.ExitBlock is expected to be
8304 // the start of the region after the outlining. We used to set OI.ExitBlock
8305 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8306 // except when the task body is a single basic block. In that case,
8307 // OI.ExitBlock is set to the single task body block and will get left out of
8308 // the outlining process. So, simply create a new empty block to which we
8309  // unconditionally branch from where TaskBodyCB left off.
8310 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8311 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8312 /*IsFinished=*/true);
8313
8314 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8315 bool NeedsTargetTask = HasNoWait && DeviceID;
8316 if (NeedsTargetTask) {
8317 for (auto *V :
8318 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8319 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8320 RTArgs.SizesArray}) {
8321      if (V) {
8322        OffloadingArraysToPrivatize.push_back(V);
8323 OI.ExcludeArgsFromAggregate.push_back(V);
8324 }
8325 }
8326 }
8327 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8328 DeviceID, OffloadingArraysToPrivatize](
8329 Function &OutlinedFn) mutable {
8330 assert(OutlinedFn.hasOneUse() &&
8331 "there must be a single user for the outlined function");
8332
8333 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8334
8335 // The first argument of StaleCI is always the thread id.
8336 // The next few arguments are the pointers to offloading arrays
8337 // if any. (see OffloadingArraysToPrivatize)
8338 // Finally, all other local values that are live-in into the outlined region
8339 // end up in a structure whose pointer is passed as the last argument. This
8340 // piece of data is passed in the "shared" field of the task structure. So,
8341 // we know we have to pass shareds to the task if the number of arguments is
8342    // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
8343 // thread id. Further, for safety, we assert that the number of arguments of
8344 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8345 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8346 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8347 assert((!HasShareds ||
8348 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8349 "Wrong number of arguments for StaleCI when shareds are present");
8350 int SharedArgOperandNo =
8351 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8352
8353 StructType *TaskWithPrivatesTy =
8354 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8355 StructType *PrivatesTy = nullptr;
8356
8357 if (!OffloadingArraysToPrivatize.empty())
8358 PrivatesTy =
8359 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8360
8361    Function *ProxyFn = emitTargetTaskProxyFunction(
8362        *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8363 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8364
8365 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8366 << "\n");
8367
8368 Builder.SetInsertPoint(StaleCI);
8369
8370 // Gather the arguments for emitting the runtime call.
8371 uint32_t SrcLocStrSize;
8372 Constant *SrcLocStr =
8373 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8374 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8375
8376 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8377 //
8378 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8379    // the DeviceID to the deferred task, and also because
8380    // @__kmpc_omp_target_task_alloc creates an untied/async task.
8381 Function *TaskAllocFn =
8382 !NeedsTargetTask
8383 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8384 : getOrCreateRuntimeFunctionPtr(
8385 OMPRTL___kmpc_omp_target_task_alloc);
8386
8387 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
8388 // call.
8389 Value *ThreadID = getOrCreateThreadID(Ident);
8390
8391 // Argument - `sizeof_kmp_task_t` (TaskSize)
8392    // TaskSize refers to the size in bytes of the kmp_task_t data structure
8393 // plus any other data to be passed to the target task, if any, which
8394 // is packed into a struct. kmp_task_t and the struct so created are
8395 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8396 Value *TaskSize = Builder.getInt64(
8397 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8398
8399 // Argument - `sizeof_shareds` (SharedsSize)
8400 // SharedsSize refers to the shareds array size in the kmp_task_t data
8401 // structure.
8402 Value *SharedsSize = Builder.getInt64(0);
8403 if (HasShareds) {
8404 auto *ArgStructAlloca =
8405 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8406 assert(ArgStructAlloca &&
8407 "Unable to find the alloca instruction corresponding to arguments "
8408 "for extracted function");
8409 auto *ArgStructType =
8410 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8411 assert(ArgStructType && "Unable to find struct type corresponding to "
8412 "arguments for extracted function");
8413 SharedsSize =
8414 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8415 }
8416
8417 // Argument - `flags`
8418 // Task is tied iff (Flags & 1) == 1.
8419 // Task is untied iff (Flags & 1) == 0.
8420 // Task is final iff (Flags & 2) == 2.
8421 // Task is not final iff (Flags & 2) == 0.
8422 // A target task is not final and is untied.
8423 Value *Flags = Builder.getInt32(0);
8424
8425 // Emit the @__kmpc_omp_task_alloc runtime call
8426 // The runtime call returns a pointer to an area where the task captured
8427 // variables must be copied before the task is run (TaskData)
8428 CallInst *TaskData = nullptr;
8429
8430 SmallVector<llvm::Value *> TaskAllocArgs = {
8431 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8432 /*flags=*/Flags,
8433 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8434 /*task_func=*/ProxyFn};
8435
8436 if (NeedsTargetTask) {
8437 assert(DeviceID && "Expected non-empty device ID.");
8438 TaskAllocArgs.push_back(DeviceID);
8439 }
8440
8441 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
8442
8443 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8444 if (HasShareds) {
8445 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8446      Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8447          *this, Builder, TaskData, TaskWithPrivatesTy);
8448 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8449 SharedsSize);
8450 }
8451 if (!OffloadingArraysToPrivatize.empty()) {
8452 Value *Privates =
8453 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8454 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8455 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8456 [[maybe_unused]] Type *ArrayType =
8457 getOffloadingArrayType(PtrToPrivatize);
8458 assert(ArrayType && "ArrayType cannot be nullptr");
8459
8460 Type *ElementType = PrivatesTy->getElementType(i);
8461 assert(ElementType == ArrayType &&
8462 "ElementType should match ArrayType");
8463 (void)ArrayType;
8464
8465 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8466 Builder.CreateMemCpy(
8467 Dst, Alignment, PtrToPrivatize, Alignment,
8468 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8469 }
8470 }
8471
8472 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8473
8474 // ---------------------------------------------------------------
8475 // V5.2 13.8 target construct
8476 // If the nowait clause is present, execution of the target task
8477 // may be deferred. If the nowait clause is not present, the target task is
8478 // an included task.
8479 // ---------------------------------------------------------------
8480 // The above means that the lack of a nowait on the target construct
8481 // translates to '#pragma omp task if(0)'
8482 if (!NeedsTargetTask) {
8483 if (DepArray) {
8484 Function *TaskWaitFn =
8485 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8486 createRuntimeFunctionCall(
8487 TaskWaitFn,
8488 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8489 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8490 /*dep_list=*/DepArray,
8491 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8492 /*noalias_dep_list=*/
8493           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8494      }
8495 // Included task.
8496 Function *TaskBeginFn =
8497 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8498 Function *TaskCompleteFn =
8499 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8500 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8501 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
8502 CI->setDebugLoc(StaleCI->getDebugLoc());
8503 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8504 } else if (DepArray) {
8505 // HasNoWait - meaning the task may be deferred. Call
8506 // __kmpc_omp_task_with_deps if there are dependencies,
8507 // else call __kmpc_omp_task
8508 Function *TaskFn =
8509 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8510 createRuntimeFunctionCall(
8511 TaskFn,
8512 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8513 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8514 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8515 } else {
8516 // Emit the @__kmpc_omp_task runtime call to spawn the task
8517 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8518 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
8519 }
8520
8521 StaleCI->eraseFromParent();
8522 for (Instruction *I : llvm::reverse(ToBeDeleted))
8523 I->eraseFromParent();
8524 };
8525 addOutlineInfo(std::move(OI));
8526
8527 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8528 << *(Builder.GetInsertBlock()) << "\n");
8529 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8530 << *(Builder.GetInsertBlock()->getParent()->getParent())
8531 << "\n");
8532 return Builder.saveIP();
8533}
8534
8535Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8536 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8537 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8538 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8539 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8540 if (Error Err =
8541 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8542 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8543 return Err;
8544 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8545 return Error::success();
8546}
8547
8548static void emitTargetCall(
8549 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8550 OpenMPIRBuilder::InsertPointTy AllocaIP,
8551 OpenMPIRBuilder::TargetDataInfo &Info,
8552 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8553 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8554 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8555 SmallVectorImpl<Value *> &Args,
8556 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8557 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8558 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8559 bool HasNoWait, Value *DynCGroupMem,
8560 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8561 // Generate a function call to the host fallback implementation of the target
8562 // region. This is called by the host when no offload entry was generated for
8563 // the target region and when the offloading call fails at runtime.
8564 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8565 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8566 Builder.restoreIP(IP);
8567 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
8568 return Builder.saveIP();
8569 };
8570
8571 bool HasDependencies = Dependencies.size() > 0;
8572 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8573
8574 OpenMPIRBuilder::TargetKernelArgs KArgs;
8575
8576 auto TaskBodyCB =
8577 [&](Value *DeviceID, Value *RTLoc,
8578 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8579 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8580 // produce any.
8581 llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8582 // emitKernelLaunch makes the necessary runtime call to offload the
8583 // kernel. We then outline all that code into a separate function
8584 // ('kernel_launch_function' in the pseudo code above). This function is
8585 // then called by the target task proxy function (see
8586 // '@.omp_target_task_proxy_func' in the pseudo code above)
8587 // "@.omp_target_task_proxy_func' is generated by
8588 // emitTargetTaskProxyFunction.
8589 if (OutlinedFnID && DeviceID)
8590 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8591 EmitTargetCallFallbackCB, KArgs,
8592 DeviceID, RTLoc, TargetTaskAllocaIP);
8593
8594 // We only need to do the outlining if `DeviceID` is set to avoid calling
8595 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
8596 // generating the `else` branch of an `if` clause.
8597 //
8598 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
8599 // In this case, we execute the host implementation directly.
8600 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
8601 }());
8602
8603 OMPBuilder.Builder.restoreIP(AfterIP);
8604 return Error::success();
8605 };
8606
8607 auto &&EmitTargetCallElse =
8608 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8609 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8610 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8611 // produce any.
8612 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8613 if (RequiresOuterTargetTask) {
8614 // Arguments that are intended to be directly forwarded to an
8615 // emitKernelLaunch call are passed as nullptr, since
8616 // OutlinedFnID=nullptr results in that call not being done.
8617 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
8618 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
8619 /*RTLoc=*/nullptr, AllocaIP,
8620 Dependencies, EmptyRTArgs, HasNoWait);
8621 }
8622 return EmitTargetCallFallbackCB(Builder.saveIP());
8623 }());
8624
8625 Builder.restoreIP(AfterIP);
8626 return Error::success();
8627 };
8628
8629 auto &&EmitTargetCallThen =
8630 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
8631 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
8632 Info.HasNoWait = HasNoWait;
8633 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
8634 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
8635 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
8636 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
8637 /*IsNonContiguous=*/true,
8638 /*ForEndCall=*/false))
8639 return Err;
8640
8641 SmallVector<Value *, 3> NumTeamsC;
8642 for (auto [DefaultVal, RuntimeVal] :
8643 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
8644 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
8645 : Builder.getInt32(DefaultVal));
8646
8647 // Calculate the number of threads: 0 if no clauses are specified, otherwise
8648 // the minimum of the optional THREAD_LIMIT and NUM_THREADS clauses.
8649 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
8650 if (Clause)
8651 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
8652 /*isSigned=*/false);
8653 return Clause;
8654 };
8655 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
8656 if (Clause)
8657 Result =
8658 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
8659 Result, Clause)
8660 : Clause;
8661 };
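// For example, with thread_limit(4) and num_threads(8) both present, the two
// lambdas above yield NumThreads = min(4, 8) = 4 via CreateSelect; with
// neither clause present, NumThreads stays null and 0 is pushed below.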
8662
8663 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
8664 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
8665 SmallVector<Value *, 3> NumThreadsC;
8666 Value *MaxThreadsClause =
8667 RuntimeAttrs.TeamsThreadLimit.size() == 1
8668 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
8669 : nullptr;
8670
8671 for (auto [TeamsVal, TargetVal] : zip_equal(
8672 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
8673 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
8674 Value *NumThreads = InitMaxThreadsClause(TargetVal);
8675
8676 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
8677 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
8678
8679 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
8680 }
8681
8682 unsigned NumTargetItems = Info.NumberOfPtrs;
8683 // TODO: Use correct device ID
8684 Value *DeviceID = Builder.getInt64(OMP_DEVICEID_UNDEF);
8685 uint32_t SrcLocStrSize;
8686 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
8687 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
8688 llvm::omp::IdentFlag(0), 0);
8689
8690 Value *TripCount = RuntimeAttrs.LoopTripCount
8691 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
8692 Builder.getInt64Ty(),
8693 /*isSigned=*/false)
8694 : Builder.getInt64(0);
8695
8696 // Request zero groupprivate bytes by default.
8697 if (!DynCGroupMem)
8698 DynCGroupMem = Builder.getInt32(0);
8699
8700 KArgs = OpenMPIRBuilder::TargetKernelArgs(
8701 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
8702 HasNoWait, DynCGroupMemFallback);
8703
8704 // Assume no error was returned because TaskBodyCB and
8705 // EmitTargetCallFallbackCB don't produce any.
8706 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8707 // The presence of certain clauses on the target directive require the
8708 // explicit generation of the target task.
8709 if (RequiresOuterTargetTask)
8710 return OMPBuilder.emitTargetTask(TaskBodyCB, DeviceID, RTLoc, AllocaIP,
8711 Dependencies, KArgs.RTArgs,
8712 Info.HasNoWait);
8713
8714 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
8715 EmitTargetCallFallbackCB, KArgs,
8716 DeviceID, RTLoc, AllocaIP);
8717 }());
8718
8719 Builder.restoreIP(AfterIP);
8720 return Error::success();
8721 };
8722
8723 // If we don't have an ID for the target region, it means an offload entry
8724 // wasn't created. In this case we just run the host fallback directly and
8725 // ignore any potential 'if' clauses.
8726 if (!OutlinedFnID) {
8727 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
8728 return;
8729 }
8730
8731 // If there's no 'if' clause, only generate the kernel launch code path.
8732 if (!IfCond) {
8733 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
8734 return;
8735 }
8736
8737 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
8738 EmitTargetCallElse, AllocaIP));
8739}
8740
8741OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
8742 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
8743 InsertPointTy CodeGenIP, TargetDataInfo &Info,
8744 TargetRegionEntryInfo &EntryInfo,
8745 const TargetKernelDefaultAttrs &DefaultAttrs,
8746 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
8747 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
8748 OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc,
8749 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
8750 CustomMapperCallbackTy CustomMapperCB,
8751 const SmallVector<DependData> &Dependencies, bool HasNowait,
8752 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8753
8754 if (!updateToLocation(Loc))
8755 return InsertPointTy();
8756
8757 Builder.restoreIP(CodeGenIP);
8758
8759 Function *OutlinedFn;
8760 Constant *OutlinedFnID = nullptr;
8761 // The target region is outlined into its own function. The LLVM IR for
8762 // the target region itself is generated using the callbacks CBFunc
8763 // and ArgAccessorFuncCB.
8764 if (Error Err = emitTargetOutlinedFunction(
8765 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
8766 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
8767 return Err;
8768
8769 // If we are not on the target device, then we need to generate code
8770 // to make a remote call (offload) to the previously outlined function
8771 // that represents the target region. Do that now.
8772 if (!Config.isTargetDevice())
8773 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
8774 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
8775 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
8776 DynCGroupMemFallback);
8777 return Builder.saveIP();
8778}
8779
8780std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
8781 StringRef FirstSeparator,
8782 StringRef Separator) {
8783 SmallString<128> Buffer;
8784 llvm::raw_svector_ostream OS(Buffer);
8785 StringRef Sep = FirstSeparator;
8786 for (StringRef Part : Parts) {
8787 OS << Sep << Part;
8788 Sep = Separator;
8789 }
8790 return OS.str().str();
8791}
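// For example, getNameWithSeparators({"gomp_critical_user_x", "var"}, ".", ".")
// returns ".gomp_critical_user_x.var", the shape getOMPCriticalRegionLock
// below depends on.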
8792
8793std::string
8794OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
8795 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
8796 Config.separator());
8797}
8798
8799GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
8800 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
8801 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
8802 if (Elem.second) {
8803 assert(Elem.second->getValueType() == Ty &&
8804 "OMP internal variable has different type than requested");
8805 } else {
8806 // TODO: investigate the appropriate linkage type used for the global
8807 // variable for possibly changing that to internal or private, or maybe
8808 // create different versions of the function for different OMP internal
8809 // variables.
8810 const DataLayout &DL = M.getDataLayout();
8811 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
8812 // default global AS is 1.
8813 // See double-target-call-with-declare-target.f90 and
8814 // declare-target-vars-in-target-region.f90 libomptarget
8815 // tests.
8816 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
8817 : M.getTargetTriple().isAMDGPU()
8818 ? 0
8819 : DL.getDefaultGlobalsAddressSpace();
8820 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
8821 ? GlobalValue::InternalLinkage
8822 : GlobalValue::CommonLinkage;
8823 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
8824 Constant::getNullValue(Ty), Elem.first(),
8825 /*InsertBefore=*/nullptr,
8826 GlobalValue::NotThreadLocal, AddressSpaceVal);
8827 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
8828 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
8829 GV->setAlignment(std::max(TypeAlign, PtrAlign));
8830 Elem.second = GV;
8831 }
8832
8833 return Elem.second;
8834}
8835
8836Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
8837 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
8838 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
8839 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
8840}
8841
8842Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
8843 LLVMContext &Ctx = Builder.getContext();
8844 Value *Null =
8845 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
8846 Value *SizeGep =
8847 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
8848 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
8849 return SizePtrToInt;
8850}
8851
8852GlobalVariable *
8853OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
8854 std::string VarName) {
8855 llvm::Constant *MaptypesArrayInit =
8856 llvm::ConstantDataArray::get(M.getContext(), Mappings);
8857 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
8858 M, MaptypesArrayInit->getType(),
8859 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
8860 VarName);
8861 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
8862 return MaptypesArrayGlobal;
8863}
8864
8865void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
8866 InsertPointTy AllocaIP,
8867 unsigned NumOperands,
8868 struct MapperAllocas &MapperAllocas) {
8869 if (!updateToLocation(Loc))
8870 return;
8871
8872 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8873 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8874 Builder.restoreIP(AllocaIP);
8875 AllocaInst *ArgsBase = Builder.CreateAlloca(
8876 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
8877 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
8878 ".offload_ptrs");
8879 AllocaInst *ArgSizes = Builder.CreateAlloca(
8880 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
8881 updateToLocation(Loc);
8882 MapperAllocas.ArgsBase = ArgsBase;
8883 MapperAllocas.Args = Args;
8884 MapperAllocas.ArgSizes = ArgSizes;
8885}
8886
8887void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
8888 Function *MapperFunc, Value *SrcLocInfo,
8889 Value *MaptypesArg, Value *MapnamesArg,
8890 struct MapperAllocas &MapperAllocas,
8891 int64_t DeviceID, unsigned NumOperands) {
8892 if (!updateToLocation(Loc))
8893 return;
8894
8895 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
8896 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
8897 Value *ArgsBaseGEP =
8898 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
8899 {Builder.getInt32(0), Builder.getInt32(0)});
8900 Value *ArgsGEP =
8901 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
8902 {Builder.getInt32(0), Builder.getInt32(0)});
8903 Value *ArgSizesGEP =
8904 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
8905 {Builder.getInt32(0), Builder.getInt32(0)});
8906 Value *NullPtr =
8907 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
8908 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
8909 Builder.getInt32(NumOperands),
8910 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
8911 MaptypesArg, MapnamesArg, NullPtr});
8912}
8913
8914void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
8915 TargetDataRTArgs &RTArgs,
8916 TargetDataInfo &Info,
8917 bool ForEndCall) {
8918 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
8919 "expected region end call to runtime only when end call is separate");
8920 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
8921 auto VoidPtrTy = UnqualPtrTy;
8922 auto VoidPtrPtrTy = UnqualPtrTy;
8923 auto Int64Ty = Type::getInt64Ty(M.getContext());
8924 auto Int64PtrTy = UnqualPtrTy;
8925
8926 if (!Info.NumberOfPtrs) {
8927 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8928 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8929 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
8930 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
8931 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8932 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8933 return;
8934 }
8935
8936 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
8937 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
8938 Info.RTArgs.BasePointersArray,
8939 /*Idx0=*/0, /*Idx1=*/0);
8940 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
8941 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
8942 /*Idx0=*/0,
8943 /*Idx1=*/0);
8944 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
8945 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
8946 /*Idx0=*/0, /*Idx1=*/0);
8947 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
8948 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
8949 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
8950 : Info.RTArgs.MapTypesArray,
8951 /*Idx0=*/0,
8952 /*Idx1=*/0);
8953
8954 // Only emit the mapper information arrays if debug information is
8955 // requested.
8956 if (!Info.EmitDebug)
8957 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
8958 else
8959 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
8960 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
8961 /*Idx0=*/0,
8962 /*Idx1=*/0);
8963 // If there is no user-defined mapper, set the mapper array to nullptr to
8964 // avoid an unnecessary data privatization
8965 if (!Info.HasMapper)
8966 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
8967 else
8968 RTArgs.MappersArray =
8969 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
8970}
8971
8972void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
8973 InsertPointTy CodeGenIP,
8974 MapInfosTy &CombinedInfo,
8975 TargetDataInfo &Info) {
8976 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
8977 CombinedInfo.NonContigInfo;
8978
8979 // Build an array of struct descriptor_dim and then assign it to
8980 // offload_args.
8981 //
8982 // struct descriptor_dim {
8983 // uint64_t offset;
8984 // uint64_t count;
8985 // uint64_t stride
8986 // };
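// Illustrative sketch (hypothetical values): a non-contiguous column section
// a[0:4][2:1] of an 'int a[4][8]' yields one descriptor_dim per dimension,
// roughly
//   { offset = 2, count = 1, stride = 4 }    // innermost, stride in bytes
//   { offset = 0, count = 4, stride = 32 }   // outermost
// The actual values are taken from CombinedInfo.NonContigInfo below.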
8987 Type *Int64Ty = Builder.getInt64Ty();
8988 StructType *DimTy = StructType::create(
8989 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
8990 "struct.descriptor_dim");
8991
8992 enum { OffsetFD = 0, CountFD, StrideFD };
8993 // We need two index variables here since the size of "Dims" is the same as
8994 // the size of Components; however, the number of offset, count, and stride
8995 // entries equals the number of non-contiguous base declarations.
8996 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
8997 // Skip emitting IR if the dimension size is 1, since it cannot be
8998 // non-contiguous.
8999 if (NonContigInfo.Dims[I] == 1)
9000 continue;
9001 Builder.restoreIP(AllocaIP);
9002 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9003 AllocaInst *DimsAddr =
9004 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9005 Builder.restoreIP(CodeGenIP);
9006 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9007 unsigned RevIdx = EE - II - 1;
9008 Value *DimsLVal = Builder.CreateInBoundsGEP(
9009 DimsAddr->getAllocatedType(), DimsAddr,
9010 {Builder.getInt64(0), Builder.getInt64(II)});
9011 // Offset
9012 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9013 Builder.CreateAlignedStore(
9014 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9015 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9016 // Count
9017 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9018 Builder.CreateAlignedStore(
9019 NonContigInfo.Counts[L][RevIdx], CountLVal,
9020 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9021 // Stride
9022 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9023 Builder.CreateAlignedStore(
9024 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9025 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9026 }
9027 // args[I] = &dims
9028 Builder.restoreIP(CodeGenIP);
9029 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9030 DimsAddr, Builder.getPtrTy());
9031 Value *P = Builder.CreateConstInBoundsGEP2_32(
9032 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9033 Info.RTArgs.PointersArray, 0, I);
9034 Builder.CreateAlignedStore(
9035 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9036 ++L;
9037 }
9038}
9039
9040void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9041 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9042 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9043 BasicBlock *ExitBB, bool IsInit) {
9044 StringRef Prefix = IsInit ? ".init" : ".del";
9045
9046 // Evaluate if this is an array section.
9047 BasicBlock *BodyBB = BasicBlock::Create(
9048 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9049 Value *IsArray =
9050 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9051 Value *DeleteBit = Builder.CreateAnd(
9052 MapType,
9053 Builder.getInt64(
9054 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9055 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9056 Value *DeleteCond;
9057 Value *Cond;
9058 if (IsInit) {
9059 // base != begin?
9060 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9061 // IsPtrAndObj?
9062 Value *PtrAndObjBit = Builder.CreateAnd(
9063 MapType,
9064 Builder.getInt64(
9065 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9066 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
9067 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
9068 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
9069 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9070 DeleteCond = Builder.CreateIsNull(
9071 DeleteBit,
9072 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9073 } else {
9074 Cond = IsArray;
9075 DeleteCond = Builder.CreateIsNotNull(
9076 DeleteBit,
9077 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9078 }
9079 Cond = Builder.CreateAnd(Cond, DeleteCond);
9080 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9081
9082 emitBlock(BodyBB, MapperFn);
9083 // Get the array size by multiplying element size and element number (i.e., \p
9084 // Size).
9085 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9086 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9087 // memory allocation/deletion purpose only.
9088 Value *MapTypeArg = Builder.CreateAnd(
9089 MapType,
9090 Builder.getInt64(
9091 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9092 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9093 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9094 MapTypeArg = Builder.CreateOr(
9095 MapTypeArg,
9096 Builder.getInt64(
9097 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9098 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
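// E.g. an incoming 'tofrom' map type becomes (MapType & ~(TO | FROM)) |
// IMPLICIT here: a pure allocation (or deletion) request with the implicit
// bit set, as described above.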
9099
9100 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9101 // data structure.
9102 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9103 ArraySize, MapTypeArg, MapName};
9104 createRuntimeFunctionCall(
9105 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9106 OffloadingArgs);
9107}
9108
9109Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
9110 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
9111 llvm::Value *BeginArg)>
9112 GenMapInfoCB,
9113 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9114 SmallVector<Type *> Params;
9115 Params.emplace_back(Builder.getPtrTy());
9116 Params.emplace_back(Builder.getPtrTy());
9117 Params.emplace_back(Builder.getPtrTy());
9118 Params.emplace_back(Builder.getInt64Ty());
9119 Params.emplace_back(Builder.getInt64Ty());
9120 Params.emplace_back(Builder.getPtrTy());
9121
9122 auto *FnTy =
9123 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9124
9125 SmallString<64> TyStr;
9126 raw_svector_ostream Out(TyStr);
9127 Function *MapperFn =
9128 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, M);
9129 MapperFn->addFnAttr(Attribute::NoInline);
9130 MapperFn->addFnAttr(Attribute::NoUnwind);
9131 MapperFn->addParamAttr(0, Attribute::NoUndef);
9132 MapperFn->addParamAttr(1, Attribute::NoUndef);
9133 MapperFn->addParamAttr(2, Attribute::NoUndef);
9134 MapperFn->addParamAttr(3, Attribute::NoUndef);
9135 MapperFn->addParamAttr(4, Attribute::NoUndef);
9136 MapperFn->addParamAttr(5, Attribute::NoUndef);
9137
9138 // Start the mapper function code generation.
9139 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9140 auto SavedIP = Builder.saveIP();
9141 Builder.SetInsertPoint(EntryBB);
9142
9143 Value *MapperHandle = MapperFn->getArg(0);
9144 Value *BaseIn = MapperFn->getArg(1);
9145 Value *BeginIn = MapperFn->getArg(2);
9146 Value *Size = MapperFn->getArg(3);
9147 Value *MapType = MapperFn->getArg(4);
9148 Value *MapName = MapperFn->getArg(5);
9149
9150 // Compute the starting and end addresses of array elements.
9151 // Prepare common arguments for array initiation and deletion.
9152 // Convert the size in bytes into the number of array elements.
9153 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9154 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9155 Value *PtrBegin = BeginIn;
9156 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
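// E.g. with ElemTy = double (8 bytes) and an incoming byte size of 64, the
// division above yields Size = 8 elements, and PtrEnd points one past the
// last element to be mapped.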
9157
9158 // Emit array initiation if this is an array section and \p MapType indicates
9159 // that memory allocation is required.
9160 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9161 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9162 MapType, MapName, ElementSize, HeadBB,
9163 /*IsInit=*/true);
9164
9165 // Emit a for loop to iterate through \p Size elements and map each of them.
9166
9167 // Emit the loop header block.
9168 emitBlock(HeadBB, MapperFn);
9169 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9170 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9171 // Evaluate whether the initial condition is satisfied.
9172 Value *IsEmpty =
9173 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9174 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9175
9176 // Emit the loop body block.
9177 emitBlock(BodyBB, MapperFn);
9178 BasicBlock *LastBB = BodyBB;
9179 PHINode *PtrPHI =
9180 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9181 PtrPHI->addIncoming(PtrBegin, HeadBB);
9182
9183 // Get map clause information. Fill up the arrays with all mapped variables.
9184 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9185 if (!Info)
9186 return Info.takeError();
9187
9188 // Call the runtime API __tgt_mapper_num_components to get the number of
9189 // pre-existing components.
9190 Value *OffloadingArgs[] = {MapperHandle};
9191 Value *PreviousSize = createRuntimeFunctionCall(
9192 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9193 OffloadingArgs);
9194 Value *ShiftedPreviousSize =
9195 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9196
9197 // Fill up the runtime mapper handle for all components.
9198 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9199 Value *CurBaseArg = Info->BasePointers[I];
9200 Value *CurBeginArg = Info->Pointers[I];
9201 Value *CurSizeArg = Info->Sizes[I];
9202 Value *CurNameArg = Info->Names.size()
9203 ? Info->Names[I]
9204 : Constant::getNullValue(Builder.getPtrTy());
9205
9206 // Extract the MEMBER_OF field from the map type.
9207 Value *OriMapType = Builder.getInt64(
9208 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9209 Info->Types[I]));
9210 Value *MemberMapType =
9211 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9212
9213 // Combine the map type inherited from user-defined mapper with that
9214 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9215 // bits of the \a MapType, which is the input argument of the mapper
9216 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9217 // bits of MemberMapType.
9218 // [OpenMP 5.0], 1.2.6. map-type decay.
9219 // | alloc | to | from | tofrom | release | delete
9220 // ----------------------------------------------------------
9221 // alloc | alloc | alloc | alloc | alloc | release | delete
9222 // to | alloc | to | alloc | to | release | delete
9223 // from | alloc | alloc | from | from | release | delete
9224 // tofrom | alloc | to | from | tofrom | release | delete
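// Worked example: if the member's map type in the mapper declaration is
// 'tofrom' and the incoming \a MapType has only OMP_MAP_TO set, LeftToFrom
// equals OMP_MAP_TO, the ToBB branch below clears OMP_MAP_FROM, and the
// member decays to 'to', matching the table above.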
9225 Value *LeftToFrom = Builder.CreateAnd(
9226 MapType,
9227 Builder.getInt64(
9228 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9229 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9230 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9231 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9232 BasicBlock *AllocElseBB =
9233 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9234 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9235 BasicBlock *ToElseBB =
9236 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9237 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9238 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9239 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9240 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9241 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9242 emitBlock(AllocBB, MapperFn);
9243 Value *AllocMapType = Builder.CreateAnd(
9244 MemberMapType,
9245 Builder.getInt64(
9246 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9247 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9248 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9249 Builder.CreateBr(EndBB);
9250 emitBlock(AllocElseBB, MapperFn);
9251 Value *IsTo = Builder.CreateICmpEQ(
9252 LeftToFrom,
9253 Builder.getInt64(
9254 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9255 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9256 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9257 // In case of to, clear OMP_MAP_FROM.
9258 emitBlock(ToBB, MapperFn);
9259 Value *ToMapType = Builder.CreateAnd(
9260 MemberMapType,
9261 Builder.getInt64(
9262 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9263 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9264 Builder.CreateBr(EndBB);
9265 emitBlock(ToElseBB, MapperFn);
9266 Value *IsFrom = Builder.CreateICmpEQ(
9267 LeftToFrom,
9268 Builder.getInt64(
9269 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9270 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9271 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9272 // In case of from, clear OMP_MAP_TO.
9273 emitBlock(FromBB, MapperFn);
9274 Value *FromMapType = Builder.CreateAnd(
9275 MemberMapType,
9276 Builder.getInt64(
9277 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9278 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9279 // In case of tofrom, do nothing.
9280 emitBlock(EndBB, MapperFn);
9281 LastBB = EndBB;
9282 PHINode *CurMapType =
9283 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9284 CurMapType->addIncoming(AllocMapType, AllocBB);
9285 CurMapType->addIncoming(ToMapType, ToBB);
9286 CurMapType->addIncoming(FromMapType, FromBB);
9287 CurMapType->addIncoming(MemberMapType, ToElseBB);
9288
9289 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9290 CurSizeArg, CurMapType, CurNameArg};
9291
9292 auto ChildMapperFn = CustomMapperCB(I);
9293 if (!ChildMapperFn)
9294 return ChildMapperFn.takeError();
9295 if (*ChildMapperFn) {
9296 // Call the corresponding mapper function.
9297 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9298 ->setDoesNotThrow();
9299 } else {
9300 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9301 // data structure.
9302 createRuntimeFunctionCall(
9303 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9304 OffloadingArgs);
9305 }
9306 }
9307
9308 // Update the pointer to point to the next element that needs to be mapped,
9309 // and check whether we have mapped all elements.
9310 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9311 "omp.arraymap.next");
9312 PtrPHI->addIncoming(PtrNext, LastBB);
9313 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9314 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9315 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9316
9317 emitBlock(ExitBB, MapperFn);
9318 // Emit array deletion if this is an array section and \p MapType indicates
9319 // that deletion is required.
9320 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9321 MapType, MapName, ElementSize, DoneBB,
9322 /*IsInit=*/false);
9323
9324 // Emit the function exit block.
9325 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9326
9327 Builder.CreateRetVoid();
9328 Builder.restoreIP(SavedIP);
9329 return MapperFn;
9330}
9331
9332Error OpenMPIRBuilder::emitOffloadingArrays(
9333 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9334 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9335 bool IsNonContiguous,
9336 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9337
9338 // Reset the array information.
9339 Info.clearArrayInfo();
9340 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9341
9342 if (Info.NumberOfPtrs == 0)
9343 return Error::success();
9344
9345 Builder.restoreIP(AllocaIP);
9346 // Detect whether any captured size requires runtime evaluation; if none
9347 // does, a constant array can eventually be used for the map sizes.
9348 ArrayType *PointerArrayType =
9349 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9350
9351 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9352 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9353
9354 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9355 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9356 AllocaInst *MappersArray = Builder.CreateAlloca(
9357 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9358 Info.RTArgs.MappersArray = MappersArray;
9359
9360 // If we don't have any VLA types or other types that require runtime
9361 // evaluation, we can use a constant array for the map sizes, otherwise we
9362 // need to fill up the arrays as we do for the pointers.
9363 Type *Int64Ty = Builder.getInt64Ty();
9364 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9365 ConstantInt::get(Int64Ty, 0));
9366 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9367 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9368 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9369 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9370 if (IsNonContiguous &&
9371 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9372 CombinedInfo.Types[I] &
9373 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9374 ConstSizes[I] =
9375 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9376 else
9377 ConstSizes[I] = CI;
9378 continue;
9379 }
9380 }
9381 RuntimeSizes.set(I);
9382 }
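// E.g. a fixed-size scalar map contributes a ConstantInt store size and
// stays in ConstSizes, while a runtime-sized section such as 'map(a[0:n])'
// sets its bit in RuntimeSizes and is stored in the per-pointer loop below.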
9383
9384 if (RuntimeSizes.all()) {
9385 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9386 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9387 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9388 restoreIPandDebugLoc(Builder, CodeGenIP);
9389 } else {
9390 auto *SizesArrayInit = ConstantArray::get(
9391 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9392 std::string Name = createPlatformSpecificName({"offload_sizes"});
9393 auto *SizesArrayGbl =
9394 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9395 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9396 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9397
9398 if (!RuntimeSizes.any()) {
9399 Info.RTArgs.SizesArray = SizesArrayGbl;
9400 } else {
9401 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9402 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9403 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9404 AllocaInst *Buffer = Builder.CreateAlloca(
9405 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9406 Buffer->setAlignment(OffloadSizeAlign);
9407 restoreIPandDebugLoc(Builder, CodeGenIP);
9408 Builder.CreateMemCpy(
9409 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9410 SizesArrayGbl, OffloadSizeAlign,
9411 Builder.getIntN(
9412 IndexSize,
9413 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9414
9415 Info.RTArgs.SizesArray = Buffer;
9416 }
9417 restoreIPandDebugLoc(Builder, CodeGenIP);
9418 }
9419
9420 // The map types are always constant so we don't need to generate code to
9421 // fill arrays. Instead, we create an array constant.
9422 SmallVector<uint64_t, 4> Mapping;
9423 for (auto mapFlag : CombinedInfo.Types)
9424 Mapping.push_back(
9425 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9426 mapFlag));
9427 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9428 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9429 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9430
9431 // The information types are only built if provided.
9432 if (!CombinedInfo.Names.empty()) {
9433 auto *MapNamesArrayGbl = createOffloadMapnames(
9434 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9435 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9436 Info.EmitDebug = true;
9437 } else {
9438 Info.RTArgs.MapNamesArray =
9439 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9440 Info.EmitDebug = false;
9441 }
9442
9443 // If there's a present map type modifier, it must not be applied to the end
9444 // of a region, so generate a separate map type array in that case.
9445 if (Info.separateBeginEndCalls()) {
9446 bool EndMapTypesDiffer = false;
9447 for (uint64_t &Type : Mapping) {
9448 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9449 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9450 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9451 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9452 EndMapTypesDiffer = true;
9453 }
9454 }
9455 if (EndMapTypesDiffer) {
9456 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9457 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9458 }
9459 }
9460
9461 PointerType *PtrTy = Builder.getPtrTy();
9462 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9463 Value *BPVal = CombinedInfo.BasePointers[I];
9464 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9465 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9466 0, I);
9467 Builder.CreateAlignedStore(BPVal, BP,
9468 M.getDataLayout().getPrefTypeAlign(PtrTy));
9469
9470 if (Info.requiresDevicePointerInfo()) {
9471 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9472 CodeGenIP = Builder.saveIP();
9473 Builder.restoreIP(AllocaIP);
9474 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9475 Builder.restoreIP(CodeGenIP);
9476 if (DeviceAddrCB)
9477 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9478 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9479 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9480 if (DeviceAddrCB)
9481 DeviceAddrCB(I, BP);
9482 }
9483 }
9484
9485 Value *PVal = CombinedInfo.Pointers[I];
9486 Value *P = Builder.CreateConstInBoundsGEP2_32(
9487 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9488 I);
9489 // TODO: Check that the alignment is correct.
9490 Builder.CreateAlignedStore(PVal, P,
9491 M.getDataLayout().getPrefTypeAlign(PtrTy));
9492
9493 if (RuntimeSizes.test(I)) {
9494 Value *S = Builder.CreateConstInBoundsGEP2_32(
9495 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9496 /*Idx0=*/0,
9497 /*Idx1=*/I);
9498 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9499 Int64Ty,
9500 /*isSigned=*/true),
9501 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9502 }
9503 // Fill up the mapper array.
9504 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9505 Value *MFunc = ConstantPointerNull::get(PtrTy);
9506
9507 auto CustomMFunc = CustomMapperCB(I);
9508 if (!CustomMFunc)
9509 return CustomMFunc.takeError();
9510 if (*CustomMFunc)
9511 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9512
9513 Value *MAddr = Builder.CreateInBoundsGEP(
9514 MappersArray->getAllocatedType(), MappersArray,
9515 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9516 Builder.CreateAlignedStore(
9517 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9518 }
9519
9520 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9521 Info.NumberOfPtrs == 0)
9522 return Error::success();
9523 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9524 return Error::success();
9525}
9526
9527void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9528 BasicBlock *CurBB = Builder.GetInsertBlock();
9529
9530 if (!CurBB || CurBB->getTerminator()) {
9531 // If there is no insert point or the previous block is already
9532 // terminated, don't touch it.
9533 } else {
9534 // Otherwise, create a fall-through branch.
9535 Builder.CreateBr(Target);
9536 }
9537
9538 Builder.ClearInsertionPoint();
9539}
9540
9541void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9542 bool IsFinished) {
9543 BasicBlock *CurBB = Builder.GetInsertBlock();
9544
9545 // Fall out of the current block (if necessary).
9546 emitBranch(BB);
9547
9548 if (IsFinished && BB->use_empty()) {
9549 BB->eraseFromParent();
9550 return;
9551 }
9552
9553 // Place the block after the current block, if possible, or else at
9554 // the end of the function.
9555 if (CurBB && CurBB->getParent())
9556 CurFn->insert(std::next(CurBB->getIterator()), BB);
9557 else
9558 CurFn->insert(CurFn->end(), BB);
9559 Builder.SetInsertPoint(BB);
9560}
9561
9562Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9563 BodyGenCallbackTy ElseGen,
9564 InsertPointTy AllocaIP) {
9565 // If the condition constant folds and can be elided, try to avoid emitting
9566 // the condition and the dead arm of the if/else.
9567 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9568 auto CondConstant = CI->getSExtValue();
9569 if (CondConstant)
9570 return ThenGen(AllocaIP, Builder.saveIP());
9571
9572 return ElseGen(AllocaIP, Builder.saveIP());
9573 }
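// E.g. an 'if(1)' clause reaches here as a ConstantInt and only the 'then'
// arm is emitted; 'if(0)' emits only the 'else' arm, with no branch at all.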
9574
9575 Function *CurFn = Builder.GetInsertBlock()->getParent();
9576
9577 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9578 // emit the conditional branch.
9579 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9580 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9581 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9582 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9583 // Emit the 'then' code.
9584 emitBlock(ThenBlock, CurFn);
9585 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9586 return Err;
9587 emitBranch(ContBlock);
9588 // Emit the 'else' code if present.
9589 // There is no need to emit line number for unconditional branch.
9590 emitBlock(ElseBlock, CurFn);
9591 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
9592 return Err;
9593 // There is no need to emit line number for unconditional branch.
9594 emitBranch(ContBlock);
9595 // Emit the continuation block for code after the if.
9596 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
9597 return Error::success();
9598}
9599
9600bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
9601 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
9602 assert(!(AO == AtomicOrdering::NotAtomic ||
9603 AO == AtomicOrdering::Unordered) &&
9604 "Unexpected Atomic Ordering.");
9605
9606 bool Flush = false;
9607 llvm::AtomicOrdering FlushAO = llvm::AtomicOrdering::Monotonic;
9608
9609 switch (AK) {
9610 case Read:
9611 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
9612 AO == AtomicOrdering::SequentiallyConsistent) {
9613 FlushAO = AtomicOrdering::Acquire;
9614 Flush = true;
9615 }
9616 break;
9617 case Write:
9618 case Compare:
9619 case Update:
9620 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
9621 AO == AtomicOrdering::SequentiallyConsistent) {
9622 FlushAO = AtomicOrdering::Release;
9623 Flush = true;
9624 }
9625 break;
9626 case Capture:
9627 switch (AO) {
9628 case AtomicOrdering::Acquire:
9629 FlushAO = AtomicOrdering::Acquire;
9630 Flush = true;
9631 break;
9632 case AtomicOrdering::Release:
9633 FlushAO = AtomicOrdering::Release;
9634 Flush = true;
9635 break;
9636 case AtomicOrdering::AcquireRelease:
9637 case AtomicOrdering::SequentiallyConsistent:
9638 FlushAO = AtomicOrdering::AcquireRelease;
9639 Flush = true;
9640 break;
9641 default:
9642 // do nothing - leave silently.
9643 break;
9644 }
9645 }
9646
9647 if (Flush) {
9648 // The flush runtime call does not yet take a memory ordering, so we still
9649 // resolve which ordering the flush would need (FlushAO) but emit the plain
9650 // flush call for now.
9651 // TODO: pass `FlushAO` after memory ordering support is added
9652 (void)FlushAO;
9653 emitFlush(Loc);
9654 }
9655
9656 // For AO == AtomicOrdering::Monotonic and all other combinations,
9657 // do nothing.
9658 return Flush;
9659}
9660
9661OpenMPIRBuilder::InsertPointTy
9662OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
9663 AtomicOpValue &X, AtomicOpValue &V,
9664 AtomicOrdering AO, InsertPointTy AllocaIP) {
9665 if (!updateToLocation(Loc))
9666 return Loc.IP;
9667
9668 assert(X.Var->getType()->isPointerTy() &&
9669 "OMP Atomic expects a pointer to target memory");
9670 Type *XElemTy = X.ElemTy;
9671 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9672 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9673 "OMP atomic read expected a scalar type");
9674
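// Sketch of the integer path below for a 32-bit X (the ordering comes from
// AO; 'monotonic' shown for illustration):
//   %omp.atomic.read = load atomic i32, ptr %x monotonic, align 4
//   store i32 %omp.atomic.read, ptr %v, align 4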
9675 Value *XRead = nullptr;
9676
9677 if (XElemTy->isIntegerTy()) {
9678 LoadInst *XLD =
9679 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
9680 XLD->setAtomic(AO);
9681 XRead = cast<Value>(XLD);
9682 } else if (XElemTy->isStructTy()) {
9683 // FIXME: Add checks to ensure __atomic_load is emitted iff the
9684 // target does not support `atomicrmw` of the size of the struct
9685 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9686 OldVal->setAtomic(AO);
9687 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9688 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9689 OpenMPIRBuilder::AtomicInfo atomicInfo(
9690 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9691 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9692 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9693 XRead = AtomicLoadRes.first;
9694 OldVal->eraseFromParent();
9695 } else {
9696 // We need to perform atomic op as integer
9697 IntegerType *IntCastTy =
9698 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9699 LoadInst *XLoad =
9700 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
9701 XLoad->setAtomic(AO);
9702 if (XElemTy->isFloatingPointTy()) {
9703 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
9704 } else {
9705 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
9706 }
9707 }
9708 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
9709 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
9710 return Builder.saveIP();
9711}
9712
9713OpenMPIRBuilder::InsertPointTy
9714OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
9715 AtomicOpValue &X, Value *Expr,
9716 AtomicOrdering AO, InsertPointTy AllocaIP) {
9717 if (!updateToLocation(Loc))
9718 return Loc.IP;
9719
9720 assert(X.Var->getType()->isPointerTy() &&
9721 "OMP Atomic expects a pointer to target memory");
9722 Type *XElemTy = X.ElemTy;
9723 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9724 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
9725 "OMP atomic write expected a scalar type");
9726
9727 if (XElemTy->isIntegerTy()) {
9728 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
9729 XSt->setAtomic(AO);
9730 } else if (XElemTy->isStructTy()) {
9731 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
9732 const DataLayout &DL = OldVal->getModule()->getDataLayout();
9733 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
9734 OpenMPIRBuilder::AtomicInfo atomicInfo(
9735 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9736 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
9737 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
9738 OldVal->eraseFromParent();
9739 } else {
9740 // We need to bitcast and perform atomic op as integers
9741 IntegerType *IntCastTy =
9742 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9743 Value *ExprCast =
9744 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
9745 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
9746 XSt->setAtomic(AO);
9747 }
9748
9749 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
9750 return Builder.saveIP();
9751}
9752
9753OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
9754 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9755 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9756 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
9757 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9758 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
9759 if (!updateToLocation(Loc))
9760 return Loc.IP;
9761
9762 LLVM_DEBUG({
9763 Type *XTy = X.Var->getType();
9764 assert(XTy->isPointerTy() &&
9765 "OMP Atomic expects a pointer to target memory");
9766 Type *XElemTy = X.ElemTy;
9767 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
9768 XElemTy->isPointerTy()) &&
9769 "OMP atomic update expected a scalar type");
9770 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
9771 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
9772 "OpenMP atomic does not support LT or GT operations");
9773 });
9774
9775 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
9776 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
9777 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
9778 if (!AtomicResult)
9779 return AtomicResult.takeError();
9780 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
9781 return Builder.saveIP();
9782}
9783
9784// FIXME: Duplicating AtomicExpand
9785Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
9786 AtomicRMWInst::BinOp RMWOp) {
9787 switch (RMWOp) {
9788 case AtomicRMWInst::Add:
9789 return Builder.CreateAdd(Src1, Src2);
9790 case AtomicRMWInst::Sub:
9791 return Builder.CreateSub(Src1, Src2);
9792 case AtomicRMWInst::And:
9793 return Builder.CreateAnd(Src1, Src2);
9794 case AtomicRMWInst::Nand:
9795 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
9796 case AtomicRMWInst::Or:
9797 return Builder.CreateOr(Src1, Src2);
9798 case AtomicRMWInst::Xor:
9799 return Builder.CreateXor(Src1, Src2);
9800 case AtomicRMWInst::Xchg:
9801 case AtomicRMWInst::FAdd:
9802 case AtomicRMWInst::FSub:
9803 case AtomicRMWInst::BAD_BINOP:
9804 case AtomicRMWInst::Max:
9805 case AtomicRMWInst::Min:
9806 case AtomicRMWInst::UMax:
9807 case AtomicRMWInst::UMin:
9808 case AtomicRMWInst::FMax:
9809 case AtomicRMWInst::FMin:
9810 case AtomicRMWInst::FMaximum:
9811 case AtomicRMWInst::FMinimum:
9812 case AtomicRMWInst::UIncWrap:
9813 case AtomicRMWInst::UDecWrap:
9814 case AtomicRMWInst::USubCond:
9815 case AtomicRMWInst::USubSat:
9816 llvm_unreachable("Unsupported atomic update operation");
9817 }
9818 llvm_unreachable("Unsupported atomic update operation");
9819}
9820
9821Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
9822 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
9823 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
9824 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
9825 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9826 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
9827 // or a complex datatype.
9828 bool emitRMWOp = false;
9829 switch (RMWOp) {
9830 case AtomicRMWInst::Add:
9831 case AtomicRMWInst::And:
9832 case AtomicRMWInst::Nand:
9833 case AtomicRMWInst::Or:
9834 case AtomicRMWInst::Xor:
9835 case AtomicRMWInst::Xchg:
9836 emitRMWOp = XElemTy;
9837 break;
9838 case AtomicRMWInst::Sub:
9839 emitRMWOp = (IsXBinopExpr && XElemTy);
9840 break;
9841 default:
9842 emitRMWOp = false;
9843 }
9844 emitRMWOp &= XElemTy->isIntegerTy();
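// E.g. an integer 'x += expr' arrives as RMWOp = Add and is emitted directly
// as 'atomicrmw add' below, while a floating-point add or a reversed
// subtraction ('x = expr - x') falls through to the cmpxchg loop instead.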
9845
9846 std::pair<Value *, Value *> Res;
9847 if (emitRMWOp) {
9848 AtomicRMWInst *RMWInst =
9849 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
9850 if (T.isAMDGPU()) {
9851 if (IsIgnoreDenormalMode)
9852 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
9853 llvm::MDNode::get(Builder.getContext(), {}));
9854 if (!IsFineGrainedMemory)
9855 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
9856 llvm::MDNode::get(Builder.getContext(), {}));
9857 if (!IsRemoteMemory)
9858 RMWInst->setMetadata("amdgpu.no.remote.memory",
9859 llvm::MDNode::get(Builder.getContext(), {}));
9860 }
9861 Res.first = RMWInst;
9862 // Not needed except in the case of postfix captures. Generated anyway for
9863 // consistency with the else branch; any DCE pass will remove it.
9864 // AtomicRMWInst::Xchg does not have a corresponding instruction.
9865 if (RMWOp == AtomicRMWInst::Xchg)
9866 Res.second = Res.first;
9867 else
9868 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
9869 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
9870 XElemTy->isStructTy()) {
9871 LoadInst *OldVal =
9872 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
9873 OldVal->setAtomic(AO);
9874 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
9875 unsigned LoadSize =
9876 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
9877
9878 OpenMPIRBuilder::AtomicInfo atomicInfo(
9879 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
9880 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
9881 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
9882 BasicBlock *CurBB = Builder.GetInsertBlock();
9883 Instruction *CurBBTI = CurBB->getTerminator();
9884 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9885 BasicBlock *ExitBB =
9886 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9887 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9888 X->getName() + ".atomic.cont");
9889 ContBB->getTerminator()->eraseFromParent();
9890 Builder.restoreIP(AllocaIP);
9891 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9892 NewAtomicAddr->setName(X->getName() + "x.new.val");
9893 Builder.SetInsertPoint(ContBB);
9894 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9895 PHI->addIncoming(AtomicLoadRes.first, CurBB);
9896 Value *OldExprVal = PHI;
9897 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9898 if (!CBResult)
9899 return CBResult.takeError();
9900 Value *Upd = *CBResult;
9901 Builder.CreateStore(Upd, NewAtomicAddr);
9902 AtomicOrdering Failure =
9903 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9904 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
9905 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
9906 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
9907 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
9908 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
9909 OldVal->eraseFromParent();
9910 Res.first = OldExprVal;
9911 Res.second = Upd;
9912
9913 if (UnreachableInst *ExitTI =
9914 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9915 CurBBTI->eraseFromParent();
9916 Builder.SetInsertPoint(ExitBB);
9917 } else {
9918 Builder.SetInsertPoint(ExitTI);
9919 }
9920 } else {
9921 IntegerType *IntCastTy =
9922 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
9923 LoadInst *OldVal =
9924 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
9925 OldVal->setAtomic(AO);
9926 // CurBB
9927 // | /---\
9928 // ContBB |
9929 // | \---/
9930 // ExitBB
9931 BasicBlock *CurBB = Builder.GetInsertBlock();
9932 Instruction *CurBBTI = CurBB->getTerminator();
9933 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
9934 BasicBlock *ExitBB =
9935 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
9936 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
9937 X->getName() + ".atomic.cont");
9938 ContBB->getTerminator()->eraseFromParent();
9939 Builder.restoreIP(AllocaIP);
9940 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
9941 NewAtomicAddr->setName(X->getName() + "x.new.val");
9942 Builder.SetInsertPoint(ContBB);
9943 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
9944 PHI->addIncoming(OldVal, CurBB);
9945 bool IsIntTy = XElemTy->isIntegerTy();
9946 Value *OldExprVal = PHI;
9947 if (!IsIntTy) {
9948 if (XElemTy->isFloatingPointTy()) {
9949 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
9950 X->getName() + ".atomic.fltCast");
9951 } else {
9952 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
9953 X->getName() + ".atomic.ptrCast");
9954 }
9955 }
9956
9957 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
9958 if (!CBResult)
9959 return CBResult.takeError();
9960 Value *Upd = *CBResult;
9961 Builder.CreateStore(Upd, NewAtomicAddr);
9962 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
9963 AtomicOrdering Failure =
9964 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
9965 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
9966 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
9967 Result->setVolatile(VolatileX);
9968 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
9969 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
9970 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
9971 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
9972
9973 Res.first = OldExprVal;
9974 Res.second = Upd;
9975
9976 // set Insertion point in exit block
9977 if (UnreachableInst *ExitTI =
9978 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
9979 CurBBTI->eraseFromParent();
9980 Builder.SetInsertPoint(ExitBB);
9981 } else {
9982 Builder.SetInsertPoint(ExitTI);
9983 }
9984 }
9985
9986 return Res;
9987}
9988
9989OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
9990 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
9991 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
9992 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
9993 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
9994 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
9995 if (!updateToLocation(Loc))
9996 return Loc.IP;
9997
9998 LLVM_DEBUG({
9999 Type *XTy = X.Var->getType();
10000 assert(XTy->isPointerTy() &&
10001 "OMP Atomic expects a pointer to target memory");
10002 Type *XElemTy = X.ElemTy;
10003 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10004 XElemTy->isPointerTy()) &&
10005 "OMP atomic capture expected a scalar type");
10006 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10007 "OpenMP atomic does not support LT or GT operations");
10008 });
10009
10010 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10011 // 'x' is simply atomically rewritten with 'expr'.
10012 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10013 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10014 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10015 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10016 if (!AtomicResult)
10017 return AtomicResult.takeError();
10018 Value *CapturedVal =
10019 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
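// For example (illustrative): with a postfix capture `v = x++;` the value
// captured is the old x, while with a prefix capture `v = ++x;` it is the
// updated value.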
10020 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10021
10022 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10023 return Builder.saveIP();
10024}
10025
10026OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
10027 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
10028 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
10029 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10030 bool IsFailOnly) {
10031
10032 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10033 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10034 IsPostfixUpdate, IsFailOnly, Failure);
10035}
10036
10037OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
10038 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
10039 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
10040 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10041 bool IsFailOnly, AtomicOrdering Failure) {
10042
10043 if (!updateToLocation(Loc))
10044 return Loc.IP;
10045
10046 assert(X.Var->getType()->isPointerTy() &&
10047 "OMP atomic expects a pointer to target memory");
10048 // compare capture
10049 if (V.Var) {
10050 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10051 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10052 }
10053
10054 bool IsInteger = E->getType()->isIntegerTy();
10055
10056 if (Op == OMPAtomicCompareOp::EQ) {
10057 AtomicCmpXchgInst *Result = nullptr;
10058 if (!IsInteger) {
10059 IntegerType *IntCastTy =
10060 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10061 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10062 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10063 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10064 AO, Failure);
10065 } else {
10066 Result =
10067 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10068 }
10069
10070 if (V.Var) {
10071 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10072 if (!IsInteger)
10073 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10074 assert(OldValue->getType() == V.ElemTy &&
10075 "OldValue and V must be of same type");
10076 if (IsPostfixUpdate) {
10077 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10078 } else {
10079 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10080 if (IsFailOnly) {
10081 // CurBB----
10082 // | |
10083 // v |
10084 // ContBB |
10085 // | |
10086 // v |
10087 // ExitBB <-
10088 //
10089 // where ContBB only contains the store of old value to 'v'.
10090 BasicBlock *CurBB = Builder.GetInsertBlock();
10091 Instruction *CurBBTI = CurBB->getTerminator();
10092 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10093 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10094 CurBBTI, X.Var->getName() + ".atomic.exit");
10095 BasicBlock *ContBB = CurBB->splitBasicBlock(
10096 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10097 ContBB->getTerminator()->eraseFromParent();
10098 CurBB->getTerminator()->eraseFromParent();
10099
10100 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10101
10102 Builder.SetInsertPoint(ContBB);
10103 Builder.CreateStore(OldValue, V.Var);
10104 Builder.CreateBr(ExitBB);
10105
10106 if (UnreachableInst *ExitTI =
10107 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10108 CurBBTI->eraseFromParent();
10109 Builder.SetInsertPoint(ExitBB);
10110 } else {
10111 Builder.SetInsertPoint(ExitTI);
10112 }
10113 } else {
10114 Value *CapturedValue =
10115 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10116 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10117 }
10118 }
10119 }
10120 // The comparison result has to be stored.
10121 if (R.Var) {
10122 assert(R.Var->getType()->isPointerTy() &&
10123 "r.var must be of pointer type");
10124 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10125
10126 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10127 Value *ResultCast = R.IsSigned
10128 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10129 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10130 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10131 }
10132 } else {
10133 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10134 "Op should be either max or min at this point");
10135 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10136
10137 // Reverse the ordop, as the OpenMP forms differ from the LLVM forms.
10138 // Take max as an example.
10139 // OpenMP form:
10140 // x = x > expr ? expr : x;
10141 // LLVM form:
10142 // *ptr = *ptr > val ? *ptr : val;
10143 // We need to transform to LLVM form.
10144 // x = x <= expr ? x : expr;
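// For example (illustrative): in the IsXBinopExpr case with a signed integer
// x, the OpenMP max form `x = x > expr ? expr : x` lowers to an
// `atomicrmw min` via the mapping below.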
10145 AtomicRMWInst::BinOp NewOp;
10146 if (IsXBinopExpr) {
10147 if (IsInteger) {
10148 if (X.IsSigned)
10149 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10150 : AtomicRMWInst::Max;
10151 else
10152 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10153 : AtomicRMWInst::UMax;
10154 } else {
10155 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10156 : AtomicRMWInst::FMax;
10157 }
10158 } else {
10159 if (IsInteger) {
10160 if (X.IsSigned)
10161 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10162 : AtomicRMWInst::Min;
10163 else
10164 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10165 : AtomicRMWInst::UMin;
10166 } else {
10167 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10168 : AtomicRMWInst::FMin;
10169 }
10170 }
10171
10172 AtomicRMWInst *OldValue =
10173 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10174 if (V.Var) {
10175 Value *CapturedValue = nullptr;
10176 if (IsPostfixUpdate) {
10177 CapturedValue = OldValue;
10178 } else {
10179 CmpInst::Predicate Pred;
10180 switch (NewOp) {
10181 case AtomicRMWInst::Max:
10182 Pred = CmpInst::ICMP_SGT;
10183 break;
10184 case AtomicRMWInst::UMax:
10185 Pred = CmpInst::ICMP_UGT;
10186 break;
10187 case AtomicRMWInst::FMax:
10188 Pred = CmpInst::FCMP_OGT;
10189 break;
10190 case AtomicRMWInst::Min:
10191 Pred = CmpInst::ICMP_SLT;
10192 break;
10193 case AtomicRMWInst::UMin:
10194 Pred = CmpInst::ICMP_ULT;
10195 break;
10196 case AtomicRMWInst::FMin:
10197 Pred = CmpInst::FCMP_OLT;
10198 break;
10199 default:
10200 llvm_unreachable("unexpected comparison op");
10201 }
10202 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10203 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10204 }
10205 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10206 }
10207 }
10208
10209 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10210
10211 return Builder.saveIP();
10212}
10213
10214OpenMPIRBuilder::InsertPointOrErrorTy
10215OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
10216 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10217 Value *NumTeamsUpper, Value *ThreadLimit,
10218 Value *IfExpr) {
10219 if (!updateToLocation(Loc))
10220 return InsertPointTy();
10221
10222 uint32_t SrcLocStrSize;
10223 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10224 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10225 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10226
10227 // Outer allocation basicblock is the entry block of the current function.
10228 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10229 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10230 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10231 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10232 }
10233
10234 // The current basic block is split into four basic blocks. After outlining,
10235 // they will be mapped as follows:
10236 // ```
10237 // def current_fn() {
10238 // current_basic_block:
10239 // br label %teams.exit
10240 // teams.exit:
10241 // ; instructions after teams
10242 // }
10243 //
10244 // def outlined_fn() {
10245 // teams.alloca:
10246 // br label %teams.body
10247 // teams.body:
10248 // ; instructions within teams body
10249 // }
10250 // ```
10251 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10252 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10253 BasicBlock *AllocaBB =
10254 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10255
10256 bool SubClausesPresent =
10257 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10258 // Push num_teams
10259 if (!Config.isTargetDevice() && SubClausesPresent) {
10260 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10261 "if lowerbound is non-null, then upperbound must also be non-null "
10262 "for bounds on num_teams");
10263
10264 if (NumTeamsUpper == nullptr)
10265 NumTeamsUpper = Builder.getInt32(0);
10266
10267 if (NumTeamsLower == nullptr)
10268 NumTeamsLower = NumTeamsUpper;
10269
10270 if (IfExpr) {
10271 assert(IfExpr->getType()->isIntegerTy() &&
10272 "argument to if clause must be an integer value");
10273
10274 // upper = ifexpr ? upper : 1
10275 if (IfExpr->getType() != Int1)
10276 IfExpr = Builder.CreateICmpNE(IfExpr,
10277 ConstantInt::get(IfExpr->getType(), 0));
10278 NumTeamsUpper = Builder.CreateSelect(
10279 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10280
10281 // lower = ifexpr ? lower : 1
10282 NumTeamsLower = Builder.CreateSelect(
10283 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10284 }
10285
10286 if (ThreadLimit == nullptr)
10287 ThreadLimit = Builder.getInt32(0);
10288
10289 Value *ThreadNum = getOrCreateThreadID(Ident);
10290 createRuntimeFunctionCall(
10291 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10292 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
10293 }
10294 // Generate the body of teams.
10295 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10296 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10297 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10298 return Err;
10299
10300 OutlineInfo OI;
10301 OI.EntryBB = AllocaBB;
10302 OI.ExitBB = ExitBB;
10303 OI.OuterAllocaBB = &OuterAllocaBB;
10304
10305 // Insert fake values for global tid and bound tid.
10306 SmallVector<Instruction *, 3> ToBeDeleted;
10307 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10308 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10309 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10310 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10311 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10312
10313 auto HostPostOutlineCB = [this, Ident,
10314 ToBeDeleted](Function &OutlinedFn) mutable {
10315 // The stale call instruction will be replaced with a new call instruction
10316 // that invokes the teams runtime entry point with the outlined function.
10317
10318 assert(OutlinedFn.hasOneUse() &&
10319 "there must be a single user for the outlined function");
10320 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10321 ToBeDeleted.push_back(StaleCI);
10322
10323 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10324 "Outlined function must have two or three arguments only");
10325
10326 bool HasShared = OutlinedFn.arg_size() == 3;
10327
10328 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10329 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10330 if (HasShared)
10331 OutlinedFn.getArg(2)->setName("data");
10332
10333 // Call to the runtime function for teams in the current function.
10334 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10335 "outlined function.");
10336 Builder.SetInsertPoint(StaleCI);
10337 SmallVector<Value *> Args = {
10338 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10339 if (HasShared)
10340 Args.push_back(StaleCI->getArgOperand(2));
10341 createRuntimeFunctionCall(
10342 getOrCreateRuntimeFunctionPtr(
10343 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10344 Args);
10345
10346 for (Instruction *I : llvm::reverse(ToBeDeleted))
10347 I->eraseFromParent();
10348 };
10349
10350 if (!Config.isTargetDevice())
10351 OI.PostOutlineCB = HostPostOutlineCB;
10352
10353 addOutlineInfo(std::move(OI));
10354
10355 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10356
10357 return Builder.saveIP();
10358}
10359
10360OpenMPIRBuilder::InsertPointOrErrorTy
10361OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10362 InsertPointTy OuterAllocaIP,
10363 BodyGenCallbackTy BodyGenCB) {
10364 if (!updateToLocation(Loc))
10365 return InsertPointTy();
10366
10367 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10368
10369 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10370 BasicBlock *BodyBB =
10371 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10372 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10373 }
10374 BasicBlock *ExitBB =
10375 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10376 BasicBlock *BodyBB =
10377 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10378 BasicBlock *AllocaBB =
10379 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10380
10381 // Generate the body of the distribute region.
10382 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10383 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10384 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10385 return Err;
10386
10387 // When compiling for a target device we use different runtime functions,
10388 // which require a callback.
10389 if (Config.isTargetDevice()) {
10390 OutlineInfo OI;
10391 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10392 OI.EntryBB = AllocaBB;
10393 OI.ExitBB = ExitBB;
10394
10395 addOutlineInfo(std::move(OI));
10396 }
10397 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10398
10399 return Builder.saveIP();
10400}
10401
10402 GlobalVariable *
10403 OpenMPIRBuilder::createOffloadMapnames(SmallVectorImpl<llvm::Constant *> &Names,
10404 std::string VarName) {
10405 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10406 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10407 Names.size()),
10408 Names);
10409 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10410 M, MapNamesArrayInit->getType(),
10411 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10412 VarName);
10413 return MapNamesArrayGlobal;
10414}
10415
10416 // Create all simple and struct types exposed by the runtime and remember
10417 // their llvm::PointerTypes for easy access later.
10418void OpenMPIRBuilder::initializeTypes(Module &M) {
10419 LLVMContext &Ctx = M.getContext();
10420 StructType *T;
10421 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10422 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
10423#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10424#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10425 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10426 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10427#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10428 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10429 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
10430#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10431 T = StructType::getTypeByName(Ctx, StructName); \
10432 if (!T) \
10433 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10434 VarName = T; \
10435 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10436#include "llvm/Frontend/OpenMP/OMPKinds.def"
10437}
10438
10439void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10440 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10441 SmallVectorImpl<BasicBlock *> &BlockVector) {
10442 SmallVector<BasicBlock *, 4> Worklist;
10443 BlockSet.insert(EntryBB);
10444 BlockSet.insert(ExitBB);
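// Seeding ExitBB into the set up front stops the traversal below from
// walking past the region's exit block.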
10445
10446 Worklist.push_back(EntryBB);
10447 while (!Worklist.empty()) {
10448 BasicBlock *BB = Worklist.pop_back_val();
10449 BlockVector.push_back(BB);
10450 for (BasicBlock *SuccBB : successors(BB))
10451 if (BlockSet.insert(SuccBB).second)
10452 Worklist.push_back(SuccBB);
10453 }
10454}
10455
10456void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10457 uint64_t Size, int32_t Flags,
10458 GlobalValue::LinkageTypes,
10459 StringRef Name) {
10460 if (!Config.isGPU()) {
10461 llvm::offloading::emitOffloadingEntry(
10462 M, object::OffloadKind::OFK_OpenMP, ID,
10463 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10464 return;
10465 }
10466 // TODO: Add support for global variables on the device after declare target
10467 // support.
10468 Function *Fn = dyn_cast<Function>(Addr);
10469 if (!Fn)
10470 return;
10471
10472 // Add a function attribute for the kernel.
10473 Fn->addFnAttr("kernel");
10474 if (T.isAMDGCN())
10475 Fn->addFnAttr("uniform-work-group-size", "true");
10476 Fn->addFnAttr(Attribute::MustProgress);
10477}
10478
10479 // We only generate metadata for functions that contain target regions.
10480void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10481 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10482
10483 // If there are no entries, we don't need to do anything.
10484 if (OffloadInfoManager.empty())
10485 return;
10486
10487 LLVMContext &C = M.getContext();
10488 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10489 TargetRegionEntryInfo>,
10490 16>
10491 OrderedEntries(OffloadInfoManager.size());
10492
10493 // Auxiliary methods to create metadata values and strings.
10494 auto &&GetMDInt = [this](unsigned V) {
10495 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10496 };
10497
10498 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10499
10500 // Create the offloading info metadata node.
10501 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10502 auto &&TargetRegionMetadataEmitter =
10503 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10504 const TargetRegionEntryInfo &EntryInfo,
10505 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10506 // Generate metadata for target regions. Each entry of this metadata
10507 // contains:
10508 // - Entry 0 -> Kind of this type of metadata (0).
10509 // - Entry 1 -> Device ID of the file where the entry was identified.
10510 // - Entry 2 -> File ID of the file where the entry was identified.
10511 // - Entry 3 -> Mangled name of the function where the entry was
10512 // identified.
10513 // - Entry 4 -> Line in the file where the entry was identified.
10514 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10515 // - Entry 6 -> Order the entry was created.
10516 // The first element of the metadata node is the kind.
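// An example entry (values illustrative):
// !{i32 0, i32 46, i32 5, !"foo", i32 12, i32 0, i32 3}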
10517 Metadata *Ops[] = {
10518 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10519 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10520 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10521 GetMDInt(E.getOrder())};
10522
10523 // Save this entry in the right position of the ordered entries array.
10524 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10525
10526 // Add metadata to the named metadata node.
10527 MD->addOperand(MDNode::get(C, Ops));
10528 };
10529
10530 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10531
10532 // Create a function that emits metadata for each device global variable entry.
10533 auto &&DeviceGlobalVarMetadataEmitter =
10534 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10535 StringRef MangledName,
10536 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10537 // Generate metadata for global variables. Each entry of this metadata
10538 // contains:
10539 // - Entry 0 -> Kind of this type of metadata (1).
10540 // - Entry 1 -> Mangled name of the variable.
10541 // - Entry 2 -> Declare target kind.
10542 // - Entry 3 -> Order the entry was created.
10543 // The first element of the metadata node is the kind.
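// An example entry (values illustrative):
// !{i32 1, !"global_var", i32 0, i32 4}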
10544 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10545 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10546
10547 // Save this entry in the right position of the ordered entries array.
10548 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10549 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10550
10551 // Add metadata to the named metadata node.
10552 MD->addOperand(MDNode::get(C, Ops));
10553 };
10554
10555 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10556 DeviceGlobalVarMetadataEmitter);
10557
10558 for (const auto &E : OrderedEntries) {
10559 assert(E.first && "All ordered entries must exist!");
10560 if (const auto *CE =
10561 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10562 E.first)) {
10563 if (!CE->getID() || !CE->getAddress()) {
10564 // Do not blame the entry if the parent function is not emitted.
10565 TargetRegionEntryInfo EntryInfo = E.second;
10566 StringRef FnName = EntryInfo.ParentName;
10567 if (!M.getNamedValue(FnName))
10568 continue;
10569 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10570 continue;
10571 }
10572 createOffloadEntry(CE->getID(), CE->getAddress(),
10573 /*Size=*/0, CE->getFlags(),
10574 GlobalValue::WeakAnyLinkage);
10575 } else if (const auto *CE = dyn_cast<
10576 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10577 E.first)) {
10578 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10579 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10580 CE->getFlags());
10581 switch (Flags) {
10582 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10583 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10584 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10585 continue;
10586 if (!CE->getAddress()) {
10587 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10588 continue;
10589 }
10590 // The variable has no definition; no need to add the entry.
10591 if (CE->getVarSize() == 0)
10592 continue;
10593 break;
10594 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
10595 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
10596 (!Config.isTargetDevice() && CE->getAddress())) &&
10597 "Declaret target link address is set.");
10598 if (Config.isTargetDevice())
10599 continue;
10600 if (!CE->getAddress()) {
10601 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
10602 continue;
10603 }
10604 break;
10605 default:
10606 break;
10607 }
10608
10609 // Hidden or internal symbols on the device are not externally visible.
10610 // We should not attempt to register them by creating an offloading
10611 // entry. Indirect variables are handled separately on the device.
10612 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
10613 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
10614 Flags != OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10615 continue;
10616
10617 // Indirect globals need to use a special name that doesn't match the name
10618 // of the associated host global.
10619 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
10620 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10621 Flags, CE->getLinkage(), CE->getVarName());
10622 else
10623 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
10624 Flags, CE->getLinkage());
10625
10626 } else {
10627 llvm_unreachable("Unsupported entry kind.");
10628 }
10629 }
10630
10631 // Emit requires directive globals to a special entry so the runtime can
10632 // register them when the device image is loaded.
10633 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
10634 // entries should be redesigned to better suit this use-case.
10635 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
10639 ".requires", /*Size=*/0,
10640 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
10641 Config.getRequiresFlags());
10642}
10643
10644void TargetRegionEntryInfo::getTargetRegionEntryFnName(
10645 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
10646 unsigned FileID, unsigned Line, unsigned Count) {
10647 raw_svector_ostream OS(Name);
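// Produces names of the form
// <KernelNamePrefix><DeviceID-hex>_<FileID-hex>_<ParentName>_l<Line>[_<Count>],
// e.g. (illustrative, assuming the "__omp_offloading_" prefix):
// "__omp_offloading_10_2f_foo_l42".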
10648 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
10649 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
10650 if (Count)
10651 OS << "_" << Count;
10652}
10653
10654void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
10655 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
10656 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
10657 TargetRegionEntryInfo::getTargetRegionEntryFnName(
10658 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
10659 EntryInfo.Line, NewCount);
10660}
10661
10662TargetRegionEntryInfo
10663OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
10664 vfs::FileSystem &VFS,
10665 StringRef ParentName) {
10666 sys::fs::UniqueID ID(0xdeadf17e, 0);
10667 auto FileIDInfo = CallBack();
10668 uint64_t FileID = 0;
10669 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
10670 ID = Status->getUniqueID();
10671 FileID = Status->getUniqueID().getFile();
10672 } else {
10673 // If the inode ID could not be determined, create a hash value of
10674 // the current file name and use that as an ID.
10675 FileID = hash_value(std::get<0>(FileIDInfo));
10676 }
10677
10678 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
10679 std::get<1>(FileIDInfo));
10680}
10681
10682unsigned OpenMPIRBuilder::getFlagMemberOffset() {
10683 unsigned Offset = 0;
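// Counts the trailing zero bits of the MEMBER_OF mask; e.g. (illustrative)
// a mask of 0xFFFF000000000000 yields an offset of 48.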
10684 for (uint64_t Remain =
10685 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10686 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10687 !(Remain & 1); Remain = Remain >> 1)
10688 Offset++;
10689 return Offset;
10690}
10691
10692 omp::OpenMPOffloadMappingFlags
10693 OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
10694 // Rotate by getFlagMemberOffset() bits.
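// E.g. (illustrative, assuming a 48-bit offset): Position 2 becomes
// (2 + 1) << 48, i.e. member index 3 in the MEMBER_OF field.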
10695 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
10696 << getFlagMemberOffset());
10697}
10698
10699void OpenMPIRBuilder::setCorrectMemberOfFlag(
10700 omp::OpenMPOffloadMappingFlags &Flags,
10701 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
10702 // If the entry is PTR_AND_OBJ but has not been marked with the special
10703 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
10704 // marked as MEMBER_OF.
10705 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10706 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) &&
10707 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
10708 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF) !=
10709 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
10710 return;
10711
10712 // Reset the placeholder value to prepare the flag for the assignment of the
10713 // proper MEMBER_OF value.
10714 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
10715 Flags |= MemberOfFlag;
10716}
10717
10718Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
10719 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10720 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10721 bool IsDeclaration, bool IsExternallyVisible,
10722 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10723 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10724 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
10725 std::function<Constant *()> GlobalInitializer,
10726 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
10727 // TODO: convert this to utilise the IRBuilder Config rather than
10728 // a passed down argument.
10729 if (OpenMPSIMD)
10730 return nullptr;
10731
10732 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
10733 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10734 CaptureClause ==
10735 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10736 Config.hasRequiresUnifiedSharedMemory())) {
10737 SmallString<64> PtrName;
10738 {
10739 raw_svector_ostream OS(PtrName);
10740 OS << MangledName;
10741 if (!IsExternallyVisible)
10742 OS << format("_%x", EntryInfo.FileID);
10743 OS << "_decl_tgt_ref_ptr";
10744 }
10745
10746 Value *Ptr = M.getNamedValue(PtrName);
10747
10748 if (!Ptr) {
10749 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
10750 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
10751
10752 auto *GV = cast<GlobalVariable>(Ptr);
10753 GV->setLinkage(GlobalValue::WeakAnyLinkage);
10754
10755 if (!Config.isTargetDevice()) {
10756 if (GlobalInitializer)
10757 GV->setInitializer(GlobalInitializer());
10758 else
10759 GV->setInitializer(GlobalValue);
10760 }
10761
10762 registerTargetGlobalVariable(
10763 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10764 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10765 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
10766 }
10767
10768 return cast<Constant>(Ptr);
10769 }
10770
10771 return nullptr;
10772}
10773
10774void OpenMPIRBuilder::registerTargetGlobalVariable(
10775 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
10776 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
10777 bool IsDeclaration, bool IsExternallyVisible,
10778 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
10779 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
10780 std::vector<Triple> TargetTriple,
10781 std::function<Constant *()> GlobalInitializer,
10782 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
10783 Constant *Addr) {
10784 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
10785 (TargetTriple.empty() && !Config.isTargetDevice()))
10786 return;
10787
10788 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
10789 StringRef VarName;
10790 int64_t VarSize;
10791 GlobalValue::LinkageTypes Linkage;
10792
10793 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
10794 CaptureClause ==
10795 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
10796 !Config.hasRequiresUnifiedSharedMemory()) {
10797 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10798 VarName = MangledName;
10799 GlobalValue *LlvmVal = M.getNamedValue(VarName);
10800
10801 if (!IsDeclaration)
10802 VarSize = divideCeil(
10803 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
10804 else
10805 VarSize = 0;
10806 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
10807
10808 // This is a workaround carried over from Clang which prevents undesired
10809 // optimisation of internal variables.
10810 if (Config.isTargetDevice() &&
10811 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
10812 // Do not create a "ref-variable" if the original is not also available
10813 // on the host.
10814 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
10815 return;
10816
10817 std::string RefName = createPlatformSpecificName({VarName, "ref"});
10818
10819 if (!M.getNamedValue(RefName)) {
10820 Constant *AddrRef =
10821 getOrCreateInternalVariable(Addr->getType(), RefName);
10822 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
10823 GvAddrRef->setConstant(true);
10824 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
10825 GvAddrRef->setInitializer(Addr);
10826 GeneratedRefs.push_back(GvAddrRef);
10827 }
10828 }
10829 } else {
10830 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
10831 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
10832 else
10833 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
10834
10835 if (Config.isTargetDevice()) {
10836 VarName = (Addr) ? Addr->getName() : "";
10837 Addr = nullptr;
10838 } else {
10839 Addr = getAddrOfDeclareTargetVar(
10840 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
10841 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
10842 LlvmPtrTy, GlobalInitializer, VariableLinkage);
10843 VarName = (Addr) ? Addr->getName() : "";
10844 }
10845 VarSize = M.getDataLayout().getPointerSize();
10846 Linkage = GlobalValue::WeakAnyLinkage;
10847 }
10848
10849 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
10850 Flags, Linkage);
10851}
10852
10853/// Loads all the offload entries information from the host IR
10854/// metadata.
10855void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
10856 // If we are in target mode, load the metadata from the host IR. This code has
10857 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
10858
10859 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
10860 if (!MD)
10861 return;
10862
10863 for (MDNode *MN : MD->operands()) {
10864 auto &&GetMDInt = [MN](unsigned Idx) {
10865 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
10866 return cast<ConstantInt>(V->getValue())->getZExtValue();
10867 };
10868
10869 auto &&GetMDString = [MN](unsigned Idx) {
10870 auto *V = cast<MDString>(MN->getOperand(Idx));
10871 return V->getString();
10872 };
10873
10874 switch (GetMDInt(0)) {
10875 default:
10876 llvm_unreachable("Unexpected metadata!");
10877 break;
10878 case OffloadEntriesInfoManager::OffloadEntryInfo::
10879 OffloadingEntryInfoTargetRegion: {
10880 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
10881 /*DeviceID=*/GetMDInt(1),
10882 /*FileID=*/GetMDInt(2),
10883 /*Line=*/GetMDInt(4),
10884 /*Count=*/GetMDInt(5));
10885 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
10886 /*Order=*/GetMDInt(6));
10887 break;
10888 }
10889 case OffloadEntriesInfoManager::OffloadEntryInfo::
10890 OffloadingEntryInfoDeviceGlobalVar:
10891 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
10892 /*MangledName=*/GetMDString(1),
10893 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10894 /*Flags=*/GetMDInt(2)),
10895 /*Order=*/GetMDInt(3));
10896 break;
10897 }
10898 }
10899}
10900
10901void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
10902 StringRef HostFilePath) {
10903 if (HostFilePath.empty())
10904 return;
10905
10906 auto Buf = VFS.getBufferForFile(HostFilePath);
10907 if (std::error_code Err = Buf.getError()) {
10908 report_fatal_error(("error opening host file from host file path inside of "
10909 "OpenMPIRBuilder: " +
10910 Err.message())
10911 .c_str());
10912 }
10913
10914 LLVMContext Ctx;
10915 auto M = expectedToErrorOrAndEmitErrors(
10916 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
10917 if (std::error_code Err = M.getError()) {
10919 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
10920 .c_str());
10921 }
10922
10923 loadOffloadInfoMetadata(*M.get());
10924}
10925
10926//===----------------------------------------------------------------------===//
10927// OffloadEntriesInfoManager
10928//===----------------------------------------------------------------------===//
10929
10930bool OffloadEntriesInfoManager::empty() const {
10931 return OffloadEntriesTargetRegion.empty() &&
10932 OffloadEntriesDeviceGlobalVar.empty();
10933}
10934
10935unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
10936 const TargetRegionEntryInfo &EntryInfo) const {
10937 auto It = OffloadEntriesTargetRegionCount.find(
10938 getTargetRegionEntryCountKey(EntryInfo));
10939 if (It == OffloadEntriesTargetRegionCount.end())
10940 return 0;
10941 return It->second;
10942}
10943
10944void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
10945 const TargetRegionEntryInfo &EntryInfo) {
10946 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
10947 EntryInfo.Count + 1;
10948}
10949
10950/// Initialize target region entry.
10951void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
10952 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
10953 OffloadEntriesTargetRegion[EntryInfo] =
10954 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
10955 OMPTargetRegionEntryTargetRegion);
10956 ++OffloadingEntriesNum;
10957}
10958
10959void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
10960 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
10961 OMPTargetRegionEntryKind Flags) {
10962 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
10963
10964 // Update the EntryInfo with the next available count for this location.
10965 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10966
10967 // If we are emitting code for a target, the entry is already initialized,
10968 // only has to be registered.
10969 if (OMPBuilder->Config.isTargetDevice()) {
10970 // This could happen if the device compilation is invoked standalone.
10971 if (!hasTargetRegionEntryInfo(EntryInfo)) {
10972 return;
10973 }
10974 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
10975 Entry.setAddress(Addr);
10976 Entry.setID(ID);
10977 Entry.setFlags(Flags);
10978 } else {
10979 if (Flags == OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion &&
10980 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
10981 return;
10982 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
10983 "Target region entry already registered!");
10984 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
10985 OffloadEntriesTargetRegion[EntryInfo] = Entry;
10986 ++OffloadingEntriesNum;
10987 }
10988 incrementTargetRegionEntryInfoCount(EntryInfo);
10989}
10990
10991bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
10992 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
10993
10994 // Update the EntryInfo with the next available count for this location.
10995 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
10996
10997 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
10998 if (It == OffloadEntriesTargetRegion.end()) {
10999 return false;
11000 }
11001 // Fail if this entry is already registered.
11002 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11003 return false;
11004 return true;
11005}
11006
11007void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
11008 const OffloadTargetRegionEntryInfoActTy &Action) {
11009 // Scan all target region entries and perform the provided action.
11010 for (const auto &It : OffloadEntriesTargetRegion) {
11011 Action(It.first, It.second);
11012 }
11013}
11014
11015void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
11016 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11017 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11018 ++OffloadingEntriesNum;
11019}
11020
11021void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
11022 StringRef VarName, Constant *Addr, int64_t VarSize,
11023 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
11024 if (OMPBuilder->Config.isTargetDevice()) {
11025 // This could happen if the device compilation is invoked standalone.
11026 if (!hasDeviceGlobalVarEntryInfo(VarName))
11027 return;
11028 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11029 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11030 if (Entry.getVarSize() == 0) {
11031 Entry.setVarSize(VarSize);
11032 Entry.setLinkage(Linkage);
11033 }
11034 return;
11035 }
11036 Entry.setVarSize(VarSize);
11037 Entry.setLinkage(Linkage);
11038 Entry.setAddress(Addr);
11039 } else {
11040 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11041 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11042 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11043 "Entry not initialized!");
11044 if (Entry.getVarSize() == 0) {
11045 Entry.setVarSize(VarSize);
11046 Entry.setLinkage(Linkage);
11047 }
11048 return;
11049 }
11050 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect)
11051 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11052 Addr, VarSize, Flags, Linkage,
11053 VarName.str());
11054 else
11055 OffloadEntriesDeviceGlobalVar.try_emplace(
11056 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
11057 ++OffloadingEntriesNum;
11058 }
11059}
11060
11061void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
11062 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
11063 // Scan all device global variable entries and perform the provided action.
11064 for (const auto &E : OffloadEntriesDeviceGlobalVar)
11065 Action(E.getKey(), E.getValue());
11066}
11067
11068//===----------------------------------------------------------------------===//
11069// CanonicalLoopInfo
11070//===----------------------------------------------------------------------===//
11071
11072void CanonicalLoopInfo::collectControlBlocks(
11073 SmallVectorImpl<BasicBlock *> &BBs) {
11074 // We only count those BBs as control block for which we do not need to
11075 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
11076 // flow. For consistency, this also means we do not add the Body block, which
11077 // is just the entry to the body code.
11078 BBs.reserve(BBs.size() + 6);
11079 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
11080}
11081
11082BasicBlock *CanonicalLoopInfo::getPreheader() const {
11083 assert(isValid() && "Requires a valid canonical loop");
11084 for (BasicBlock *Pred : predecessors(Header)) {
11085 if (Pred != Latch)
11086 return Pred;
11087 }
11088 llvm_unreachable("Missing preheader");
11089}
11090
11091void CanonicalLoopInfo::setTripCount(Value *TripCount) {
11092 assert(isValid() && "Requires a valid canonical loop");
11093
11094 Instruction *CmpI = &getCond()->front();
11095 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
11096 CmpI->setOperand(1, TripCount);
11097
11098#ifndef NDEBUG
11099 assertOK();
11100#endif
11101}
11102
11103void CanonicalLoopInfo::mapIndVar(
11104 llvm::function_ref<Value *(Instruction *)> Updater) {
11105 assert(isValid() && "Requires a valid canonical loop");
11106
11107 Instruction *OldIV = getIndVar();
11108
11109 // Record all uses excluding those introduced by the updater. Uses by the
11110 // CanonicalLoopInfo itself to keep track of the number of iterations are
11111 // excluded.
11112 SmallVector<Use *> ReplacableUses;
11113 for (Use &U : OldIV->uses()) {
11114 auto *User = dyn_cast<Instruction>(U.getUser());
11115 if (!User)
11116 continue;
11117 if (User->getParent() == getCond())
11118 continue;
11119 if (User->getParent() == getLatch())
11120 continue;
11121 ReplacableUses.push_back(&U);
11122 }
11123
11124 // Run the updater that may introduce new uses
11125 Value *NewIV = Updater(OldIV);
11126
11127 // Replace the old uses with the value returned by the updater.
11128 for (Use *U : ReplacableUses)
11129 U->set(NewIV);
11130
11131#ifndef NDEBUG
11132 assertOK();
11133#endif
11134}
11135
11136void CanonicalLoopInfo::assertOK() const {
11137#ifndef NDEBUG
11138 // No constraints if this object currently does not describe a loop.
11139 if (!isValid())
11140 return;
11141
11142 BasicBlock *Preheader = getPreheader();
11143 BasicBlock *Body = getBody();
11144 BasicBlock *After = getAfter();
11145
11146 // Verify standard control-flow we use for OpenMP loops.
11147 assert(Preheader);
11148 assert(isa<BranchInst>(Preheader->getTerminator()) &&
11149 "Preheader must terminate with unconditional branch");
11150 assert(Preheader->getSingleSuccessor() == Header &&
11151 "Preheader must jump to header");
11152
11153 assert(Header);
11154 assert(isa<BranchInst>(Header->getTerminator()) &&
11155 "Header must terminate with unconditional branch");
11156 assert(Header->getSingleSuccessor() == Cond &&
11157 "Header must jump to exiting block");
11158
11159 assert(Cond);
11160 assert(Cond->getSinglePredecessor() == Header &&
11161 "Exiting block only reachable from header");
11162
11163 assert(isa<BranchInst>(Cond->getTerminator()) &&
11164 "Exiting block must terminate with conditional branch");
11165 assert(size(successors(Cond)) == 2 &&
11166 "Exiting block must have two successors");
11167 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
11168 "Exiting block's first successor jump to the body");
11169 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
11170 "Exiting block's second successor must exit the loop");
11171
11172 assert(Body);
11173 assert(Body->getSinglePredecessor() == Cond &&
11174 "Body only reachable from exiting block");
11175 assert(!isa<PHINode>(Body->front()));
11176
11177 assert(Latch);
11179 "Latch must terminate with unconditional branch");
11180 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
11181 // TODO: To support simple redirecting of the end of the body code that has
11182 // multiple predecessors, introduce another auxiliary basic block, like the preheader and after blocks.
11183 assert(Latch->getSinglePredecessor() != nullptr);
11184 assert(!isa<PHINode>(Latch->front()));
11185
11186 assert(Exit);
11187 assert(isa<BranchInst>(Exit->getTerminator()) &&
11188 "Exit block must terminate with unconditional branch");
11189 assert(Exit->getSingleSuccessor() == After &&
11190 "Exit block must jump to after block");
11191
11192 assert(After);
11193 assert(After->getSinglePredecessor() == Exit &&
11194 "After block only reachable from exit block");
11195 assert(After->empty() || !isa<PHINode>(After->front()));
11196
11197 Instruction *IndVar = getIndVar();
11198 assert(IndVar && "Canonical induction variable not found?");
11199 assert(isa<IntegerType>(IndVar->getType()) &&
11200 "Induction variable must be an integer");
11201 assert(cast<PHINode>(IndVar)->getParent() == Header &&
11202 "Induction variable must be a PHI in the loop header");
11203 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
11204 assert(
11205 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
11206 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
11207
11208 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
11209 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
11210 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
11211 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
11212 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
11213 ->isOne());
11214
11215 Value *TripCount = getTripCount();
11216 assert(TripCount && "Loop trip count not found?");
11217 assert(IndVar->getType() == TripCount->getType() &&
11218 "Trip count and induction variable must have the same type");
11219
11220 auto *CmpI = cast<CmpInst>(&Cond->front());
11221 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
11222 "Exit condition must be a signed less-than comparison");
11223 assert(CmpI->getOperand(0) == IndVar &&
11224 "Exit condition must compare the induction variable");
11225 assert(CmpI->getOperand(1) == TripCount &&
11226 "Exit condition must compare with the trip count");
11227#endif
11228}
11229
11230void CanonicalLoopInfo::invalidate() {
11231 Header = nullptr;
11232 Cond = nullptr;
11233 Latch = nullptr;
11234 Exit = nullptr;
11235}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
DXIL Finalize Linkage
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
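A sketch of building such a TargetMachine from a function's module, assuming the lookupTarget and createTargetMachine signatures listed further down this page; error handling is reduced to returning null, and the helper name is made up.
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Module.h"
  #include "llvm/MC/TargetRegistry.h"
  #include "llvm/Support/CodeGen.h"
  #include "llvm/Target/TargetMachine.h"
  #include "llvm/Target/TargetOptions.h"
  #include <memory>
  #include <optional>
  #include <string>
  std::unique_ptr<llvm::TargetMachine>
  makeTargetMachine(llvm::Function *F, llvm::CodeGenOptLevel OptLevel) {
    llvm::Module &M = *F->getParent();
    std::string Err;
    const llvm::Target *T =
        llvm::TargetRegistry::lookupTarget(M.getTargetTriple().str(), Err);
    if (!T)
      return nullptr; // Target not registered, e.g. backend not linked in.
    return std::unique_ptr<llvm::TargetMachine>(T->createTargetMachine(
        M.getTargetTriple(), /*CPU=*/"", /*Features=*/"",
        llvm::TargetOptions(), /*RM=*/std::nullopt, /*CM=*/std::nullopt,
        OptLevel));
  }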
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is a wrapper over IRBuilderBase::restoreIP that also restores the current debug location to one consistent with the restored insertion point.
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing shared data.
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
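A sketch of the pattern this helper encapsulates: loop properties live on a distinct, self-referential MDNode (the loop ID) attached as llvm.loop metadata to the loop latch's terminator. The helper name below is hypothetical.
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Metadata.h"
  #include "llvm/IR/Type.h"
  void attachUnrollCount(llvm::Instruction *LatchTerm, unsigned Count) {
    llvm::LLVMContext &Ctx = LatchTerm->getContext();
    llvm::Metadata *Prop[] = {
        llvm::MDString::get(Ctx, "llvm.loop.unroll.count"),
        llvm::ConstantAsMetadata::get(
            llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Count))};
    // First operand is reserved for the self-reference forming the loop ID.
    llvm::Metadata *Ops[] = {nullptr, llvm::MDNode::get(Ctx, Prop)};
    llvm::MDNode *LoopID = llvm::MDNode::getDistinct(Ctx, Ops);
    LoopID->replaceOperandWith(0, LoopID);
    LatchTerm->setMetadata(llvm::LLVMContext::MD_loop, LoopID);
  }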
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable from the function.
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory), i.e. a start pointer and a length.
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location, and, if it is, stores a new value there.
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
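A sketch of the usual pairing: derive the failure ordering from the requested success ordering when emitting a compare-exchange. The helper name is illustrative.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  llvm::Value *emitCmpXchg(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                           llvm::Value *Expected, llvm::Value *Desired,
                           llvm::AtomicOrdering Success) {
    llvm::AtomicOrdering Failure =
        llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(Success);
    // Yields a {old value, success flag} struct value.
    return B.CreateAtomicCmpXchg(Ptr, Expected, Desired, llvm::MaybeAlign(),
                                 Success, Failure);
  }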
an instruction that atomically reads a memory location, combines it with another value, and then stores the result back, returning the old value
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
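A sketch of emitting one of the operations above through IRBuilder, here a monotonic fetch-add; the helper name is illustrative.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Instructions.h"
  llvm::Value *emitFetchAdd(llvm::IRBuilder<> &B, llvm::Value *Ptr,
                            llvm::Value *Increment) {
    // *Ptr = *Ptr + Increment, atomically; returns the old value.
    return B.CreateAtomicRMW(llvm::AtomicRMWInst::Add, Ptr, Increment,
                             llvm::MaybeAlign(),
                             llvm::AtomicOrdering::Monotonic);
  }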
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic block Old.
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic, or any pseudo operation if SkipPseudoOp is true.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
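A sketch of splitBasicBlock in use: split in front of an instruction so new control flow can be stitched in at the seam. The helper name is illustrative.
  #include "llvm/ADT/Twine.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  llvm::BasicBlock *splitAt(llvm::Instruction *I, const llvm::Twine &Name) {
    llvm::BasicBlock *BB = I->getParent();
    // Everything from I to the end of BB moves into the returned block;
    // BB is re-terminated with an unconditional branch to it.
    return BB->splitBasicBlock(I->getIterator(), Name);
  }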
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives in, right before MovePos.
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed, or null if it is not.
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI const Module * getModule() const
Return the module owning the function this basic block belongs to, or nullptr if the function does not have a module.
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signature does not match the call signature.
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching the ArrayRef passed in.
Definition Constants.h:720
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a target-independent way.
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:136
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:63
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:557
Record of a variable value-assignment, aka a non-instruction representation of the dbg.value intrinsic.
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single entity.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
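A sketch of the common declaration pattern built on FunctionType::get: assemble the type, then ask the module for the callee. The runtime function name below is made up.
  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Module.h"
  llvm::FunctionCallee declareHelper(llvm::Module &M) {
    llvm::LLVMContext &Ctx = M.getContext();
    llvm::FunctionType *FTy = llvm::FunctionType::get(
        llvm::Type::getVoidTy(Ctx),
        {llvm::PointerType::getUnqual(Ctx), llvm::Type::getInt32Ty(Ctx)},
        /*isVarArg=*/false);
    // Creates the declaration on first use, reuses it afterwards.
    return M.getOrInsertFunction("__example_runtime_helper", FTy);
  }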
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:640
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:447
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:668
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
LLVM_ABI void setInitializer(Constant *InitVal)
setInitializer - Sets the initializer for this global variable, removing any existing initializer if InitVal is null.
Definition Globals.cpp:524
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Definition IRBuilder.h:2788
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the debug location of the next non-debug node.
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not have a module.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ordering of instructions.
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
const Triple & getTargetTriple() const
Get the target triple which is a string describing the target host.
Definition Module.h:281
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is small.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across all SmallPtrSet instances.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less than N).
Definition SmallSet.h:133
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+, etc).
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
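A sketch of addCase in use, dispatching an i32 selector over a list of destination blocks; the helper name is illustrative.
  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/IRBuilder.h"
  llvm::SwitchInst *emitDispatch(llvm::IRBuilder<> &B, llvm::Value *Selector,
                                 llvm::BasicBlock *DefaultDest,
                                 llvm::ArrayRef<llvm::BasicBlock *> Dests) {
    // Selector is assumed to be an i32 value.
    llvm::SwitchInst *SI = B.CreateSwitch(Selector, DefaultDest, Dests.size());
    for (unsigned I = 0, E = Dests.size(); I != E; ++I)
      SI->addCase(B.getInt32(I), Dests[I]); // Case value I jumps to Dests[I].
    return SI;
  }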
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1064
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1126
ArchType getArch() const
Get the parsed architecture type of this triple.
Definition Triple.h:413
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1142
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:237
Value * getOperand(unsigned i) const
Definition User.h:232
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:390
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:546
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:956
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to New if the callback ShouldReplace returns true for that use.
Definition Value.cpp:554
LLVM_ABI User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one; that user can have multiple uses of this value.
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:701
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to a SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
@ OMP_DEVICEID_UNDEF
Device ID used if the device was not defined; the runtime should get it from environment variables as described in the spec.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp.h).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular function.
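A sketch of the usual pairing with remapInstructionsInBlocks (listed further below): clone the block into a function, then rewrite operands to use the cloned values recorded in the value map. The helper name is illustrative.
  #include "llvm/Transforms/Utils/Cloning.h"
  #include "llvm/Transforms/Utils/ValueMapper.h"
  llvm::BasicBlock *cloneInto(llvm::BasicBlock *BB, llvm::Function *F) {
    llvm::ValueToValueMapTy VMap;
    llvm::BasicBlock *Copy = llvm::CloneBasicBlock(BB, VMap, ".clone", F);
    // Instructions in Copy still reference values from BB; remap them.
    llvm::remapInstructionsInBlocks({Copy}, VMap);
    return Copy;
  }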
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), where A is the 0-based index and B, C, ... are the corresponding values from the input ranges.
Definition STLExtras.h:2484
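A small sketch of enumerate and zip_equal side by side, the two idioms used to walk parallel arrays; the function and its inputs are illustrative.
  #include "llvm/ADT/STLExtras.h"
  #include "llvm/ADT/SmallVector.h"
  int checksum(const llvm::SmallVectorImpl<int> &A,
               const llvm::SmallVectorImpl<int> &B) {
    int Sum = 0;
    for (auto [Idx, V] : llvm::enumerate(A)) // Idx is the 0-based index.
      Sum += int(Idx) * V;
    for (auto [X, Y] : llvm::zip_equal(A, B)) // Asserts equal lengths.
      Sum += X - Y;
    return Sum;
  }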
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
succ_range successors(BasicBlock *BB)
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2148
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:632
std::string utostr(uint64_t X, bool isNeg=false)
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
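A sketch of SplitBlockAndInsertIfThenElse in use, the usual shape for emitting a two-way guard around the current insertion point; the helper name is illustrative.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  void emitGuard(llvm::IRBuilder<> &B, llvm::Value *Cond) {
    // The builder must point at the instruction in front of which to split.
    llvm::Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
    llvm::SplitBlockAndInsertIfThenElse(Cond, B.GetInsertPoint(), &ThenTerm,
                                        &ElseTerm);
    B.SetInsertPoint(ThenTerm);
    // ... emit the 'then' path here ...
    B.SetInsertPoint(ElseTerm);
    // ... emit the 'else' path here ...
  }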
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user-specified parameters.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
pred_range predecessors(BasicBlock *BB)
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks BBs from their containing function.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0, the radix is autosensed from the string prefix.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.