OMPIRBuilder.cpp
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
65
66#include <cstdint>
67#include <optional>
68
69#define DEBUG_TYPE "openmp-ir-builder"
70
71using namespace llvm;
72using namespace omp;
73
74static cl::opt<bool>
75 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
76 cl::desc("Use optimistic attributes describing "
77 "'as-if' properties of runtime calls."),
78 cl::init(false));
79
81 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
82 cl::desc("Factor for the unroll threshold to account for code "
83 "simplifications still taking place"),
84 cl::init(1.5));
85
86#ifndef NDEBUG
87/// Return whether IP1 and IP2 are ambiguous, i.e. inserting instructions at
88/// position IP1 may change the meaning of IP2 or vice versa. This is because
89/// an InsertPoint stores the instruction before which something is inserted.
90/// For instance, if both point to the same instruction, two IRBuilders
91/// alternately creating instructions will interleave their output.
92static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
93 IRBuilder<>::InsertPoint IP2) {
94 if (!IP1.isSet() || !IP2.isSet())
95 return false;
96 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
97}
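// Illustrative example (BB, It, X, Y are hypothetical values in the caller):
// two builders that share one insertion point are ambiguous in exactly the
// sense checked here.
//
//   IRBuilder<> B1(BB, It), B2(BB, It);
//   assert(isConflictIP(B1.saveIP(), B2.saveIP()));
//   B1.CreateAdd(X, Y); // Inserted before It.
//   B2.CreateMul(X, Y); // Also inserted before It: the two streams interleave.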
98
99static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
100 // Valid ordered/unordered and base algorithm combinations.
101 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
102 case OMPScheduleType::UnorderedStaticChunked:
103 case OMPScheduleType::UnorderedStatic:
104 case OMPScheduleType::UnorderedDynamicChunked:
105 case OMPScheduleType::UnorderedGuidedChunked:
106 case OMPScheduleType::UnorderedRuntime:
107 case OMPScheduleType::UnorderedAuto:
108 case OMPScheduleType::UnorderedTrapezoidal:
109 case OMPScheduleType::UnorderedGreedy:
110 case OMPScheduleType::UnorderedBalanced:
111 case OMPScheduleType::UnorderedGuidedIterativeChunked:
112 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
113 case OMPScheduleType::UnorderedSteal:
114 case OMPScheduleType::UnorderedStaticBalancedChunked:
115 case OMPScheduleType::UnorderedGuidedSimd:
116 case OMPScheduleType::UnorderedRuntimeSimd:
117 case OMPScheduleType::OrderedStaticChunked:
118 case OMPScheduleType::OrderedStatic:
119 case OMPScheduleType::OrderedDynamicChunked:
120 case OMPScheduleType::OrderedGuidedChunked:
121 case OMPScheduleType::OrderedRuntime:
122 case OMPScheduleType::OrderedAuto:
123 case OMPScheduleType::OrderedTrapezoidal:
124 case OMPScheduleType::NomergeUnorderedStaticChunked:
125 case OMPScheduleType::NomergeUnorderedStatic:
126 case OMPScheduleType::NomergeUnorderedDynamicChunked:
127 case OMPScheduleType::NomergeUnorderedGuidedChunked:
128 case OMPScheduleType::NomergeUnorderedRuntime:
129 case OMPScheduleType::NomergeUnorderedAuto:
130 case OMPScheduleType::NomergeUnorderedTrapezoidal:
131 case OMPScheduleType::NomergeUnorderedGreedy:
132 case OMPScheduleType::NomergeUnorderedBalanced:
133 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
134 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
135 case OMPScheduleType::NomergeUnorderedSteal:
136 case OMPScheduleType::NomergeOrderedStaticChunked:
137 case OMPScheduleType::NomergeOrderedStatic:
138 case OMPScheduleType::NomergeOrderedDynamicChunked:
139 case OMPScheduleType::NomergeOrderedGuidedChunked:
140 case OMPScheduleType::NomergeOrderedRuntime:
141 case OMPScheduleType::NomergeOrderedAuto:
142 case OMPScheduleType::NomergeOrderedTrapezoidal:
143 case OMPScheduleType::OrderedDistributeChunked:
144 case OMPScheduleType::OrderedDistribute:
145 break;
146 default:
147 return false;
148 }
149
150 // Must not set both monotonicity modifiers at the same time.
151 OMPScheduleType MonotonicityFlags =
152 SchedType & OMPScheduleType::MonotonicityMask;
153 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
154 return false;
155
156 return true;
157}
158#endif
159
160/// This is a wrapper over IRBuilderBase::restoreIP that also restores the current
161/// debug location to the last instruction in the specified basic block if the
162/// insert point points to the end of the block.
165 Builder.restoreIP(IP);
166 llvm::BasicBlock *BB = Builder.GetInsertBlock();
167 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
168 if (!BB->empty() && I == BB->end())
169 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
170}
171
172static bool hasGridValue(const Triple &T) {
173 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
174}
175
176static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
177 if (T.isAMDGPU()) {
178 StringRef Features =
179 Kernel->getFnAttribute("target-features").getValueAsString();
180 if (Features.count("+wavefrontsize64"))
183 }
184 if (T.isNVPTX())
186 if (T.isSPIRV())
188 llvm_unreachable("No grid value available for this architecture!");
189}
190
191/// Determine which scheduling algorithm to use based on the schedule clause
192/// arguments.
193static OMPScheduleType
194getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
195 bool HasSimdModifier, bool HasDistScheduleChunks) {
196 // Currently, the default schedule is static.
197 switch (ClauseKind) {
198 case OMP_SCHEDULE_Default:
199 case OMP_SCHEDULE_Static:
200 return HasChunks ? OMPScheduleType::BaseStaticChunked
201 : OMPScheduleType::BaseStatic;
202 case OMP_SCHEDULE_Dynamic:
203 return OMPScheduleType::BaseDynamicChunked;
204 case OMP_SCHEDULE_Guided:
205 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
206 : OMPScheduleType::BaseGuidedChunked;
207 case OMP_SCHEDULE_Auto:
209 case OMP_SCHEDULE_Runtime:
210 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
211 : OMPScheduleType::BaseRuntime;
212 case OMP_SCHEDULE_Distribute:
213 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
214 : OMPScheduleType::BaseDistribute;
215 }
216 llvm_unreachable("unhandled schedule clause argument");
217}
218
219/// Adds ordering modifier flags to schedule type.
220static OMPScheduleType
222 bool HasOrderedClause) {
223 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
224 OMPScheduleType::None &&
225 "Must not have ordering nor monotonicity flags already set");
226
227 OMPScheduleType OrderingModifier = HasOrderedClause
228 ? OMPScheduleType::ModifierOrdered
229 : OMPScheduleType::ModifierUnordered;
230 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
231
232 // Map unsupported simd + ordered combinations to the closest supported type.
233 if (OrderingScheduleType ==
234 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
235 return OMPScheduleType::OrderedGuidedChunked;
236 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
237 OMPScheduleType::ModifierOrdered))
238 return OMPScheduleType::OrderedRuntime;
239
240 return OrderingScheduleType;
241}
242
243/// Adds monotonicity modifier flags to schedule type.
244static OMPScheduleType
246 bool HasSimdModifier, bool HasMonotonic,
247 bool HasNonmonotonic, bool HasOrderedClause) {
248 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
249 OMPScheduleType::None &&
250 "Must not have monotonicity flags already set");
251 assert((!HasMonotonic || !HasNonmonotonic) &&
252 "Monotonic and Nonmonotonic are contradicting each other");
253
254 if (HasMonotonic) {
255 return ScheduleType | OMPScheduleType::ModifierMonotonic;
256 } else if (HasNonmonotonic) {
257 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
258 } else {
259 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
260 // If the static schedule kind is specified or if the ordered clause is
261 // specified, and if the nonmonotonic modifier is not specified, the
262 // effect is as if the monotonic modifier is specified. Otherwise, unless
263 // the monotonic modifier is specified, the effect is as if the
264 // nonmonotonic modifier is specified.
265 OMPScheduleType BaseScheduleType =
266 ScheduleType & ~OMPScheduleType::ModifierMask;
267 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
268 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
269 HasOrderedClause) {
270 // The monotonic modifier is the default in the OpenMP runtime library, so
271 // there is no need to set it.
272 return ScheduleType;
273 } else {
274 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
275 }
276 }
277}
278
279/// Determine the schedule type using schedule and ordering clause arguments.
280static OMPScheduleType
281computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
282 bool HasSimdModifier, bool HasMonotonicModifier,
283 bool HasNonmonotonicModifier, bool HasOrderedClause,
284 bool HasDistScheduleChunks) {
286 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
287 OMPScheduleType OrderedSchedule =
288 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
290 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
291 HasNonmonotonicModifier, HasOrderedClause);
292
294 return Result;
295}
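// Worked example: for 'schedule(dynamic, 4)' with no ordered clause and no
// monotonicity modifier, the stages above compose as
//   getOpenMPBaseScheduleType         -> BaseDynamicChunked
//   getOpenMPOrderingScheduleType     -> | ModifierUnordered
//   getOpenMPMonotonicityScheduleType -> | ModifierNonmonotonic
// i.e. the runtime is asked for a nonmonotonic, unordered, chunked dynamic
// schedule.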
296
297/// Given a function, if it represents the entry point of a target kernel, this
298/// returns the execution mode flags associated with that kernel.
299static std::optional<omp::OMPTgtExecModeFlags>
301 CallInst *TargetInitCall = nullptr;
302 for (Instruction &Inst : Kernel.getEntryBlock()) {
303 if (auto *Call = dyn_cast<CallInst>(&Inst)) {
304 if (Call->getCalledFunction()->getName() == "__kmpc_target_init") {
305 TargetInitCall = Call;
306 break;
307 }
308 }
309 }
310
311 if (!TargetInitCall)
312 return std::nullopt;
313
314 // Get the kernel mode information from the global variable associated with
315 // the first argument of the call to __kmpc_target_init. Refer to
316 // createTargetInit() to see how this is initialized.
317 Value *InitOperand = TargetInitCall->getArgOperand(0);
318 GlobalVariable *KernelEnv = nullptr;
319 if (auto *Cast = dyn_cast<ConstantExpr>(InitOperand))
320 KernelEnv = cast<GlobalVariable>(Cast->getOperand(0));
321 else
322 KernelEnv = cast<GlobalVariable>(InitOperand);
323 auto *KernelEnvInit = cast<ConstantStruct>(KernelEnv->getInitializer());
324 auto *ConfigEnv = cast<ConstantStruct>(KernelEnvInit->getOperand(0));
325 auto *KernelMode = cast<ConstantInt>(ConfigEnv->getOperand(2));
326 return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
327}
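// The walk above assumes the layout emitted by createTargetInit(): the first
// argument of __kmpc_target_init is (possibly an addrspacecast of) a global
// kernel environment whose operand 0 is the configuration environment, and
// operand 2 within that holds the execution mode. Illustrative IR shape
// (struct names and the other field values are placeholders):
//
//   @..._kernel_environment = ... constant
//     { { i8 ..., i8 ..., i8 <ExecMode>, ... }, ... }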
328
329static bool isGenericKernel(Function &Fn) {
330 std::optional<omp::OMPTgtExecModeFlags> ExecMode =
332 return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
333}
334
335/// Make \p Source branch to \p Target.
336///
337/// Handles two situations:
338/// * \p Source already has an unconditional branch.
339/// * \p Source is a degenerate block (no terminator because the BB is
340/// the current head of the IR construction).
342 if (Instruction *Term = Source->getTerminatorOrNull()) {
343 auto *Br = cast<UncondBrInst>(Term);
344 BasicBlock *Succ = Br->getSuccessor();
345 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
346 Br->setSuccessor(Target);
347 return;
348 }
349
350 auto *NewBr = UncondBrInst::Create(Target, Source);
351 NewBr->setDebugLoc(DL);
352}
353
355 bool CreateBranch, DebugLoc DL) {
356 assert(New->getFirstInsertionPt() == New->begin() &&
357 "Target BB must not have PHI nodes");
358
359 // Move instructions to new block.
360 BasicBlock *Old = IP.getBlock();
361 // If the `Old` block is empty then there are no instructions to move. But in
362 // the new debug scheme, it could have trailing debug records which will be
363 // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that for two
364 // reasons:
365 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
366 // 2. Even if `New` is not empty, the rationale to move those records to `New`
367 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
368 // assumes that `Old` is optimized out and is going away. This is not the case
369 // here. The `Old` block is still in use; e.g., a branch instruction is
370 // added to it later in this function.
371 // So we call `BasicBlock::splice` only when `Old` is not empty.
372 if (!Old->empty())
373 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
374
375 if (CreateBranch) {
376 auto *NewBr = UncondBrInst::Create(New, Old);
377 NewBr->setDebugLoc(DL);
378 }
379}
380
381void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
382 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
383 BasicBlock *Old = Builder.GetInsertBlock();
384
385 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
386 if (CreateBranch)
387 Builder.SetInsertPoint(Old->getTerminator());
388 else
389 Builder.SetInsertPoint(Old);
390
391 // SetInsertPoint also updates the Builder's debug location, but we want to
392 // keep the one the Builder was configured to use.
393 Builder.SetCurrentDebugLocation(DebugLoc);
394}
395
397 DebugLoc DL, llvm::Twine Name) {
398 BasicBlock *Old = IP.getBlock();
400 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
401 Old->getParent(), Old->getNextNode());
402 spliceBB(IP, New, CreateBranch, DL);
403 New->replaceSuccessorsPhiUsesWith(Old, New);
404 return New;
405}
406
407BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
408 llvm::Twine Name) {
409 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
410 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
411 if (CreateBranch)
412 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
413 else
414 Builder.SetInsertPoint(Builder.GetInsertBlock());
415 // SetInsertPoint also updates the Builder's debug location, but we want to
416 // keep the one the Builder was configured to use.
417 Builder.SetCurrentDebugLocation(DebugLoc);
418 return New;
419}
420
421BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
422 llvm::Twine Name) {
423 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
424 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
425 if (CreateBranch)
426 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
427 else
428 Builder.SetInsertPoint(Builder.GetInsertBlock());
429 // SetInsertPoint also updates the Builder's debug location, but we want to
430 // keep the one the Builder was configured to use.
431 Builder.SetCurrentDebugLocation(DebugLoc);
432 return New;
433}
434
436 llvm::Twine Suffix) {
437 BasicBlock *Old = Builder.GetInsertBlock();
438 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
439}
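// A minimal usage sketch (Builder, X and Y stand for values the caller already
// has): split off the remainder of the current block into "cont" and keep
// emitting in front of the newly created branch.
//
//   BasicBlock *ContBB = splitBB(Builder, /*CreateBranch=*/true, "cont");
//   Value *Sum = Builder.CreateAdd(X, Y); // Emitted before the branch to ContBB.
//   Builder.SetInsertPoint(ContBB, ContBB->begin()); // Continue in "cont".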
440
441// This function creates a fake integer value and a fake use for the integer
442// value. It returns the fake value created. This is useful in modeling the
443// extra arguments to the outlined functions.
445 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
447 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
448 const Twine &Name = "", bool AsPtr = true,
449 bool Is64Bit = false) {
450 Builder.restoreIP(OuterAllocaIP);
451 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
452 Instruction *FakeVal;
453 AllocaInst *FakeValAddr =
454 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
455 ToBeDeleted.push_back(FakeValAddr);
456
457 if (AsPtr) {
458 FakeVal = FakeValAddr;
459 } else {
460 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
461 ToBeDeleted.push_back(FakeVal);
462 }
463
464 // Generate a fake use of this value
465 Builder.restoreIP(InnerAllocaIP);
466 Instruction *UseFakeVal;
467 if (AsPtr) {
468 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
469 } else {
470 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
471 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
472 }
473 ToBeDeleted.push_back(UseFakeVal);
474 return FakeVal;
475}
476
477//===----------------------------------------------------------------------===//
478// OpenMPIRBuilderConfig
479//===----------------------------------------------------------------------===//
480
481namespace {
483/// Bit flags marking which 'requires' clauses have been used.
484enum OpenMPOffloadingRequiresDirFlags {
485 /// flag undefined.
486 OMP_REQ_UNDEFINED = 0x000,
487 /// no requires directive present.
488 OMP_REQ_NONE = 0x001,
489 /// reverse_offload clause.
490 OMP_REQ_REVERSE_OFFLOAD = 0x002,
491 /// unified_address clause.
492 OMP_REQ_UNIFIED_ADDRESS = 0x004,
493 /// unified_shared_memory clause.
494 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
495 /// dynamic_allocators clause.
496 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
497 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
498};
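// For example, a translation unit containing
//   #pragma omp requires unified_shared_memory dynamic_allocators
// ends up with RequiresFlags == (OMP_REQ_UNIFIED_SHARED_MEMORY |
// OMP_REQ_DYNAMIC_ALLOCATORS), while a unit without any requires directive
// reports OMP_REQ_NONE.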
499
500class OMPCodeExtractor : public CodeExtractor {
501public:
502 OMPCodeExtractor(OpenMPIRBuilder &OMPBuilder, ArrayRef<BasicBlock *> BBs,
503 DominatorTree *DT = nullptr, bool AggregateArgs = false,
504 BlockFrequencyInfo *BFI = nullptr,
505 BranchProbabilityInfo *BPI = nullptr,
506 AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
507 bool AllowAlloca = false,
508 BasicBlock *AllocationBlock = nullptr,
509 ArrayRef<BasicBlock *> DeallocationBlocks = {},
510 std::string Suffix = "", bool ArgsInZeroAddressSpace = false)
511 : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs,
512 AllowAlloca, AllocationBlock, DeallocationBlocks, Suffix,
513 ArgsInZeroAddressSpace),
514 OMPBuilder(OMPBuilder) {}
515
516 virtual ~OMPCodeExtractor() = default;
517
518protected:
519 OpenMPIRBuilder &OMPBuilder;
520};
521
522class DeviceSharedMemCodeExtractor : public OMPCodeExtractor {
523public:
524 using OMPCodeExtractor::OMPCodeExtractor;
525 virtual ~DeviceSharedMemCodeExtractor() = default;
526
527protected:
528 virtual Instruction *
529 allocateVar(IRBuilder<>::InsertPoint AllocaIP, Type *VarType,
530 const Twine &Name = Twine(""),
531 AddrSpaceCastInst **CastedAlloc = nullptr) override {
532 return OMPBuilder.createOMPAllocShared(AllocaIP, VarType, Name);
533 }
534
535 virtual Instruction *deallocateVar(IRBuilder<>::InsertPoint DeallocIP,
536 Value *Var, Type *VarType) override {
537 return OMPBuilder.createOMPFreeShared(DeallocIP, Var, VarType);
538 }
539};
540
541/// Helper storing information about regions to outline using device shared
542/// memory for intermediate allocations.
543struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo {
544 OpenMPIRBuilder &OMPBuilder;
545
546 DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder)
547 : OMPBuilder(OMPBuilder) {}
548 virtual ~DeviceSharedMemOutlineInfo() = default;
549
550 virtual std::unique_ptr<CodeExtractor>
551 createCodeExtractor(ArrayRef<BasicBlock *> Blocks,
552 bool ArgsInZeroAddressSpace,
553 Twine Suffix = Twine("")) override;
554};
555
556} // anonymous namespace
557
559 : RequiresFlags(OMP_REQ_UNDEFINED) {}
560
563 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
564 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
567 RequiresFlags(OMP_REQ_UNDEFINED) {
568 if (HasRequiresReverseOffload)
569 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
570 if (HasRequiresUnifiedAddress)
571 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
572 if (HasRequiresUnifiedSharedMemory)
573 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
574 if (HasRequiresDynamicAllocators)
575 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
576}
577
579 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
580}
581
583 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
584}
585
587 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
588}
589
591 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
592}
593
595 return hasRequiresFlags() ? RequiresFlags
596 : static_cast<int64_t>(OMP_REQ_NONE);
597}
598
600 if (Value)
601 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
602 else
603 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
604}
605
607 if (Value)
608 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
609 else
610 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
611}
612
614 if (Value)
615 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
616 else
617 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
618}
619
621 if (Value)
622 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
623 else
624 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
625}
626
627//===----------------------------------------------------------------------===//
628// OpenMPIRBuilder
629//===----------------------------------------------------------------------===//
630
633 SmallVector<Value *> &ArgsVector) {
635 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
636 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
637 constexpr size_t MaxDim = 3;
638 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
639
640 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
641
642 Value *DynCGroupMemFallbackFlag =
643 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
644 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
645 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
646
647 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
648
649 Value *NumTeams3D =
650 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
651 Value *NumThreads3D =
652 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
653 for (unsigned I :
654 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
655 NumTeams3D =
656 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
657 for (unsigned I :
658 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
659 NumThreads3D =
660 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
661
662 ArgsVector = {Version,
663 PointerNum,
664 KernelArgs.RTArgs.BasePointersArray,
665 KernelArgs.RTArgs.PointersArray,
666 KernelArgs.RTArgs.SizesArray,
667 KernelArgs.RTArgs.MapTypesArray,
668 KernelArgs.RTArgs.MapNamesArray,
669 KernelArgs.RTArgs.MappersArray,
670 KernelArgs.NumIterations,
671 Flags,
672 NumTeams3D,
673 NumThreads3D,
674 KernelArgs.DynCGroupMem};
675}
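// Note the Flags encoding built above: bit 0 carries HasNoWait and the
// DynCGroupMemFallback policy is shifted into bits 2 and up, so e.g. a nowait
// kernel with fallback policy 1 passes Flags == 0b101.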
676
678 LLVMContext &Ctx = Fn.getContext();
679
680 // Get the function's current attributes.
681 auto Attrs = Fn.getAttributes();
682 auto FnAttrs = Attrs.getFnAttrs();
683 auto RetAttrs = Attrs.getRetAttrs();
685 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
686 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
687
688 // Add AS to FnAS while taking special care with integer extensions.
689 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
690 bool Param = true) -> void {
691 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
692 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
693 if (HasSignExt || HasZeroExt) {
694 assert(AS.getNumAttributes() == 1 &&
695 "Currently not handling extension attr combined with others.");
696 if (Param) {
697 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
698 FnAS = FnAS.addAttribute(Ctx, AK);
699 } else if (auto AK =
700 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
701 FnAS = FnAS.addAttribute(Ctx, AK);
702 } else {
703 FnAS = FnAS.addAttributes(Ctx, AS);
704 }
705 };
706
707#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
708#include "llvm/Frontend/OpenMP/OMPKinds.def"
709
710 // Add attributes to the function declaration.
711 switch (FnID) {
712#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
713 case Enum: \
714 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
715 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
716 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
717 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
718 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
719 break;
720#include "llvm/Frontend/OpenMP/OMPKinds.def"
721 default:
722 // Attributes are optional.
723 break;
724 }
725}
726
729 FunctionType *FnTy = nullptr;
730 Function *Fn = nullptr;
731
732 // Try to find the declaration in the module first.
733 switch (FnID) {
734#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
735 case Enum: \
736 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
737 IsVarArg); \
738 Fn = M.getFunction(Str); \
739 break;
740#include "llvm/Frontend/OpenMP/OMPKinds.def"
741 }
742
743 if (!Fn) {
744 // Create a new declaration if we need one.
745 switch (FnID) {
746#define OMP_RTL(Enum, Str, ...) \
747 case Enum: \
748 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
749 break;
750#include "llvm/Frontend/OpenMP/OMPKinds.def"
751 }
752 Fn->setCallingConv(Config.getRuntimeCC());
753 // Add callback metadata if the runtime function takes a callback function.
754 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
755 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
756 LLVMContext &Ctx = Fn->getContext();
757 MDBuilder MDB(Ctx);
758 // Annotate the callback behavior of the runtime function:
759 // - The callback callee is argument number 2 (microtask).
760 // - The first two arguments of the callback callee are unknown (-1).
761 // - All variadic arguments to the runtime function are passed to the
762 // callback callee.
763 Fn->addMetadata(
764 LLVMContext::MD_callback,
766 2, {-1, -1}, /* VarArgsArePassed */ true)}));
767 }
768 }
769
770 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
771 << " with type " << *Fn->getFunctionType() << "\n");
772 addAttributes(FnID, *Fn);
773
774 } else {
775 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
776 << " with type " << *Fn->getFunctionType() << "\n");
777 }
778
779 assert(Fn && "Failed to create OpenMP runtime function");
780
781 return {FnTy, Fn};
782}
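// A minimal usage sketch (OMPBuilder, Ident and ThreadID stand for values the
// caller already has): declarations are created lazily and reused on later
// queries for the same enum value.
//
//   Function *BarrierFn =
//       OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_barrier);
//   OMPBuilder.Builder.CreateCall(BarrierFn, {Ident, ThreadID});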
783
786 if (!FiniBB) {
787 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
789 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
790 Builder.SetInsertPoint(FiniBB);
791 // FiniCB adds the branch to the exit stub.
792 if (Error Err = FiniCB(Builder.saveIP()))
793 return Err;
794 }
795 return FiniBB;
796}
797
799 BasicBlock *OtherFiniBB) {
800 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
801 if (!FiniBB) {
802 FiniBB = OtherFiniBB;
803
804 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
805 if (Error Err = FiniCB(Builder.saveIP()))
806 return Err;
807
808 return Error::success();
809 }
810
811 // Move instructions from FiniBB to the start of OtherFiniBB.
812 auto EndIt = FiniBB->end();
813 if (FiniBB->size() >= 1)
814 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
815 EndIt = Prev;
816 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
817 EndIt);
818
819 FiniBB->replaceAllUsesWith(OtherFiniBB);
820 FiniBB->eraseFromParent();
821 FiniBB = OtherFiniBB;
822 return Error::success();
823}
824
827 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
828 assert(Fn && "Failed to create OpenMP runtime function pointer");
829 return Fn;
830}
831
834 StringRef Name) {
835 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
836 Call->setCallingConv(Config.getRuntimeCC());
837 return Call;
838}
839
840void OpenMPIRBuilder::initialize() { initializeTypes(M); }
841
844 BasicBlock &EntryBlock = Function->getEntryBlock();
845 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
846
847 // Loop over blocks looking for constant allocas, skipping the entry block
848 // as any allocas there are already in the desired location.
849 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
850 Block++) {
851 for (auto Inst = Block->getReverseIterator()->begin();
852 Inst != Block->getReverseIterator()->end();) {
854 Inst++;
856 continue;
857 AllocaInst->moveBeforePreserving(MoveLocInst);
858 } else {
859 Inst++;
860 }
861 }
862 }
863}
864
867
868 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
869 // TODO: For now, we support simple static allocations; we might need to
870 // move non-static ones as well. However, this will need further analysis to
871 // move the length arguments as well.
873 };
874
875 for (llvm::Instruction &Inst : Block)
877 if (ShouldHoistAlloca(*AllocaInst))
878 AllocasToMove.push_back(AllocaInst);
879
880 auto InsertPoint =
881 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
882
883 for (llvm::Instruction *AllocaInst : AllocasToMove)
885}
886
888 PostDominatorTree PostDomTree(*Func);
889 for (llvm::BasicBlock &BB : *Func)
890 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
892}
893
895 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
897 SmallVector<std::unique_ptr<OutlineInfo>, 16> DeferredOutlines;
898 for (std::unique_ptr<OutlineInfo> &OI : OutlineInfos) {
899 // Skip functions that have not been finalized yet; this may happen with
900 // nested function generation.
901 if (Fn && OI->getFunction() != Fn) {
902 DeferredOutlines.push_back(std::move(OI));
903 continue;
904 }
905
906 ParallelRegionBlockSet.clear();
907 Blocks.clear();
908 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
909
910 Function *OuterFn = OI->getFunction();
911 CodeExtractorAnalysisCache CEAC(*OuterFn);
912 // If we generate code for the target device, we need to allocate
913 // struct for aggregate params in the device default alloca address space.
914 // OpenMP runtime requires that the params of the extracted functions are
915 // passed as zero address space pointers. This flag ensures that
916 // CodeExtractor generates correct code for extracted functions
917 // which are used by OpenMP runtime.
918 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
919 std::unique_ptr<CodeExtractor> Extractor =
920 OI->createCodeExtractor(Blocks, ArgsInZeroAddressSpace, ".omp_par");
921
922 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
923 LLVM_DEBUG(dbgs() << "Entry " << OI->EntryBB->getName()
924 << " Exit: " << OI->ExitBB->getName() << "\n");
925 assert(Extractor->isEligible() &&
926 "Expected OpenMP outlining to be possible!");
927
928 for (auto *V : OI->ExcludeArgsFromAggregate)
929 Extractor->excludeArgFromAggregate(V);
930
931 Function *OutlinedFn =
932 Extractor->extractCodeRegion(CEAC, OI->Inputs, OI->Outputs);
933
934 // Forward target-cpu, target-features attributes to the outlined function.
935 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
936 if (TargetCpuAttr.isStringAttribute())
937 OutlinedFn->addFnAttr(TargetCpuAttr);
938
939 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
940 if (TargetFeaturesAttr.isStringAttribute())
941 OutlinedFn->addFnAttr(TargetFeaturesAttr);
942
943 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
944 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
945 assert(OutlinedFn->getReturnType()->isVoidTy() &&
946 "OpenMP outlined functions should not return a value!");
947
948 // For compatibility with the clang CG we move the outlined function after
949 // the one with the parallel region.
950 OutlinedFn->removeFromParent();
951 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
952
953 // Remove the artificial entry introduced by the extractor right away; we
954 // made our own entry block after all.
955 {
956 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
957 assert(ArtificialEntry.getUniqueSuccessor() == OI->EntryBB);
958 assert(OI->EntryBB->getUniquePredecessor() == &ArtificialEntry);
959 // Move instructions from the to-be-deleted ArtificialEntry to the entry
960 // basic block of the parallel region. CodeExtractor generates
961 // instructions to unwrap the aggregate argument and may sink
962 // allocas/bitcasts for values that are solely used in the outlined region
963 // and do not escape.
964 assert(!ArtificialEntry.empty() &&
965 "Expected instructions to add in the outlined region entry");
966 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
967 End = ArtificialEntry.rend();
968 It != End;) {
969 Instruction &I = *It;
970 It++;
971
972 if (I.isTerminator()) {
973 // Absorb any debug records that the terminator may have.
974 if (Instruction *TI = OI->EntryBB->getTerminatorOrNull())
975 TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
976 continue;
977 }
978
979 I.moveBeforePreserving(*OI->EntryBB,
980 OI->EntryBB->getFirstInsertionPt());
981 }
982
983 OI->EntryBB->moveBefore(&ArtificialEntry);
984 ArtificialEntry.eraseFromParent();
985 }
986 assert(&OutlinedFn->getEntryBlock() == OI->EntryBB);
987 assert(OutlinedFn && OutlinedFn->hasNUses(1));
988
989 // Run a user callback, e.g. to add attributes.
990 if (OI->PostOutlineCB)
991 OI->PostOutlineCB(*OutlinedFn);
992
993 if (OI->FixUpNonEntryAllocas)
995 }
996
997 // Remove work items that have been completed.
998 OutlineInfos = std::move(DeferredOutlines);
999
1000 // The createTarget functions embed user-written code into the target
1001 // region, which may inject allocas that then need to be moved to the
1002 // entry block of our target, or we risk malformed optimisations by
1003 // later passes. This is only relevant for the device pass, which
1004 // appears to be a little more delicate when it comes to optimisations
1005 // (however, we do not block on that here; it is up to whoever appends
1006 // to the list to do so).
1007 // This notably has to occur after the OutlinedInfo candidates have
1008 // been extracted, so we have an end product that will not be
1009 // implicitly adversely affected by any raises unless intentionally
1010 // appended to the list.
1011 // NOTE: This only does so for ConstantData; it could be extended to
1012 // ConstantExprs with further effort, however, they should largely be
1013 // folded by the time they get here. Extending it to runtime-defined /
1014 // read+writeable allocation sizes would be non-trivial (we would need
1015 // to factor in movement of any stores to variables the allocation
1016 // size depends on, as well as the usual loads; otherwise it will
1017 // yield the wrong result after movement) and would likely be more
1018 // suitable as an LLVM optimisation pass.
1021
1022 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
1023 [](EmitMetadataErrorKind Kind,
1024 const TargetRegionEntryInfo &EntryInfo) -> void {
1025 errs() << "Error of kind: " << Kind
1026 << " when emitting offload entries and metadata during "
1027 "OMPIRBuilder finalization \n";
1028 };
1029
1030 if (!OffloadInfoManager.empty())
1032
1033 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
1034 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
1035 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
1036 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
1037 }
1038
1039 IsFinalized = true;
1040}
1041
1042bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
1043
1045 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
1046}
1047
1049 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
1050 auto *GV =
1051 new GlobalVariable(M, I32Ty,
1052 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
1053 ConstantInt::get(I32Ty, Value), Name);
1054 GV->setVisibility(GlobalValue::HiddenVisibility);
1055
1056 return GV;
1057}
1058
1060 if (List.empty())
1061 return;
1062
1063 // Convert List to what ConstantArray needs.
1065 UsedArray.resize(List.size());
1066 for (unsigned I = 0, E = List.size(); I != E; ++I)
1068 cast<Constant>(&*List[I]), Builder.getPtrTy());
1069
1070 if (UsedArray.empty())
1071 return;
1072 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
1073
1074 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
1075 ConstantArray::get(ATy, UsedArray), Name);
1076
1077 GV->setSection("llvm.metadata");
1078}
1079
1082 OMPTgtExecModeFlags Mode) {
1083 auto *Int8Ty = Builder.getInt8Ty();
1084 auto *GVMode = new GlobalVariable(
1085 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
1086 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
1087 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1088 return GVMode;
1089}
1090
1092 uint32_t SrcLocStrSize,
1093 IdentFlag LocFlags,
1094 unsigned Reserve2Flags) {
1095 // Enable "C-mode".
1096 LocFlags |= OMP_IDENT_FLAG_KMPC;
1097
1098 Constant *&Ident =
1099 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1100 if (!Ident) {
1101 Constant *I32Null = ConstantInt::getNullValue(Int32);
1102 Constant *IdentData[] = {I32Null,
1103 ConstantInt::get(Int32, uint32_t(LocFlags)),
1104 ConstantInt::get(Int32, Reserve2Flags),
1105 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1106
1107 size_t SrcLocStrArgIdx = 4;
1108 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1110 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1111 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1112 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1113 Constant *Initializer =
1114 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1115
1116 // Look for an existing encoding of the location + flags. This is not strictly
1117 // needed but minimizes the difference to the existing solution while we transition.
1118 for (GlobalVariable &GV : M.globals())
1119 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1120 if (GV.getInitializer() == Initializer)
1121 Ident = &GV;
1122
1123 if (!Ident) {
1124 auto *GV = new GlobalVariable(
1125 M, OpenMPIRBuilder::Ident,
1126 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1128 M.getDataLayout().getDefaultGlobalsAddressSpace());
1129 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1130 GV->setAlignment(Align(8));
1131 Ident = GV;
1132 }
1133 }
1134
1135 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1136}
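// The initializer built above stores, in order:
//   { i32 0, i32 Flags, i32 Reserve2Flags, i32 SrcLocStrSize, ptr SrcLocStr }
// so, for example, the ident used for an explicit barrier carries
// OMP_IDENT_FLAG_KMPC | OMP_IDENT_FLAG_BARRIER_EXPL in its flags field.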
1137
1139 uint32_t &SrcLocStrSize) {
1140 SrcLocStrSize = LocStr.size();
1141 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1142 if (!SrcLocStr) {
1143 Constant *Initializer =
1144 ConstantDataArray::getString(M.getContext(), LocStr);
1145
1146 // Look for an existing encoding of the location. This is not strictly needed
1147 // but minimizes the difference to the existing solution while we transition.
1148 for (GlobalVariable &GV : M.globals())
1149 if (GV.isConstant() && GV.hasInitializer() &&
1150 GV.getInitializer() == Initializer)
1151 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1152
1153 SrcLocStr = Builder.CreateGlobalString(
1154 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1155 &M);
1156 }
1157 return SrcLocStr;
1158}
1159
1161 StringRef FileName,
1162 unsigned Line, unsigned Column,
1163 uint32_t &SrcLocStrSize) {
1164 SmallString<128> Buffer;
1165 Buffer.push_back(';');
1166 Buffer.append(FileName);
1167 Buffer.push_back(';');
1168 Buffer.append(FunctionName);
1169 Buffer.push_back(';');
1170 Buffer.append(std::to_string(Line));
1171 Buffer.push_back(';');
1172 Buffer.append(std::to_string(Column));
1173 Buffer.push_back(';');
1174 Buffer.push_back(';');
1175 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1176}
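// For example, a location in function 'foo' of file 'bar.c' at line 12,
// column 3 is encoded as the string ";bar.c;foo;12;3;;" (leading and trailing
// separators included), which is the ';'-separated form referenced from the
// ident_t structures above.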
1177
1178Constant *
1180 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1181 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1182}
1183
1185 uint32_t &SrcLocStrSize,
1186 Function *F) {
1187 DILocation *DIL = DL.get();
1188 if (!DIL)
1189 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1190 StringRef FileName =
1191 !DIL->getFilename().empty() ? DIL->getFilename() : M.getName();
1192 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1193 if (Function.empty() && F)
1194 Function = F->getName();
1195 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1196 DIL->getColumn(), SrcLocStrSize);
1197}
1198
1200 uint32_t &SrcLocStrSize) {
1201 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1202 Loc.IP.getBlock()->getParent());
1203}
1204
1207 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1208 "omp_global_thread_num");
1209}
1210
1213 bool ForceSimpleCall, bool CheckCancelFlag) {
1214 if (!updateToLocation(Loc))
1215 return Loc.IP;
1216
1217 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1218 // __kmpc_barrier(loc, thread_id);
1219
1220 IdentFlag BarrierLocFlags;
1221 switch (Kind) {
1222 case OMPD_for:
1223 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1224 break;
1225 case OMPD_sections:
1226 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1227 break;
1228 case OMPD_single:
1229 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1230 break;
1231 case OMPD_barrier:
1232 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1233 break;
1234 default:
1235 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1236 break;
1237 }
1238
1239 uint32_t SrcLocStrSize;
1240 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1241 Value *Args[] = {
1242 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1243 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1244
1245 // If we are in a cancellable parallel region, barriers are cancellation
1246 // points.
1247 // TODO: Check why we would force simple calls or ignore the cancel flag.
1248 bool UseCancelBarrier =
1249 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1250
1252 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1253 ? OMPRTL___kmpc_cancel_barrier
1254 : OMPRTL___kmpc_barrier),
1255 Args);
1256
1257 if (UseCancelBarrier && CheckCancelFlag)
1258 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1259 return Err;
1260
1261 return Builder.saveIP();
1262}
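// A minimal usage sketch (assuming an initialized OpenMPIRBuilder OMPBuilder
// whose Builder is positioned inside a function, and a DebugLoc DL from the
// caller):
//
//   OpenMPIRBuilder::LocationDescription Loc(OMPBuilder.Builder.saveIP(), DL);
//   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createBarrier(
//       Loc, omp::OMPD_barrier, /*ForceSimpleCall=*/false,
//       /*CheckCancelFlag=*/true);
//   if (!AfterIP)
//     return AfterIP.takeError(); // Propagates finalization-callback failures.
//   OMPBuilder.Builder.restoreIP(*AfterIP);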
1263
1266 Value *IfCondition,
1267 omp::Directive CanceledDirective) {
1268 if (!updateToLocation(Loc))
1269 return Loc.IP;
1270
1271 // LLVM utilities like blocks with terminators.
1272 auto *UI = Builder.CreateUnreachable();
1273
1274 Instruction *ThenTI = UI, *ElseTI = nullptr;
1275 if (IfCondition) {
1276 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1277
1278 // Even if the if condition evaluates to false, this should count as a
1279 // cancellation point
1280 Builder.SetInsertPoint(ElseTI);
1281 auto ElseIP = Builder.saveIP();
1282
1284 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1285 if (!IPOrErr)
1286 return IPOrErr;
1287 }
1288
1289 Builder.SetInsertPoint(ThenTI);
1290
1291 Value *CancelKind = nullptr;
1292 switch (CanceledDirective) {
1293#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1294 case DirectiveEnum: \
1295 CancelKind = Builder.getInt32(Value); \
1296 break;
1297#include "llvm/Frontend/OpenMP/OMPKinds.def"
1298 default:
1299 llvm_unreachable("Unknown cancel kind!");
1300 }
1301
1302 uint32_t SrcLocStrSize;
1303 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1304 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1305 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1307 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1308
1309 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1310 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1311 return Err;
1312
1313 // Update the insertion point and remove the terminator we introduced.
1314 Builder.SetInsertPoint(UI->getParent());
1315 UI->eraseFromParent();
1316
1317 return Builder.saveIP();
1318}
1319
1322 omp::Directive CanceledDirective) {
1323 if (!updateToLocation(Loc))
1324 return Loc.IP;
1325
1326 // LLVM utilities like blocks with terminators.
1327 auto *UI = Builder.CreateUnreachable();
1328 Builder.SetInsertPoint(UI);
1329
1330 Value *CancelKind = nullptr;
1331 switch (CanceledDirective) {
1332#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1333 case DirectiveEnum: \
1334 CancelKind = Builder.getInt32(Value); \
1335 break;
1336#include "llvm/Frontend/OpenMP/OMPKinds.def"
1337 default:
1338 llvm_unreachable("Unknown cancel kind!");
1339 }
1340
1341 uint32_t SrcLocStrSize;
1342 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1343 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1344 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1346 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1347
1348 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1349 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1350 return Err;
1351
1352 // Update the insertion point and remove the terminator we introduced.
1353 Builder.SetInsertPoint(UI->getParent());
1354 UI->eraseFromParent();
1355
1356 return Builder.saveIP();
1357}
1358
1360 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1361 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1362 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1363 if (!updateToLocation(Loc))
1364 return Loc.IP;
1365
1366 Builder.restoreIP(AllocaIP);
1367 auto *KernelArgsPtr =
1368 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1370
1371 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1372 llvm::Value *Arg =
1373 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1374 Builder.CreateAlignedStore(
1375 KernelArgs[I], Arg,
1376 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1377 }
1378
1379 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1380 NumThreads, HostPtr, KernelArgsPtr};
1381
1383 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1384 OffloadingArgs);
1385
1386 return Builder.saveIP();
1387}
1388
1390 const LocationDescription &Loc, Value *OutlinedFnID,
1391 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1392 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1393
1394 if (!updateToLocation(Loc))
1395 return Loc.IP;
1396
1397 // On top of the arrays that were filled up, the target offloading call
1398 // takes as arguments the device id as well as the host pointer. The host
1399 // pointer is used by the runtime library to identify the current target
1400 // region, so it only has to be unique and not necessarily point to
1401 // anything. It could be the pointer to the outlined function that
1402 // implements the target region, but we aren't using it, so the compiler
1403 // doesn't need to keep it around and can therefore inline the host
1404 // function if proven worthwhile during optimization.
1405
1406 // From this point on, we need to have an ID of the target region defined.
1407 assert(OutlinedFnID && "Invalid outlined function ID!");
1408 (void)OutlinedFnID;
1409
1410 // Return value of the runtime offloading call.
1411 Value *Return = nullptr;
1412
1413 // Arguments for the target kernel.
1414 SmallVector<Value *> ArgsVector;
1415 getKernelArgsVector(Args, Builder, ArgsVector);
1416
1417 // The target region is an outlined function launched by the runtime
1418 // via calls to __tgt_target_kernel().
1419 //
1420 // Note that on the host and CPU targets, the runtime implementation of
1421 // these calls simply call the outlined function without forking threads.
1422 // The outlined functions themselves have runtime calls to
1423 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1424 // the compiler in emitTeamsCall() and emitParallelCall().
1425 //
1426 // In contrast, on the NVPTX target, the implementation of
1427 // __tgt_target_teams() launches a GPU kernel with the requested number
1428 // of teams and threads so no additional calls to the runtime are required.
1429 // Check the error code and execute the host version if required.
1430 Builder.restoreIP(emitTargetKernel(
1431 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1432 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1433
1434 BasicBlock *OffloadFailedBlock =
1435 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1436 BasicBlock *OffloadContBlock =
1437 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1438 Value *Failed = Builder.CreateIsNotNull(Return);
1439 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1440
1441 auto CurFn = Builder.GetInsertBlock()->getParent();
1442 emitBlock(OffloadFailedBlock, CurFn);
1443 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1444 if (!AfterIP)
1445 return AfterIP.takeError();
1446 Builder.restoreIP(*AfterIP);
1447 emitBranch(OffloadContBlock);
1448 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1449 return Builder.saveIP();
1450}
1451
1453 Value *CancelFlag, omp::Directive CanceledDirective) {
1454 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1455 "Unexpected cancellation!");
1456
1457 // For a cancel barrier we create two new blocks.
1458 BasicBlock *BB = Builder.GetInsertBlock();
1459 BasicBlock *NonCancellationBlock;
1460 if (Builder.GetInsertPoint() == BB->end()) {
1461 // TODO: This branch will not be needed once we moved to the
1462 // OpenMPIRBuilder codegen completely.
1463 NonCancellationBlock = BasicBlock::Create(
1464 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1465 } else {
1466 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1468 Builder.SetInsertPoint(BB);
1469 }
1470 BasicBlock *CancellationBlock = BasicBlock::Create(
1471 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1472
1473 // Jump to them based on the return value.
1474 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1475 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1476 /* TODO weight */ nullptr, nullptr);
1477
1478 // From the cancellation block we finalize all variables and go to the
1479 // post finalization block that is known to the FiniCB callback.
1480 auto &FI = FinalizationStack.back();
1481 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1482 if (!FiniBBOrErr)
1483 return FiniBBOrErr.takeError();
1484 Builder.SetInsertPoint(CancellationBlock);
1485 Builder.CreateBr(*FiniBBOrErr);
1486
1487 // The continuation block is where code generation continues.
1488 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1489 return Error::success();
1490}
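// The control flow materialized above has the shape
//
//   <current>:   %cmp = icmp eq i32 %cancel_flag, 0
//                br i1 %cmp, label %<bb>.cont, label %<bb>.cncl
//   <bb>.cncl:   br label %<finalization block of the enclosing region>
//   <bb>.cont:   ; code generation continues here
//
// so a non-zero cancel flag runs the region's finalization code instead of the
// rest of the construct.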
1491
1492/// Create a wrapper function used to gather the outlined function's argument
1493/// structure from a shared buffer and to forward it to the outlined function
1494/// when running in Generic mode.
1495///
1496/// The outlined function is expected to receive two integer arguments followed
1497/// by an optional pointer argument to an argument structure holding the rest.
1498static Function *createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder,
1499 Function &OutlinedFn) {
1500 size_t NumArgs = OutlinedFn.arg_size();
1501 assert((NumArgs == 2 || NumArgs == 3) &&
1502 "expected a 2-3 argument parallel outlined function");
1503 bool UseArgStruct = NumArgs == 3;
1504
1505 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1506 IRBuilder<>::InsertPointGuard IPG(Builder);
1507 auto *FnTy = FunctionType::get(Builder.getVoidTy(),
1508 {Builder.getInt16Ty(), Builder.getInt32Ty()},
1509 /*isVarArg=*/false);
1510 auto *WrapperFn =
1512 OutlinedFn.getName() + ".wrapper", OMPIRBuilder->M);
1513
1514 WrapperFn->addParamAttr(0, Attribute::NoUndef);
1515 WrapperFn->addParamAttr(0, Attribute::ZExt);
1516 WrapperFn->addParamAttr(1, Attribute::NoUndef);
1517
1518 BasicBlock *EntryBB =
1519 BasicBlock::Create(OMPIRBuilder->M.getContext(), "entry", WrapperFn);
1520 Builder.SetInsertPoint(EntryBB);
1521
1522 // Allocation.
1523 Value *AddrAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
1524 /*ArraySize=*/nullptr, "addr");
1525 AddrAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1526 AddrAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1527 AddrAlloca->getName() + ".ascast");
1528
1529 Value *ZeroAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
1530 /*ArraySize=*/nullptr, "zero");
1531 ZeroAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1532 ZeroAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1533 ZeroAlloca->getName() + ".ascast");
1534
1535 Value *ArgsAlloca = nullptr;
1536 if (UseArgStruct) {
1537 ArgsAlloca = Builder.CreateAlloca(Builder.getPtrTy(),
1538 /*ArraySize=*/nullptr, "global_args");
1539 ArgsAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1540 ArgsAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1541 ArgsAlloca->getName() + ".ascast");
1542 }
1543
1544 // Initialization.
1545 Builder.CreateStore(WrapperFn->getArg(1), AddrAlloca);
1546 Builder.CreateStore(Builder.getInt32(0), ZeroAlloca);
1547 if (UseArgStruct) {
1548 Builder.CreateCall(
1549 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(
1550 llvm::omp::RuntimeFunction::OMPRTL___kmpc_get_shared_variables),
1551 {ArgsAlloca});
1552 }
1553
1554 SmallVector<Value *, 3> Args{AddrAlloca, ZeroAlloca};
1555
1556 // Load structArg from global_args.
1557 if (UseArgStruct) {
1558 Value *StructArg = Builder.CreateLoad(Builder.getPtrTy(), ArgsAlloca);
1559 StructArg = Builder.CreateInBoundsGEP(Builder.getPtrTy(), StructArg,
1560 {Builder.getInt64(0)});
1561 StructArg = Builder.CreateLoad(Builder.getPtrTy(), StructArg, "structArg");
1562 Args.push_back(StructArg);
1563 }
1564
1565 // Call the outlined function holding the parallel body.
1566 Builder.CreateCall(&OutlinedFn, Args);
1567 Builder.CreateRetVoid();
1568
1569 return WrapperFn;
1570}
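// Illustrative shape of the generated wrapper for a 3-argument outlined
// function (IR sketch; exact attributes and address space casts omitted):
//
//   define void @outlined.wrapper(i16 noundef zeroext %0, i32 noundef %1) {
//   entry:
//     ; %addr, %zero and %global_args allocas; %1 and 0 stored into them
//     call void @__kmpc_get_shared_variables(ptr %global_args)
//     %structArg = load ptr, ptr <element 0 of %global_args>
//     call void @outlined(ptr %addr, ptr %zero, ptr %structArg)
//     ret void
//   }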
1571
1572// Callback used to create the OpenMP runtime calls supporting the
1573// omp parallel clause for the device.
1574// We need this callback to replace the call to OutlinedFn in OuterFn
1575// with a call to the OpenMP DeviceRTL runtime function (__kmpc_parallel_60).
1577 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1578 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1579 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1580 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1581 assert(OutlinedFn.arg_size() >= 2 &&
1582 "Expected at least tid and bounded tid as arguments");
1583 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1584
1585 // Add some known attributes.
1586 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1587 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1588 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1589 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1590 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1591 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1592
1593 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1594 assert(CI && "Expected call instruction to outlined function");
1595 CI->getParent()->setName("omp_parallel");
1596
1597 Builder.SetInsertPoint(CI);
1598 Type *PtrTy = OMPIRBuilder->VoidPtr;
1599
1600 // Add alloca for kernel args
1601 OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
1602 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1603 AllocaInst *ArgsAlloca =
1604 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1605 Value *Args = ArgsAlloca;
1606 // Add an address space cast if the array for storing arguments is not
1607 // allocated in address space 0.
1608 if (ArgsAlloca->getAddressSpace())
1609 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1610 Builder.restoreIP(CurrentIP);
1611
1612 // Store captured vars which are used by kmpc_parallel_60
1613 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1614 Value *V = *(CI->arg_begin() + 2 + Idx);
1615 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1616 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1617 Builder.CreateStore(V, StoreAddress);
1618 }
1619
1620 Value *Cond =
1621 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1622 : Builder.getInt32(1);
1623 Value *NumThreadsArg =
1624 NumThreads ? Builder.CreateZExtOrTrunc(NumThreads, OMPIRBuilder->Int32)
1625 : Builder.getInt32(-1);
1626
1627 // If this is not a Generic kernel, we can skip generating the wrapper.
1628 Value *WrapperFn;
1629 if (isGenericKernel(*OuterFn))
1630 WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
1631 else
1632 WrapperFn = Constant::getNullValue(PtrTy);
1633
1634 // Build kmpc_parallel_60 call
1635 Value *Parallel60CallArgs[] = {
1636 /* identifier*/ Ident,
1637 /* global thread num*/ ThreadID,
1638 /* if expression */ Cond,
1639 /* number of threads */ NumThreadsArg,
1640 /* Proc bind */ Builder.getInt32(-1),
1641 /* outlined function */ &OutlinedFn,
1642 /* wrapper function */ WrapperFn,
1643 /* arguments of the outlined function*/ Args,
1644 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1645 /* strict for number of threads */ Builder.getInt32(0)};
1646
1647 FunctionCallee RTLFn =
1648 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1649
1650 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1651
1652 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1653 << *Builder.GetInsertBlock()->getParent() << "\n");
1654
1655 // Initialize the local TID stack location with the argument value.
1656 Builder.SetInsertPoint(PrivTID);
1657 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1658 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1659 PrivTIDAddr);
1660
1661 // Remove redundant call to the outlined function.
1662 CI->eraseFromParent();
1663
1664 for (Instruction *I : ToBeDeleted) {
1665 I->eraseFromParent();
1666 }
1667}
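// For reference, the replacement call emitted above has roughly this shape
// (pseudo-IR; the operand order follows Parallel60CallArgs, types abbreviated):
//
//   call @__kmpc_parallel_60(ptr %ident, i32 %gtid, i32 %if_expr,
//                            i32 %num_threads, i32 -1 /* proc_bind */,
//                            ptr @outlined_fn, ptr %wrapper_fn,
//                            ptr %captured_args, i64 %num_captured,
//                            i32 0 /* strict num_threads */)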
1668
1669// Callback used to create OpenMP runtime calls to support
1670// omp parallel clause for the host.
1671// We need to use this callback to replace the call to the OutlinedFn in OuterFn
1672// with the call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
1673static void
1674hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
1675 Function *OuterFn, Value *Ident, Value *IfCondition,
1676 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1677 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1678 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1679 FunctionCallee RTLFn;
1680 if (IfCondition) {
1681 RTLFn =
1682 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1683 } else {
1684 RTLFn =
1685 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1686 }
1687 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1688 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1689 LLVMContext &Ctx = F->getContext();
1690 MDBuilder MDB(Ctx);
1691 // Annotate the callback behavior of the __kmpc_fork_call:
1692 // - The callback callee is argument number 2 (microtask).
1693 // - The first two arguments of the callback callee are unknown (-1).
1694 // - All variadic arguments to the __kmpc_fork_call are passed to the
1695 // callback callee.
1696 F->addMetadata(LLVMContext::MD_callback,
1697 *MDNode::get(Ctx, {MDB.createCallbackEncoding(
1698 2, {-1, -1},
1699 /* VarArgsArePassed */ true)}));
1700 }
1701 }
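// With this annotation the runtime declaration carries callback metadata of
// roughly the following form (metadata node numbering is illustrative):
//
//   declare !callback !0 void @__kmpc_fork_call(ptr, i32, ptr, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}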
1702 // Add some known attributes.
1703 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1704 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1705 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1706
1707 assert(OutlinedFn.arg_size() >= 2 &&
1708 "Expected at least tid and bounded tid as arguments");
1709 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1710
1711 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1712 CI->getParent()->setName("omp_parallel");
1713 Builder.SetInsertPoint(CI);
1714
1715 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1716 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1717 &OutlinedFn};
1718
1719 SmallVector<Value *, 16> RealArgs;
1720 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1721 if (IfCondition) {
1722 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1723 RealArgs.push_back(Cond);
1724 }
1725 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1726
1727 // __kmpc_fork_call_if always expects a void ptr as the last argument
1728 // If there are no arguments, pass a null pointer.
1729 auto PtrTy = OMPIRBuilder->VoidPtr;
1730 if (IfCondition && NumCapturedVars == 0) {
1731 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1732 RealArgs.push_back(NullPtrValue);
1733 }
1734
1735 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1736
1737 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1738 << *Builder.GetInsertBlock()->getParent() << "\n");
1739
1740 // Initialize the local TID stack location with the argument value.
1741 Builder.SetInsertPoint(PrivTID);
1742 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1743 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1744 PrivTIDAddr);
1745
1746 // Remove redundant call to the outlined function.
1747 CI->eraseFromParent();
1748
1749 for (Instruction *I : ToBeDeleted) {
1750 I->eraseFromParent();
1751 }
1752}
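// The net effect is that the direct call to the outlined function is replaced
// by something like the following (pseudo-IR, assuming two captured pointers):
//
//   call void (ptr, i32, ptr, ...)
//       @__kmpc_fork_call(ptr %ident, i32 2, ptr @outlined_fn,
//                         ptr %captured0, ptr %captured1)
//
// With an if clause, __kmpc_fork_call_if is used instead; the sign-extended
// condition follows the microtask and a null pointer is appended when there
// are no captured variables.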
1753
1754OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createParallel(
1755 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
1756 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB,
1757 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition,
1758 Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) {
1759 assert(!isConflictIP(Loc.IP, OuterAllocIP) && "IPs must not be ambiguous");
1760
1761 if (!updateToLocation(Loc))
1762 return Loc.IP;
1763
1764 uint32_t SrcLocStrSize;
1765 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1766 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1767 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1768 (ProcBind != OMP_PROC_BIND_default);
1769 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1770 // If we generate code for the target device, we need to allocate
1771 // struct for aggregate params in the device default alloca address space.
1772 // OpenMP runtime requires that the params of the extracted functions are
1773 // passed as zero address space pointers. This flag ensures that extracted
1774 // function arguments are declared in zero address space
1775 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1776
1777 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1778 // only if we compile for host side.
1779 if (NumThreads && !Config.isTargetDevice()) {
1780 Value *Args[] = {
1781 Ident, ThreadID,
1782 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1783 createRuntimeFunctionCall(
1784 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1785 }
1786
1787 if (ProcBind != OMP_PROC_BIND_default) {
1788 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1789 Value *Args[] = {
1790 Ident, ThreadID,
1791 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1792 createRuntimeFunctionCall(
1793 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1794 }
1795
1796 BasicBlock *InsertBB = Builder.GetInsertBlock();
1797 Function *OuterFn = InsertBB->getParent();
1798
1799 // Save the outer alloca block because the insertion iterator may get
1800 // invalidated and we still need this later.
1801 BasicBlock *OuterAllocaBlock = OuterAllocIP.getBlock();
1802
1803 // Vector to remember instructions we used only during the modeling but which
1804 // we want to delete at the end.
1805 SmallVector<Instruction *, 4> ToBeDeleted;
1806
1807 // Change the location to the outer alloca insertion point to create and
1808 // initialize the allocas we pass into the parallel region.
1809 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1810 Builder.restoreIP(NewOuter);
1811 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1812 AllocaInst *ZeroAddrAlloca =
1813 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1814 Instruction *TIDAddr = TIDAddrAlloca;
1815 Instruction *ZeroAddr = ZeroAddrAlloca;
1816 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1817 // Add additional casts to enforce pointers in zero address space
1818 TIDAddr = new AddrSpaceCastInst(
1819 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1820 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1821 ToBeDeleted.push_back(TIDAddr);
1822 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1823 PointerType::get(M.getContext(), 0),
1824 "zero.addr.ascast");
1825 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1826 ToBeDeleted.push_back(ZeroAddr);
1827 }
1828
1829 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1830 // associated arguments in the outlined function, so we delete them later.
1831 ToBeDeleted.push_back(TIDAddrAlloca);
1832 ToBeDeleted.push_back(ZeroAddrAlloca);
1833
1834 // Create an artificial insertion point that will also ensure the blocks we
1835 // are about to split are not degenerated.
1836 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1837
1838 BasicBlock *EntryBB = UI->getParent();
1839 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1840 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1841 BasicBlock *PRegPreFiniBB =
1842 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1843 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1844
1845 auto FiniCBWrapper = [&](InsertPointTy IP) {
1846 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1847 // target to the region exit block.
1848 if (IP.getBlock()->end() == IP.getPoint()) {
1849 IRBuilder<>::InsertPointGuard IPG(Builder);
1850 Builder.restoreIP(IP);
1851 Instruction *I = Builder.CreateBr(PRegExitBB);
1852 IP = InsertPointTy(I->getParent(), I->getIterator());
1853 }
1854 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1855 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1856 "Unexpected insertion point for finalization call!");
1857 return FiniCB(IP);
1858 };
1859
1860 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1861
1862 // Generate the privatization allocas in the block that will become the entry
1863 // of the outlined function.
1864 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1865 InsertPointTy InnerAllocaIP = Builder.saveIP();
1866
1867 AllocaInst *PrivTIDAddr =
1868 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1869 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1870
1871 // Add some fake uses for OpenMP provided arguments.
1872 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1873 Instruction *ZeroAddrUse =
1874 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1875 ToBeDeleted.push_back(ZeroAddrUse);
1876
1877 // EntryBB
1878 // |
1879 // V
1880 // PRegionEntryBB <- Privatization allocas are placed here.
1881 // |
1882 // V
1883 // PRegionBodyBB <- BodyGen is invoked here.
1884 // |
1885 // V
1886 // PRegPreFiniBB <- The block we will start finalization from.
1887 // |
1888 // V
1889 // PRegionExitBB <- A common exit to simplify block collection.
1890 //
1891
1892 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1893
1894 // Let the caller create the body.
1895 assert(BodyGenCB && "Expected body generation callback!");
1896 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1897 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP, PRegExitBB))
1898 return Err;
1899
1900 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1901
1902 // If OuterFn is a Generic kernel, we need to use device shared memory to
1903 // allocate argument structures. Otherwise, we use stack allocations as usual.
1904 bool UsesDeviceSharedMemory =
1905 Config.isTargetDevice() && isGenericKernel(*OuterFn);
1906 std::unique_ptr<OutlineInfo> OI =
1907 UsesDeviceSharedMemory
1908 ? std::make_unique<DeviceSharedMemOutlineInfo>(*this)
1909 : std::make_unique<OutlineInfo>();
1910
1911 if (Config.isTargetDevice()) {
1912 // Generate OpenMP target specific runtime call
1913 OI->PostOutlineCB = [=, ToBeDeletedVec =
1914 std::move(ToBeDeleted)](Function &OutlinedFn) {
1915 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1916 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1917 ThreadID, ToBeDeletedVec);
1918 };
1919 } else {
1920 // Generate OpenMP host runtime call
1921 OI->PostOutlineCB = [=, ToBeDeletedVec =
1922 std::move(ToBeDeleted)](Function &OutlinedFn) {
1923 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1924 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1925 };
1926 }
1927
1928 OI->FixUpNonEntryAllocas = true;
1929 OI->OuterAllocBB = OuterAllocaBlock;
1930 OI->EntryBB = PRegEntryBB;
1931 OI->ExitBB = PRegExitBB;
1932 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
1933 copy(OuterDeallocBlocks, OI->OuterDeallocBBs.end());
1934
1935 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1936 SmallVector<BasicBlock *, 32> Blocks;
1937 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
1938
1939 CodeExtractorAnalysisCache CEAC(*OuterFn);
1940 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1941 /* AggregateArgs */ false,
1942 /* BlockFrequencyInfo */ nullptr,
1943 /* BranchProbabilityInfo */ nullptr,
1944 /* AssumptionCache */ nullptr,
1945 /* AllowVarArgs */ true,
1946 /* AllowAlloca */ true,
1947 /* AllocationBlock */ OuterAllocaBlock,
1948 /* DeallocationBlocks */ {},
1949 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1950
1951 // Find inputs to, outputs from the code region.
1952 BasicBlock *CommonExit = nullptr;
1953 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1954 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1955
1956 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1957 /*CollectGlobalInputs=*/true);
1958
1959 Inputs.remove_if([&](Value *I) {
1960 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1961 return GV->getValueType() == OpenMPIRBuilder::Ident;
1962
1963 return false;
1964 });
1965
1966 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1967
1968 FunctionCallee TIDRTLFn =
1969 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1970
1971 auto PrivHelper = [&](Value &V) -> Error {
1972 if (&V == TIDAddr || &V == ZeroAddr) {
1973 OI->ExcludeArgsFromAggregate.push_back(&V);
1974 return Error::success();
1975 }
1976
1978 for (Use &U : V.uses())
1979 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1980 if (ParallelRegionBlockSet.count(UserI->getParent()))
1981 Uses.insert(&U);
1982
1983 // __kmpc_fork_call expects extra arguments as pointers. If the input
1984 // already has a pointer type, everything is fine. Otherwise, store the
1985 // value onto stack and load it back inside the to-be-outlined region. This
1986 // will ensure only the pointer will be passed to the function.
1987 // FIXME: if there are more than 15 trailing arguments, they must be
1988 // additionally packed in a struct.
1989 Value *Inner = &V;
1990 if (!V.getType()->isPointerTy()) {
1992 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1993
1994 Builder.restoreIP(OuterAllocIP);
1995 Value *Ptr;
1996 if (UsesDeviceSharedMemory) {
1997 // Use device shared memory instead, if needed.
1998 Ptr = createOMPAllocShared(OuterAllocIP, V.getType(),
1999 V.getName() + ".reloaded");
2000 for (BasicBlock *DeallocBlock : OuterDeallocBlocks)
2002 InsertPointTy(DeallocBlock, DeallocBlock->getFirstInsertionPt()),
2003 Ptr, V.getType());
2004 } else {
2005 Ptr = Builder.CreateAlloca(V.getType(), nullptr,
2006 V.getName() + ".reloaded");
2007 }
2008
2009 // Store to stack at end of the block that currently branches to the entry
2010 // block of the to-be-outlined region.
2011 Builder.SetInsertPoint(InsertBB,
2012 InsertBB->getTerminator()->getIterator());
2013 Builder.CreateStore(&V, Ptr);
2014
2015 // Load back next to allocations in the to-be-outlined region.
2016 Builder.restoreIP(InnerAllocaIP);
2017 Inner = Builder.CreateLoad(V.getType(), Ptr);
2018 }
2019
2020 Value *ReplacementValue = nullptr;
2021 CallInst *CI = dyn_cast<CallInst>(&V);
2022 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
2023 ReplacementValue = PrivTID;
2024 } else {
2025 InsertPointOrErrorTy AfterIP =
2026 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
2027 if (!AfterIP)
2028 return AfterIP.takeError();
2029 Builder.restoreIP(*AfterIP);
2030 InnerAllocaIP = {
2031 InnerAllocaIP.getBlock(),
2032 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
2033
2034 assert(ReplacementValue &&
2035 "Expected copy/create callback to set replacement value!");
2036 if (ReplacementValue == &V)
2037 return Error::success();
2038 }
2039
2040 for (Use *UPtr : Uses)
2041 UPtr->set(ReplacementValue);
2042
2043 return Error::success();
2044 };
2045
2046 // Reset the inner alloca insertion as it will be used for loading the values
2047 // wrapped into pointers before passing them into the to-be-outlined region.
2048 // Configure it to insert immediately after the fake use of zero address so
2049 // that they are available in the generated body and so that the
2050 // OpenMP-related values (thread ID and zero address pointers) remain leading
2051 // in the argument list.
2052 InnerAllocaIP = IRBuilder<>::InsertPoint(
2053 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
2054
2055 // Reset the outer alloca insertion point to the entry of the relevant block
2056 // in case it was invalidated.
2057 OuterAllocIP = IRBuilder<>::InsertPoint(
2058 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
2059
2060 for (Value *Input : Inputs) {
2061 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
2062 if (Error Err = PrivHelper(*Input))
2063 return Err;
2064 }
2065 LLVM_DEBUG({
2066 for (Value *Output : Outputs)
2067 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
2068 });
2069 assert(Outputs.empty() &&
2070 "OpenMP outlining should not produce live-out values!");
2071
2072 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
2073 LLVM_DEBUG({
2074 for (auto *BB : Blocks)
2075 dbgs() << " PBR: " << BB->getName() << "\n";
2076 });
2077
2078 // Adjust the finalization stack, verify the adjustment, and call the
2079 // finalize function a last time to finalize values between the pre-fini
2080 // block and the exit block if we left the parallel "the normal way".
2081 auto FiniInfo = FinalizationStack.pop_back_val();
2082 (void)FiniInfo;
2083 assert(FiniInfo.DK == OMPD_parallel &&
2084 "Unexpected finalization stack state!");
2085
2086 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
2087
2088 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
2089 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
2090 if (!FiniBBOrErr)
2091 return FiniBBOrErr.takeError();
2092 {
2093 IRBuilder<>::InsertPointGuard Guard(Builder);
2094 Builder.restoreIP(PreFiniIP);
2095 Builder.CreateBr(*FiniBBOrErr);
2096 // There's currently a branch to omp.par.exit. Delete it. We will get there
2097 // via the fini block
2098 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
2099 Term->eraseFromParent();
2100 }
2101
2102 // Register the outlined info.
2103 addOutlineInfo(std::move(OI));
2104
2105 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
2106 UI->eraseFromParent();
2107
2108 return AfterIP;
2109}
2110
2111void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
2112 // Build call void __kmpc_flush(ident_t *loc)
2113 uint32_t SrcLocStrSize;
2114 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2115 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
2116
2117 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
2118 Args);
2119}
2120
2121void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
2122 if (!updateToLocation(Loc))
2123 return;
2124 emitFlush(Loc);
2125}
2126
2127void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
2128 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
2129 // global_tid);
2130 uint32_t SrcLocStrSize;
2131 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2132 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2133 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
2134
2135 // Ignore return result until untied tasks are supported.
2136 createRuntimeFunctionCall(
2137 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
2138}
2139
2145
2146void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
2147 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
2148 uint32_t SrcLocStrSize;
2149 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2150 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2151 Constant *I32Null = ConstantInt::getNullValue(Int32);
2152 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
2153
2154 createRuntimeFunctionCall(
2155 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
2156}
2157
2163
2164void OpenMPIRBuilder::emitTaskDependency(IRBuilderBase &Builder, Value *Entry,
2165 const DependData &Dep) {
2166 // Store the pointer to the variable
2167 Value *Addr = Builder.CreateStructGEP(
2168 DependInfo, Entry,
2169 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2170 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, SizeTy);
2171 Builder.CreateStore(DepValPtr, Addr);
2172 // Store the size of the variable
2173 Value *Size = Builder.CreateStructGEP(
2174 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Len));
2175 Builder.CreateStore(
2176 ConstantInt::get(SizeTy,
2177 M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2178 Size);
2179 // Store the dependency kind
2180 Value *Flags = Builder.CreateStructGEP(
2181 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Flags));
2182 Builder.CreateStore(ConstantInt::get(Builder.getInt8Ty(),
2183 static_cast<unsigned int>(Dep.DepKind)),
2184 Flags);
2185}
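// Taken together, the three stores above fill one dependence entry with the
// layout this builder assumes for the runtime's kmp_depend_info (a sketch,
// not a verbatim copy of kmp.h):
//
// \code{c}
// struct kmp_depend_info {
//   intptr_t base_addr; // ptrtoint of the dependent variable's address
//   size_t   len;       // store size of the variable's type in bytes
//   uint8_t  flags;     // the Dep.DepKind value (in, out, inout, ...)
// };
// \endcode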
2186
2187// Processes the dependencies in Dependencies and does the following
2188// - Allocates space on the stack for an array of DependInfo objects
2189// - Populates each DependInfo object with relevant information about
2190// the corresponding dependence.
2191// - All code is inserted in the entry block of the current function.
2193 OpenMPIRBuilder &OMPBuilder,
2195 // Early return if we have no dependencies to process
2196 if (Dependencies.empty())
2197 return nullptr;
2198
2199 // Given a vector of DependData objects, in this function we create an
2200 // array on the stack that holds kmp_depend_info objects corresponding
2201 // to each dependency. This is then passed to the OpenMP runtime.
2202 // For example, if there are 'n' dependencies then the following pseudo
2203 // code is generated. Assume the first dependence is on a variable 'a'
2204 //
2205 // \code{c}
2206 // DepArray = alloc(n x sizeof(kmp_depend_info));
2207 // idx = 0;
2208 // DepArray[idx].base_addr = ptrtoint(&a);
2209 // DepArray[idx].len = 8;
2210 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
2211 // ++idx;
2212 // DepArray[idx].base_addr = ...;
2213 // \endcode
2214
2215 IRBuilderBase &Builder = OMPBuilder.Builder;
2216 Type *DependInfo = OMPBuilder.DependInfo;
2217
2218 Value *DepArray = nullptr;
2219 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2220 Builder.SetInsertPoint(
2221 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
2222
2223 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2224 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2225
2226 Builder.restoreIP(OldIP);
2227
2228 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2229 Value *Base =
2230 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2231 OMPBuilder.emitTaskDependency(Builder, Base, Dep);
2232 }
2233 return DepArray;
2234}
2235
2236/// Create the task duplication function passed to kmpc_taskloop.
2237Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2238 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2239 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2240 if (!DupCB)
2242 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2243
2244 // From OpenMP Runtime p_task_dup_t:
2245 // Routine optionally generated by the compiler for setting the lastprivate
2246 // flag and calling needed constructors for private/firstprivate objects (used
2247 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2248 // lastprivate flag.
2249 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2250
2251 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2252
2253 FunctionType *DupFuncTy = FunctionType::get(
2254 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2255 /*isVarArg=*/false);
2256
2257 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2258 "omp_taskloop_dup", M);
2259 Value *DestTaskArg = DupFunction->getArg(0);
2260 Value *SrcTaskArg = DupFunction->getArg(1);
2261 Value *LastprivateFlagArg = DupFunction->getArg(2);
2262 DestTaskArg->setName("dest_task");
2263 SrcTaskArg->setName("src_task");
2264 LastprivateFlagArg->setName("lastprivate_flag");
2265
2266 IRBuilderBase::InsertPointGuard Guard(Builder);
2267 Builder.SetInsertPoint(
2268 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2269
2270 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2271 Type *TaskWithPrivatesTy =
2272 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2273 Value *TaskPrivates = Builder.CreateGEP(
2274 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2275 Value *ContextPtr = Builder.CreateGEP(
2276 PrivatesTy, TaskPrivates,
2277 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2278 return ContextPtr;
2279 };
2280
2281 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2282 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2283
2284 DestTaskContextPtr->setName("destPtr");
2285 SrcTaskContextPtr->setName("srcPtr");
2286
2287 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2288 DupFunction->getEntryBlock().begin());
2289 InsertPointTy CodeGenIP = Builder.saveIP();
2290 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2291 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2292 if (!AfterIPOrError)
2293 return AfterIPOrError.takeError();
2294 Builder.restoreIP(*AfterIPOrError);
2295
2296 Builder.CreateRetVoid();
2297
2298 return DupFunction;
2299}
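// Schematically, the duplication routine generated above looks like this
// (a sketch; the field reached by the GEP chain in GetTaskContextPtrFromArg
// is the one at PrivatesIndex, called "ctx" here for illustration):
//
// \code{c}
// void omp_taskloop_dup(kmp_task_t *dest_task, kmp_task_t *src_task,
//                       kmp_int32 lastprivate_flag) {
//   destPtr = &((task_with_privates *)dest_task)->privates.ctx;
//   srcPtr  = &((task_with_privates *)src_task)->privates.ctx;
//   // DupCB-generated body: copy/construct privates from srcPtr to destPtr
// }
// \endcode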
2300
2301OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2302 const LocationDescription &Loc, InsertPointTy AllocaIP,
2303 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2304 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2305 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2306 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2307 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2308 Value *TaskContextStructPtrVal) {
2309
2310 if (!updateToLocation(Loc))
2311 return InsertPointTy();
2312
2313 uint32_t SrcLocStrSize;
2314 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2315 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2316
2317 BasicBlock *TaskloopExitBB =
2318 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2319 BasicBlock *TaskloopBodyBB =
2320 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2321 BasicBlock *TaskloopAllocaBB =
2322 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2323
2324 InsertPointTy TaskloopAllocaIP =
2325 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2326 InsertPointTy TaskloopBodyIP =
2327 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2328
2329 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP, TaskloopExitBB))
2330 return Err;
2331
2332 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2333 if (!result) {
2334 return result.takeError();
2335 }
2336
2337 llvm::CanonicalLoopInfo *CLI = result.get();
2338 auto OI = std::make_unique<OutlineInfo>();
2339 OI->EntryBB = TaskloopAllocaBB;
2340 OI->OuterAllocBB = AllocaIP.getBlock();
2341 OI->ExitBB = TaskloopExitBB;
2342 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2343 copy(DeallocBlocks, OI->OuterDeallocBBs.end());
2344
2345 // Add the thread ID argument.
2346 SmallVector<Instruction *> ToBeDeleted;
2347 // dummy instruction to be used as a fake argument
2348 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2349 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2350 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2351 TaskloopAllocaIP, "lb", false, true);
2352 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2353 TaskloopAllocaIP, "ub", false, true);
2354 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2355 TaskloopAllocaIP, "step", false, true);
2356 // For Taskloop, we want to force the bounds to be the first 3 inputs in the
2357 // aggregate struct
2358 OI->Inputs.insert(FakeLB);
2359 OI->Inputs.insert(FakeUB);
2360 OI->Inputs.insert(FakeStep);
2361 if (TaskContextStructPtrVal)
2362 OI->Inputs.insert(TaskContextStructPtrVal);
2363 assert(((TaskContextStructPtrVal && DupCB) ||
2364 (!TaskContextStructPtrVal && !DupCB)) &&
2365 "Task context struct ptr and duplication callback must be both set "
2366 "or both null");
2367
2368 // It isn't safe to run the duplication bodygen callback inside the post
2369 // outlining callback so this has to be run now before we know the real task
2370 // shareds structure type.
2371 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2372 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2373 Type *FakeSharedsTy = StructType::get(
2374 Builder.getContext(),
2375 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2376 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2377 FakeSharedsTy,
2378 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2379 if (!TaskDupFnOrErr) {
2380 return TaskDupFnOrErr.takeError();
2381 }
2382 Value *TaskDupFn = *TaskDupFnOrErr;
2383
2384 OI->PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2385 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2386 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2387 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2388 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2389 // Replace the Stale CI by appropriate RTL function call.
2390 assert(OutlinedFn.hasOneUse() &&
2391 "there must be a single user for the outlined function");
2392 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2393
2394 /* Create casts of the bounds values that can be used when outlining
2395 * to replace the uses of the fake values with the real ones */
2396 BasicBlock *CodeReplBB = StaleCI->getParent();
2397 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2398 Value *CastedLBVal =
2399 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2400 Value *CastedUBVal =
2401 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2402 Value *CastedStepVal =
2403 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2404
2405 Builder.SetInsertPoint(StaleCI);
2406
2407 // Gather the arguments for emitting the runtime call for
2408 // @__kmpc_omp_task_alloc
2409 Function *TaskAllocFn =
2410 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2411
2412 Value *ThreadID = getOrCreateThreadID(Ident);
2413
2414 if (!NoGroup) {
2415 // Emit runtime call for @__kmpc_taskgroup
2416 Function *TaskgroupFn =
2417 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2418 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2419 }
2420
2421 // `flags` Argument Configuration
2422 // Task is tied if (Flags & 1) == 1.
2423 // Task is untied if (Flags & 1) == 0.
2424 // Task is final if (Flags & 2) == 2.
2425 // Task is not final if (Flags & 2) == 0.
2426 // Task is mergeable if (Flags & 4) == 4.
2427 // Task is not mergeable if (Flags & 4) == 0.
2428 // Task is priority if (Flags & 32) == 32.
2429 // Task is not priority if (Flags & 32) == 0.
2430 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2431 if (Final)
2432 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2433 if (Mergeable)
2434 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2435 if (Priority)
2436 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
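// Example: a tied, final, priority task ends up with Flags = 1 | 2 | 32 = 35,
// while an untied task with no other modifiers passes Flags = 0.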
2437
2438 Value *TaskSize = Builder.getInt64(
2439 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2440
2441 AllocaInst *ArgStructAlloca =
2442 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2443 assert(ArgStructAlloca &&
2444 "Unable to find the alloca instruction corresponding to arguments "
2445 "for extracted function");
2446 std::optional<TypeSize> ArgAllocSize =
2447 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2448 assert(ArgAllocSize &&
2449 "Unable to determine size of arguments for extracted function");
2450 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2451
2452 // Emit the @__kmpc_omp_task_alloc runtime call
2453 // The runtime call returns a pointer to an area where the task captured
2454 // variables must be copied before the task is run (TaskData)
2455 CallInst *TaskData = Builder.CreateCall(
2456 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2457 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2458 /*task_func=*/&OutlinedFn});
2459
2460 Value *Shareds = StaleCI->getArgOperand(1);
2461 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2462 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2463 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2464 SharedsSize);
2465 // Get the pointers to loop lb, ub, step from the task ptr
2466 // and set up the lower bound, upper bound and step values
2467 llvm::Value *Lb = Builder.CreateGEP(
2468 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2469
2470 llvm::Value *Ub = Builder.CreateGEP(
2471 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2472
2473 llvm::Value *Step = Builder.CreateGEP(
2474 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2475 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2476
2477 // set up the arguments for emitting kmpc_taskloop runtime call
2478 // setting values for ifval, nogroup, sched, grainsize, task_dup
2479 Value *IfCondVal =
2480 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2481 : Builder.getInt32(1);
2482 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2483 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2484 Value *NoGroupVal = Builder.getInt32(1);
2485 Value *SchedVal = Builder.getInt32(Sched);
2486 Value *GrainSizeVal =
2487 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2488 : Builder.getInt64(0);
2489 Value *TaskDup = TaskDupFn;
2490
2491 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2492 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2493
2494 // taskloop runtime call
2495 Function *TaskloopFn =
2496 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2497 Builder.CreateCall(TaskloopFn, Args);
2498
2499 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2500 // nogroup is not defined
2501 if (!NoGroup) {
2502 Function *EndTaskgroupFn =
2503 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2504 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2505 }
2506
2507 StaleCI->eraseFromParent();
2508
2509 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2510
2511 LoadInst *SharedsOutlined =
2512 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2513 OutlinedFn.getArg(1)->replaceUsesWithIf(
2514 SharedsOutlined,
2515 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2516
2517 Value *IV = CLI->getIndVar();
2518 Type *IVTy = IV->getType();
2519 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2520
2521 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2522 // UpperBound. These GEP's can be reused for loading the tasks respective
2523 // bounds.
2524 Value *TaskLB = nullptr;
2525 Value *TaskUB = nullptr;
2526 Value *TaskStep = nullptr;
2527 Value *LoadTaskLB = nullptr;
2528 Value *LoadTaskUB = nullptr;
2529 Value *LoadTaskStep = nullptr;
2530 for (Instruction &I : *TaskloopAllocaBB) {
2531 if (I.getOpcode() == Instruction::GetElementPtr) {
2532 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2533 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2534 switch (CI->getZExtValue()) {
2535 case 0:
2536 TaskLB = &I;
2537 break;
2538 case 1:
2539 TaskUB = &I;
2540 break;
2541 case 2:
2542 TaskStep = &I;
2543 break;
2544 }
2545 }
2546 } else if (I.getOpcode() == Instruction::Load) {
2547 LoadInst &Load = cast<LoadInst>(I);
2548 if (Load.getPointerOperand() == TaskLB) {
2549 assert(TaskLB != nullptr && "Expected value for TaskLB");
2550 LoadTaskLB = &I;
2551 } else if (Load.getPointerOperand() == TaskUB) {
2552 assert(TaskUB != nullptr && "Expected value for TaskUB");
2553 LoadTaskUB = &I;
2554 } else if (Load.getPointerOperand() == TaskStep) {
2555 assert(TaskStep != nullptr && "Expected value for TaskStep");
2556 LoadTaskStep = &I;
2557 }
2558 }
2559 }
2560
2561 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2562
2563 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2564 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2565 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
2566 Value *TripCountMinusOne = Builder.CreateSDiv(
2567 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
2568 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2569 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2570 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2571 // set the trip count in the CLI
2572 CLI->setTripCount(CastedTripCount);
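// Worked example: for a task chunk with lb = 0, ub = 8 and step = 2 the code
// above computes (8 - 0) / 2 + 1 = 5, so the canonical loop executes five
// iterations which, after the lower-bound adjustment below, map to the user
// indices 0, 2, 4, 6 and 8.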
2573
2574 Builder.SetInsertPoint(CLI->getBody(),
2575 CLI->getBody()->getFirstInsertionPt());
2576
2577 if (NumOfCollapseLoops > 1) {
2578 llvm::SmallVector<User *> UsersToReplace;
2579 // When using the collapse clause, the bounds of the loop have to be
2580 // adjusted to properly represent the iterator of the outer loop.
2581 Value *IVPlusTaskLB = Builder.CreateAdd(
2582 CLI->getIndVar(),
2583 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2584 // To ensure every Use is correctly captured, we first want to record
2585 // which users to replace the value in, and then replace the value.
2586 for (auto IVUse = CLI->getIndVar()->uses().begin();
2587 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2588 User *IVUser = IVUse->getUser();
2589 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2590 if (Op->getOpcode() == Instruction::URem ||
2591 Op->getOpcode() == Instruction::UDiv) {
2592 UsersToReplace.push_back(IVUser);
2593 }
2594 }
2595 }
2596 for (User *User : UsersToReplace) {
2597 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2598 }
2599 } else {
2600 // The canonical loop is generated with a fixed lower bound. We need to
2601 // update the index calculation code to use the task's lower bound. The
2602 // generated code looks like this:
2603 // %omp_loop.iv = phi ...
2604 // ...
2605 // %tmp = mul [type] %omp_loop.iv, step
2606 // %user_index = add [type] tmp, lb
2607 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2608 // of the normalised induction variable:
2609 // 1. This one: converting the normalised IV to the user IV
2610 // 2. The increment (add)
2611 // 3. The comparison against the trip count (icmp)
2612 // (1) is the only use that is a mul followed by an add so this cannot
2613 // match other IR.
2614 assert(CLI->getIndVar()->getNumUses() == 3 &&
2615 "Canonical loop should have exactly three uses of the ind var");
2616 for (User *IVUser : CLI->getIndVar()->users()) {
2617 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2618 if (Mul->getOpcode() == Instruction::Mul) {
2619 for (User *MulUser : Mul->users()) {
2620 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2621 if (Add->getOpcode() == Instruction::Add) {
2622 Add->setOperand(1, CastedTaskLB);
2623 }
2624 }
2625 }
2626 }
2627 }
2628 }
2629 }
2630
2631 FakeLB->replaceAllUsesWith(CastedLBVal);
2632 FakeUB->replaceAllUsesWith(CastedUBVal);
2633 FakeStep->replaceAllUsesWith(CastedStepVal);
2634 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2635 I->eraseFromParent();
2636 }
2637 };
2638
2639 addOutlineInfo(std::move(OI));
2640 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2641 return Builder.saveIP();
2642}
2643
2646 M.getContext(), M.getDataLayout().getPointerSizeInBits());
2647 return llvm::StructType::get(IntPtrTy, IntPtrTy,
2648 llvm::Type::getInt32Ty(M.getContext()));
2649}
2650
2652 const LocationDescription &Loc, InsertPointTy AllocaIP,
2653 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2654 bool Tied, Value *Final, Value *IfCondition,
2655 const DependenciesInfo &Dependencies, const AffinityData &Affinities,
2656 bool Mergeable, Value *EventHandle, Value *Priority) {
2657
2658 if (!updateToLocation(Loc))
2659 return InsertPointTy();
2660
2661 uint32_t SrcLocStrSize;
2662 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2663 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2664 // The current basic block is split into four basic blocks. After outlining,
2665 // they will be mapped as follows:
2666 // ```
2667 // def current_fn() {
2668 // current_basic_block:
2669 // br label %task.exit
2670 // task.exit:
2671 // ; instructions after task
2672 // }
2673 // def outlined_fn() {
2674 // task.alloca:
2675 // br label %task.body
2676 // task.body:
2677 // ret void
2678 // }
2679 // ```
2680 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2681 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2682 BasicBlock *TaskAllocaBB =
2683 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2684
2685 InsertPointTy TaskAllocaIP =
2686 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2687 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2688 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP, TaskExitBB))
2689 return Err;
2690
2691 auto OI = std::make_unique<OutlineInfo>();
2692 OI->EntryBB = TaskAllocaBB;
2693 OI->OuterAllocBB = AllocaIP.getBlock();
2694 OI->ExitBB = TaskExitBB;
2695 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2696 copy(DeallocBlocks, OI->OuterDeallocBBs.end());
2697
2698 // Add the thread ID argument.
2699 SmallVector<Instruction *, 4> ToBeDeleted;
2700 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2701 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2702
2703 OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2704 Affinities, Mergeable, Priority, EventHandle,
2705 TaskAllocaBB,
2706 ToBeDeleted](Function &OutlinedFn) mutable {
2707 // Replace the Stale CI by appropriate RTL function call.
2708 assert(OutlinedFn.hasOneUse() &&
2709 "there must be a single user for the outlined function");
2710 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2711
2712 // HasShareds is true if any variables are captured in the outlined region,
2713 // false otherwise.
2714 bool HasShareds = StaleCI->arg_size() > 1;
2715 Builder.SetInsertPoint(StaleCI);
2716
2717 // Gather the arguments for emitting the runtime call for
2718 // @__kmpc_omp_task_alloc
2719 Function *TaskAllocFn =
2720 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2721
2722 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task
2723 // allocation call.
2724 Value *ThreadID = getOrCreateThreadID(Ident);
2725
2726 // Argument - `flags`
2727 // Task is tied iff (Flags & 1) == 1.
2728 // Task is untied iff (Flags & 1) == 0.
2729 // Task is final iff (Flags & 2) == 2.
2730 // Task is not final iff (Flags & 2) == 0.
2731 // Task is mergeable or merged-if0 iff (Flags & 4) == 4.
2732 // Task is neither mergeable nor merged-if0 iff (Flags & 4) == 0.
2733 // Task is detachable iff (Flags & 64) == 64.
2734 // Task is not detachable iff (Flags & 64) == 0.
2735 // Task is priority iff (Flags & 32) == 32.
2736 // Task is not priority iff (Flags & 32) == 0.
2737 // TODO: Handle the other flags.
2738 Value *Flags = Builder.getInt32(Tied);
2739 auto *ConstIfCondition = dyn_cast_or_null<ConstantInt>(IfCondition);
2740 bool UseMergedIf0Path = ConstIfCondition && ConstIfCondition->isZero();
2741 if (Final) {
2742 Value *FinalFlag =
2743 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2744 Flags = Builder.CreateOr(FinalFlag, Flags);
2745 }
2746
2747 if (Mergeable || UseMergedIf0Path)
2748 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2749 if (EventHandle)
2750 Flags = Builder.CreateOr(Builder.getInt32(64), Flags);
2751 if (Priority)
2752 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2753
2754 // Argument - `sizeof_kmp_task_t` (TaskSize)
2755 // Tasksize refers to the size in bytes of kmp_task_t data structure
2756 // including private vars accessed in task.
2757 // TODO: add kmp_task_t_with_privates (privates)
2758 Value *TaskSize = Builder.getInt64(
2759 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2760
2761 // Argument - `sizeof_shareds` (SharedsSize)
2762 // SharedsSize refers to the shareds array size in the kmp_task_t data
2763 // structure.
2764 Value *SharedsSize = Builder.getInt64(0);
2765 if (HasShareds) {
2766 AllocaInst *ArgStructAlloca =
2767 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2768 assert(ArgStructAlloca &&
2769 "Unable to find the alloca instruction corresponding to arguments "
2770 "for extracted function");
2771 std::optional<TypeSize> ArgAllocSize =
2772 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2773 assert(ArgAllocSize &&
2774 "Unable to determine size of arguments for extracted function");
2775 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2776 }
2777 // Emit the @__kmpc_omp_task_alloc runtime call
2778 // The runtime call returns a pointer to an area where the task captured
2779 // variables must be copied before the task is run (TaskData)
2780 CallInst *TaskData = createRuntimeFunctionCall(
2781 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2782 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2783 /*task_func=*/&OutlinedFn});
2784
2785 if (Affinities.Count && Affinities.Info) {
2786 Function *RegAffFn = getOrCreateRuntimeFunctionPtr(
2787 OMPRTL___kmpc_omp_reg_task_with_affinity);
2788
2789 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2790 Affinities.Count, Affinities.Info});
2791 }
2792
2793 // Emit detach clause initialization.
2794 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2795 // task_descriptor);
2796 if (EventHandle) {
2797 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2798 OMPRTL___kmpc_task_allow_completion_event);
2799 llvm::Value *EventVal =
2800 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2801 llvm::Value *EventHandleAddr =
2802 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2803 Builder.getPtrTy(0));
2804 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2805 Builder.CreateStore(EventVal, EventHandleAddr);
2806 }
2807 // Copy the arguments for outlined function
2808 if (HasShareds) {
2809 Value *Shareds = StaleCI->getArgOperand(1);
2810 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2811 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2812 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2813 SharedsSize);
2814 }
2815
2816 if (Priority) {
2817 //
2818 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2819 // we populate the priority information into the "kmp_task_t" here
2820 //
2821 // The struct "kmp_task_t" definition is available in kmp.h
2822 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2823 // data2 is used for priority
2824 //
2825 Type *Int32Ty = Builder.getInt32Ty();
2826 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2827 // kmp_task_t* => { ptr }
2828 Type *TaskPtr = StructType::get(VoidPtr);
2829 Value *TaskGEP =
2830 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2831 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2832 Type *TaskStructType = StructType::get(
2833 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2834 Value *PriorityData = Builder.CreateInBoundsGEP(
2835 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2836 // kmp_cmplrdata_t => { ptr, ptr }
2837 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2838 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2839 PriorityData, {Zero, Zero});
2840 Builder.CreateStore(Priority, CmplrData);
2841 }
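// In other words, assuming the kmp_task_t layout sketched above, the priority
// value ends up in the first pointer-sized slot of data2 (element 4), roughly:
//
//   %task      = getelementptr { ptr }, ptr %task_data, i32 0, i32 0
//   %data2     = getelementptr { ptr, ptr, i32, ptr, ptr }, ptr %task, i32 0, i32 4
//   %cmplrdata = getelementptr { ptr, ptr }, ptr %data2, i32 0, i32 0
//   store i32 %priority, ptr %cmplrdata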
2842
2843 Value *DepArray = nullptr;
2844 Value *NumDeps = nullptr;
2845 if (Dependencies.DepArray) {
2846 DepArray = Dependencies.DepArray;
2847 NumDeps = Dependencies.NumDeps;
2848 } else if (!Dependencies.Deps.empty()) {
2849 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
2850 NumDeps = Builder.getInt32(Dependencies.Deps.size());
2851 }
2852
2853 // In the presence of the `if` clause, the following IR is generated:
2854 // ...
2855 // %data = call @__kmpc_omp_task_alloc(...)
2856 // br i1 %if_condition, label %then, label %else
2857 // then:
2858 // call @__kmpc_omp_task(...)
2859 // br label %exit
2860 // else:
2861 // ;; Wait for resolution of dependencies, if any, before
2862 // ;; beginning the task
2863 // call @__kmpc_omp_wait_deps(...)
2864 // call @__kmpc_omp_task_begin_if0(...)
2865 // call @outlined_fn(...)
2866 // call @__kmpc_omp_task_complete_if0(...)
2867 // br label %exit
2868 // exit:
2869 // ...
2870 if (IfCondition && !UseMergedIf0Path) {
2871 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2872 // terminator.
2873 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2874 Instruction *IfTerminator =
2875 Builder.GetInsertPoint()->getParent()->getTerminator();
2876 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2877 Builder.SetInsertPoint(IfTerminator);
2878 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2879 &ElseTI);
2880 Builder.SetInsertPoint(ElseTI);
2881
2882 if (DepArray) {
2883 Function *TaskWaitFn =
2884 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2885 createRuntimeFunctionCall(
2886 TaskWaitFn,
2887 {Ident, ThreadID, NumDeps, DepArray,
2888 ConstantInt::get(Builder.getInt32Ty(), 0),
2889 ConstantPointerNull::get(Builder.getPtrTy())});
2890 }
2891 Function *TaskBeginFn =
2892 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2893 Function *TaskCompleteFn =
2894 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2895 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2896 CallInst *CI = nullptr;
2897 if (HasShareds)
2898 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2899 else
2900 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2901 CI->setDebugLoc(StaleCI->getDebugLoc());
2902 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2903 Builder.SetInsertPoint(ThenTI);
2904 }
2905
2906 if (DepArray) {
2907 Function *TaskFn =
2908 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2909 createRuntimeFunctionCall(
2910 TaskFn,
2911 {Ident, ThreadID, TaskData, NumDeps, DepArray,
2912 ConstantInt::get(Builder.getInt32Ty(), 0),
2913 ConstantPointerNull::get(Builder.getPtrTy())});
2914
2915 } else {
2916 // Emit the @__kmpc_omp_task runtime call to spawn the task
2917 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2918 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2919 }
2920
2921 StaleCI->eraseFromParent();
2922
2923 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2924 if (HasShareds) {
2925 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2926 OutlinedFn.getArg(1)->replaceUsesWithIf(
2927 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2928 }
2929
2930 for (Instruction *I : llvm::reverse(ToBeDeleted))
2931 I->eraseFromParent();
2932 };
2933
2934 addOutlineInfo(std::move(OI));
2935 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2936
2937 return Builder.saveIP();
2938}
2939
2941 const LocationDescription &Loc, InsertPointTy AllocaIP,
2942 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB) {
2943 if (!updateToLocation(Loc))
2944 return InsertPointTy();
2945
2946 uint32_t SrcLocStrSize;
2947 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2948 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2949 Value *ThreadID = getOrCreateThreadID(Ident);
2950
2951 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2952 Function *TaskgroupFn =
2953 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2954 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2955
2956 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2957 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP(), DeallocBlocks))
2958 return Err;
2959
2960 Builder.SetInsertPoint(TaskgroupExitBB);
2961 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2962 Function *EndTaskgroupFn =
2963 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2964 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2965
2966 return Builder.saveIP();
2967}
2968
2969OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2970 const LocationDescription &Loc, InsertPointTy AllocaIP,
2971 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2972 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2973 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2974
2975 if (!updateToLocation(Loc))
2976 return Loc.IP;
2977
2978 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2979
2980 // Each section is emitted as a switch case
2981 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2982 // -> OMP.createSection() which generates the IR for each section
2983 // Iterate through all sections and emit a switch construct:
2984 // switch (IV) {
2985 // case 0:
2986 // <SectionStmt[0]>;
2987 // break;
2988 // ...
2989 // case <NumSection> - 1:
2990 // <SectionStmt[<NumSection> - 1]>;
2991 // break;
2992 // }
2993 // ...
2994 // section_loop.after:
2995 // <FiniCB>;
2996 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2997 Builder.restoreIP(CodeGenIP);
2998 BasicBlock *Continue =
2999 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
3000 Function *CurFn = Continue->getParent();
3001 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
3002
3003 unsigned CaseNumber = 0;
3004 for (auto SectionCB : SectionCBs) {
3005 BasicBlock *CaseBB = BasicBlock::Create(
3006 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
3007 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
3008 Builder.SetInsertPoint(CaseBB);
3009 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
3010 if (Error Err =
3011 SectionCB(InsertPointTy(),
3012 {CaseEndBr->getParent(), CaseEndBr->getIterator()}, {}))
3013 return Err;
3014 CaseNumber++;
3015 }
3016 // remove the existing terminator from body BB since there can be no
3017 // terminators after switch/case
3018 return Error::success();
3019 };
3020 // Loop body ends here
3021 // LowerBound, UpperBound, and Stride for createCanonicalLoop
3022 Type *I32Ty = Type::getInt32Ty(M.getContext());
3023 Value *LB = ConstantInt::get(I32Ty, 0);
3024 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
3025 Value *ST = ConstantInt::get(I32Ty, 1);
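// For example, with three section callbacks the canonical loop below runs its
// induction variable over 0, 1 and 2 (LB = 0, UB = 3, ST = 1), and each
// iteration dispatches through the switch above to the matching
// omp_section_loop.body.case block.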
3026 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
3027 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
3028 if (!LoopInfo)
3029 return LoopInfo.takeError();
3030
3031 InsertPointOrErrorTy WsloopIP =
3032 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
3033 WorksharingLoopType::ForStaticLoop, !IsNowait);
3034 if (!WsloopIP)
3035 return WsloopIP.takeError();
3036 InsertPointTy AfterIP = *WsloopIP;
3037
3038 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
3039 assert(LoopFini && "Bad structure of static workshare loop finalization");
3040
3041 // Apply the finalization callback in LoopAfterBB
3042 auto FiniInfo = FinalizationStack.pop_back_val();
3043 assert(FiniInfo.DK == OMPD_sections &&
3044 "Unexpected finalization stack state!");
3045 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
3046 return Err;
3047
3048 return AfterIP;
3049}
3050
3051OpenMPIRBuilder::InsertPointOrErrorTy
3052OpenMPIRBuilder::createSection(const LocationDescription &Loc,
3053 BodyGenCallbackTy BodyGenCB,
3054 FinalizeCallbackTy FiniCB) {
3055 if (!updateToLocation(Loc))
3056 return Loc.IP;
3057
3058 auto FiniCBWrapper = [&](InsertPointTy IP) {
3059 if (IP.getBlock()->end() != IP.getPoint())
3060 return FiniCB(IP);
3061 // This must be done otherwise any nested constructs using FinalizeOMPRegion
3062 // will fail because that function requires the Finalization Basic Block to
3063 // have a terminator, which is already removed by EmitOMPRegionBody.
3064 // IP is currently at the cancellation block.
3065 // We need to backtrack to the condition block to fetch
3066 // the exit block and create a branch from the cancellation
3067 // block to the exit block.
3069 Builder.restoreIP(IP);
3070 auto *CaseBB = Loc.IP.getBlock();
3071 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
3072 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
3073 Instruction *I = Builder.CreateBr(ExitBB);
3074 IP = InsertPointTy(I->getParent(), I->getIterator());
3075 return FiniCB(IP);
3076 };
3077
3078 Directive OMPD = Directive::OMPD_sections;
3079 // Since we are using Finalization Callback here, HasFinalize
3080 // and IsCancellable have to be true
3081 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
3082 /*Conditional*/ false, /*hasFinalize*/ true,
3083 /*IsCancellable*/ true);
3084}
3085
3091
3092Value *OpenMPIRBuilder::getGPUThreadID() {
3093 return createRuntimeFunctionCall(
3094 getOrCreateRuntimeFunction(M,
3095 OMPRTL___kmpc_get_hardware_thread_id_in_block),
3096 {});
3097}
3098
3099Value *OpenMPIRBuilder::getGPUWarpSize() {
3100 return createRuntimeFunctionCall(
3101 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
3102}
3103
3104Value *OpenMPIRBuilder::getNVPTXWarpID() {
3105 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3106 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
3107}
3108
3109Value *OpenMPIRBuilder::getNVPTXLaneID() {
3110 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3111 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
3112 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
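// Illustrative values (derived from the code above): with the common warp
// size of 32, LaneIDBits is 5 and LaneIDMask is 0x1f, so the lane id is
// thread_id & 0x1f, matching the thread_id >> 5 computed in getNVPTXWarpID().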
3113 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
3114 "nvptx_lane_id");
3115}
3116
3117Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
3118 Type *ToType) {
3119 Type *FromType = From->getType();
3120 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
3121 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
3122 assert(FromSize > 0 && "From size must be greater than zero");
3123 assert(ToSize > 0 && "To size must be greater than zero");
3124 if (FromType == ToType)
3125 return From;
3126 if (FromSize == ToSize)
3127 return Builder.CreateBitCast(From, ToType);
3128 if (ToType->isIntegerTy() && FromType->isIntegerTy())
3129 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
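// Fallback for mismatched sizes and kinds below: round-trip the value through
// a stack temporary of the destination type, i.e. store `From` into an alloca
// of `ToType` and load the result back as `ToType`.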
3130 InsertPointTy SaveIP = Builder.saveIP();
3131 Builder.restoreIP(AllocaIP);
3132 Value *CastItem = Builder.CreateAlloca(ToType);
3133 Builder.restoreIP(SaveIP);
3134
3135 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
3136 CastItem, Builder.getPtrTy(0));
3137 Builder.CreateStore(From, ValCastItem);
3138 return Builder.CreateLoad(ToType, CastItem);
3139}
3140
3141Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
3142 Value *Element,
3143 Type *ElementType,
3144 Value *Offset) {
3145 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
3146 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
3147
3148 // Cast all types to 32- or 64-bit values before calling shuffle routines.
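// For example, a 4-byte float is bitcast to i32 and moved through
// __kmpc_shuffle_int32, while an 8-byte value goes through
// __kmpc_shuffle_int64; castValueToType() performs the conversion.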
3149 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
3150 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
3151 Value *WarpSize =
3152 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
3153 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
3154 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
3155 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
3156 Value *WarpSizeCast =
3157 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
3158 Value *ShuffleCall =
3159 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
3160 return castValueToType(AllocaIP, ShuffleCall, CastTy);
3161}
3162
3163void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
3164 Value *DstAddr, Type *ElemType,
3165 Value *Offset, Type *ReductionArrayTy,
3166 bool IsByRefElem) {
3167 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
3168 // Create the loop over the big sized data.
3169 // ptr = (void*)Elem;
3170 // ptrEnd = (void*) Elem + 1;
3171 // Step = 8;
3172 // while (ptr + Step < ptrEnd)
3173 // shuffle((int64_t)*ptr);
3174 // Step = 4;
3175 // while (ptr + Step < ptrEnd)
3176 // shuffle((int32_t)*ptr);
3177 // ...
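// As a concrete example of the loop below: a 6-byte element is moved as one
// 4-byte shuffle followed by one 2-byte shuffle, while a 16-byte element takes
// two iterations of the 8-byte case, emitted as a small runtime loop whenever
// more than one chunk of a given size is needed.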
3178 Type *IndexTy = Builder.getIndexTy(
3179 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3180 Value *ElemPtr = DstAddr;
3181 Value *Ptr = SrcAddr;
3182 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
3183 if (Size < IntSize)
3184 continue;
3185 Type *IntType = Builder.getIntNTy(IntSize * 8);
3186 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3187 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
3188 Value *SrcAddrGEP =
3189 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
3190 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3191 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
3192
3193 Function *CurFunc = Builder.GetInsertBlock()->getParent();
3194 if ((Size / IntSize) > 1) {
3195 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
3196 SrcAddrGEP, Builder.getPtrTy());
3197 BasicBlock *PreCondBB =
3198 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
3199 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
3200 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
3201 BasicBlock *CurrentBB = Builder.GetInsertBlock();
3202 emitBlock(PreCondBB, CurFunc);
3203 PHINode *PhiSrc =
3204 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
3205 PhiSrc->addIncoming(Ptr, CurrentBB);
3206 PHINode *PhiDest =
3207 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
3208 PhiDest->addIncoming(ElemPtr, CurrentBB);
3209 Ptr = PhiSrc;
3210 ElemPtr = PhiDest;
3211 Value *PtrDiff = Builder.CreatePtrDiff(
3212 Builder.getInt8Ty(), PtrEnd,
3213 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
3214 Builder.CreateCondBr(
3215 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
3216 ExitBB);
3217 emitBlock(ThenBB, CurFunc);
3218 Value *Res = createRuntimeShuffleFunction(
3219 AllocaIP,
3220 Builder.CreateAlignedLoad(
3221 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3222 IntType, Offset);
3223 Builder.CreateAlignedStore(Res, ElemPtr,
3224 M.getDataLayout().getPrefTypeAlign(ElemType));
3225 Value *LocalPtr =
3226 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3227 Value *LocalElemPtr =
3228 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3229 PhiSrc->addIncoming(LocalPtr, ThenBB);
3230 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3231 emitBranch(PreCondBB);
3232 emitBlock(ExitBB, CurFunc);
3233 } else {
3234 Value *Res = createRuntimeShuffleFunction(
3235 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3236 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3237 Res->getType()->getScalarSizeInBits())
3238 Res = Builder.CreateTrunc(Res, ElemType);
3239 Builder.CreateStore(Res, ElemPtr);
3240 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3241 ElemPtr =
3242 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3243 }
3244 Size = Size % IntSize;
3245 }
3246}
3247
3248Error OpenMPIRBuilder::emitReductionListCopy(
3249 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3250 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3251 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3252 Type *IndexTy = Builder.getIndexTy(
3253 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3254 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3255
3256 // Iterates, element-by-element, through the source Reduce list and
3257 // makes a copy.
3258 for (auto En : enumerate(ReductionInfos)) {
3259 const ReductionInfo &RI = En.value();
3260 Value *SrcElementAddr = nullptr;
3261 AllocaInst *DestAlloca = nullptr;
3262 Value *DestElementAddr = nullptr;
3263 Value *DestElementPtrAddr = nullptr;
3264 // Should we shuffle in an element from a remote lane?
3265 bool ShuffleInElement = false;
3266 // Set to true to update the pointer in the dest Reduce list to a
3267 // newly created element.
3268 bool UpdateDestListPtr = false;
3269
3270 // Step 1.1: Get the address for the src element in the Reduce list.
3271 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3272 ReductionArrayTy, SrcBase,
3273 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3274 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3275
3276 // Step 1.2: Create a temporary to store the element in the destination
3277 // Reduce list.
3278 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3279 ReductionArrayTy, DestBase,
3280 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3281 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3282 switch (Action) {
3283 case CopyAction::RemoteLaneToThread: {
3284 InsertPointTy CurIP = Builder.saveIP();
3285 Builder.restoreIP(AllocaIP);
3286
3287 Type *DestAllocaType =
3288 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3289 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3290 ".omp.reduction.element");
3291 DestAlloca->setAlignment(
3292 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3293 DestElementAddr = DestAlloca;
3294 DestElementAddr =
3295 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3296 DestElementAddr->getName() + ".ascast");
3297 Builder.restoreIP(CurIP);
3298 ShuffleInElement = true;
3299 UpdateDestListPtr = true;
3300 break;
3301 }
3302 case CopyAction::ThreadCopy: {
3303 DestElementAddr =
3304 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3305 break;
3306 }
3307 }
3308
3309 // Now that all active lanes have read the element in the
3310 // Reduce list, shuffle over the value from the remote lane.
3311 if (ShuffleInElement) {
3312 Type *ShuffleType = RI.ElementType;
3313 Value *ShuffleSrcAddr = SrcElementAddr;
3314 Value *ShuffleDestAddr = DestElementAddr;
3315 AllocaInst *LocalStorage = nullptr;
3316
3317 if (IsByRefElem) {
3318 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3319 assert(RI.ByRefAllocatedType &&
3320 "Expected by-ref allocated type to be set");
3321 // For by-ref reductions, we need to copy from the remote lane the
3322 // actual value of the partial reduction computed by that remote lane,
3323 // rather than, for example, a pointer to that data or, even worse, a
3324 // pointer to the descriptor of the by-ref reduction element.
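// (The "descriptor" here is the boxed representation the front end uses for
// by-ref reductions, e.g. a Fortran-style array descriptor; DataPtrPtrGen
// yields the address of its base-pointer field, as also relied upon by
// generateReductionDescriptor below.)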
3325 ShuffleType = RI.ByRefElementType;
3326
3327 if (RI.DataPtrPtrGen) {
3328 // Descriptor-based by-ref: extract data pointer from descriptor.
3329 InsertPointOrErrorTy GenResult = RI.DataPtrPtrGen(
3330 Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3331
3332 if (!GenResult)
3333 return GenResult.takeError();
3334
3335 ShuffleSrcAddr =
3336 Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3337
3338 {
3339 InsertPointTy OldIP = Builder.saveIP();
3340 Builder.restoreIP(AllocaIP);
3341
3342 LocalStorage = Builder.CreateAlloca(ShuffleType);
3343 Builder.restoreIP(OldIP);
3344 ShuffleDestAddr = LocalStorage;
3345 }
3346 } else {
3347 // Non-descriptor by-ref: the pointer already references data
3348 // directly. Shuffle into the destination alloca.
3349 ShuffleDestAddr = DestElementAddr;
3350 }
3351 }
3352
3353 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3354 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3355
3356 if (IsByRefElem && RI.DataPtrPtrGen) {
3357 // Copy descriptor from source and update base_ptr to shuffled data
3358 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3359 DestAlloca, Builder.getPtrTy(), ".ascast");
3360
3361 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3362 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3363 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3364
3365 if (!GenResult)
3366 return GenResult.takeError();
3367 }
3368 } else {
3369 switch (RI.EvaluationKind) {
3370 case EvalKind::Scalar: {
3371 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3372 // Store the source element value to the dest element address.
3373 Builder.CreateStore(Elem, DestElementAddr);
3374 break;
3375 }
3376 case EvalKind::Complex: {
3377 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3378 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3379 Value *SrcReal = Builder.CreateLoad(
3380 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3381 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3382 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3383 Value *SrcImg = Builder.CreateLoad(
3384 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3385
3386 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3387 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3388 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3389 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3390 Builder.CreateStore(SrcReal, DestRealPtr);
3391 Builder.CreateStore(SrcImg, DestImgPtr);
3392 break;
3393 }
3394 case EvalKind::Aggregate: {
3395 Value *SizeVal = Builder.getInt64(
3396 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3397 Builder.CreateMemCpy(
3398 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3399 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3400 SizeVal, false);
3401 break;
3402 }
3403 };
3404 }
3405
3406 // Step 3.1: Modify reference in dest Reduce list as needed.
3407 // Modifying the reference in Reduce list to point to the newly
3408 // created element. The element is live in the current function
3409 // scope and that of functions it invokes (i.e., reduce_function).
3410 // RemoteReduceData[i] = (void*)&RemoteElem
3411 if (UpdateDestListPtr) {
3412 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3413 DestElementAddr, Builder.getPtrTy(),
3414 DestElementAddr->getName() + ".ascast");
3415 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3416 }
3417 }
3418
3419 return Error::success();
3420}
3421
3422Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3423 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3424 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3425 InsertPointTy SavedIP = Builder.saveIP();
3426 LLVMContext &Ctx = M.getContext();
3427 FunctionType *FuncTy = FunctionType::get(
3428 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3429 /* IsVarArg */ false);
3430 Function *WcFunc =
3432 "_omp_reduction_inter_warp_copy_func", &M);
3433 WcFunc->setCallingConv(Config.getRuntimeCC());
3434 WcFunc->setAttributes(FuncAttrs);
3435 WcFunc->addParamAttr(0, Attribute::NoUndef);
3436 WcFunc->addParamAttr(1, Attribute::NoUndef);
3437 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3438 Builder.SetInsertPoint(EntryBB);
3439
3440 // ReduceList: thread local Reduce list.
3441 // At the stage of the computation when this function is called, partially
3442 // aggregated values reside in the first lane of every active warp.
3443 Argument *ReduceListArg = WcFunc->getArg(0);
3444 // NumWarps: number of warps active in the parallel region. This could
3445 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3446 Argument *NumWarpsArg = WcFunc->getArg(1);
3447
3448 // This array is used as a medium to transfer, one reduce element at a time,
3449 // the data from the first lane of every warp to lanes in the first warp
3450 // in order to perform the final step of a reduction in a parallel region
3451 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3452 // for reduced latency, as well as to have a distinct copy for concurrently
3453 // executing target regions. The array is declared with common linkage so
3454 // as to be shared across compilation units.
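// Concretely (per the code below): the medium is a [WarpSize x i32] array in
// address space 3 (GPU shared memory); each warp master writes its chunk of
// the current reduce element to slot [warp_id] and, after a barrier, the
// threads of warp 0 read slots [0..NumWarps) back out.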
3455 StringRef TransferMediumName =
3456 "__openmp_nvptx_data_transfer_temporary_storage";
3457 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3458 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3459 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3460 if (!TransferMedium) {
3461 TransferMedium = new GlobalVariable(
3462 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3463 UndefValue::get(ArrayTy), TransferMediumName,
3464 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3465 /*AddressSpace=*/3);
3466 }
3467
3468 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3469 Value *GPUThreadID = getGPUThreadID();
3470 // nvptx_lane_id = nvptx_id % warpsize
3471 Value *LaneID = getNVPTXLaneID();
3472 // nvptx_warp_id = nvptx_id / warpsize
3473 Value *WarpID = getNVPTXWarpID();
3474
3475 InsertPointTy AllocaIP =
3476 InsertPointTy(Builder.GetInsertBlock(),
3477 Builder.GetInsertBlock()->getFirstInsertionPt());
3478 Type *Arg0Type = ReduceListArg->getType();
3479 Type *Arg1Type = NumWarpsArg->getType();
3480 Builder.restoreIP(AllocaIP);
3481 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3482 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3483 AllocaInst *NumWarpsAlloca =
3484 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3485 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3486 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3487 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3488 NumWarpsAlloca, Builder.getPtrTy(0),
3489 NumWarpsAlloca->getName() + ".ascast");
3490 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3491 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3492 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3493 InsertPointTy CodeGenIP =
3494 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3495 Builder.restoreIP(CodeGenIP);
3496
3497 Value *ReduceList =
3498 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3499
3500 for (auto En : enumerate(ReductionInfos)) {
3501 //
3502 // Warp master copies reduce element to transfer medium in __shared__
3503 // memory.
3504 //
3505 const ReductionInfo &RI = En.value();
3506 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3507 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3508 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
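// The element is staged through the 32-bit medium in chunks: e.g. (per the
// loop below) a 6-byte element is transferred as one 4-byte chunk and then one
// 2-byte chunk, with the extra counting loop emitted only when several chunks
// of the same size are required.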
3509 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3510 Type *CType = Builder.getIntNTy(TySize * 8);
3511
3512 unsigned NumIters = RealTySize / TySize;
3513 if (NumIters == 0)
3514 continue;
3515 Value *Cnt = nullptr;
3516 Value *CntAddr = nullptr;
3517 BasicBlock *PrecondBB = nullptr;
3518 BasicBlock *ExitBB = nullptr;
3519 if (NumIters > 1) {
3520 CodeGenIP = Builder.saveIP();
3521 Builder.restoreIP(AllocaIP);
3522 CntAddr =
3523 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3524
3525 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3526 CntAddr->getName() + ".ascast");
3527 Builder.restoreIP(CodeGenIP);
3528 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3529 CntAddr,
3530 /*Volatile=*/false);
3531 PrecondBB = BasicBlock::Create(Ctx, "precond");
3532 ExitBB = BasicBlock::Create(Ctx, "exit");
3533 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3534 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3535 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3536 /*Volatile=*/false);
3537 Value *Cmp = Builder.CreateICmpULT(
3538 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3539 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3540 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3541 }
3542
3543 // kmpc_barrier.
3544 InsertPointOrErrorTy BarrierIP1 =
3545 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3546 omp::Directive::OMPD_unknown,
3547 /* ForceSimpleCall */ false,
3548 /* CheckCancelFlag */ true);
3549 if (!BarrierIP1)
3550 return BarrierIP1.takeError();
3551 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3552 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3553 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3554
3555 // if (lane_id == 0)
3556 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3557 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3558 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3559
3560 // Reduce element = LocalReduceList[i]
3561 auto *RedListArrayTy =
3562 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3563 Type *IndexTy = Builder.getIndexTy(
3564 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3565 Value *ElemPtrPtr =
3566 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3567 {ConstantInt::get(IndexTy, 0),
3568 ConstantInt::get(IndexTy, En.index())});
3569 // elemptr = ((CopyType*)(elemptrptr)) + I
3570 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3571
3572 if (IsByRefElem && RI.DataPtrPtrGen) {
3573 InsertPointOrErrorTy GenRes =
3574 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3575
3576 if (!GenRes)
3577 return GenRes.takeError();
3578
3579 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3580 }
3581
3582 if (NumIters > 1)
3583 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3584
3585 // Get pointer to location in transfer medium.
3586 // MediumPtr = &medium[warp_id]
3587 Value *MediumPtr = Builder.CreateInBoundsGEP(
3588 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3589 // elem = *elemptr
3590 //*MediumPtr = elem
3591 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3592 // Store the source element value to the dest element address.
3593 Builder.CreateStore(Elem, MediumPtr,
3594 /*IsVolatile*/ true);
3595 Builder.CreateBr(MergeBB);
3596
3597 // else
3598 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3599 Builder.CreateBr(MergeBB);
3600
3601 // endif
3602 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3603 InsertPointOrErrorTy BarrierIP2 =
3604 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3605 omp::Directive::OMPD_unknown,
3606 /* ForceSimpleCall */ false,
3607 /* CheckCancelFlag */ true);
3608 if (!BarrierIP2)
3609 return BarrierIP2.takeError();
3610
3611 // Warp 0 copies reduce element from transfer medium
3612 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3613 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3614 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3615
3616 Value *NumWarpsVal =
3617 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3618 // Up to 32 threads in warp 0 are active.
3619 Value *IsActiveThread =
3620 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3621 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3622
3623 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3624
3625 // SrcMediumPtr = &medium[tid]
3626 // SrcMediumVal = *SrcMediumPtr
3627 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3628 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3629 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3630 Value *TargetElemPtrPtr =
3631 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3632 {ConstantInt::get(IndexTy, 0),
3633 ConstantInt::get(IndexTy, En.index())});
3634 Value *TargetElemPtrVal =
3635 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3636 Value *TargetElemPtr = TargetElemPtrVal;
3637
3638 if (IsByRefElem && RI.DataPtrPtrGen) {
3639 InsertPointOrErrorTy GenRes =
3640 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3641
3642 if (!GenRes)
3643 return GenRes.takeError();
3644
3645 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3646 }
3647
3648 if (NumIters > 1)
3649 TargetElemPtr =
3650 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3651
3652 // *TargetElemPtr = SrcMediumVal;
3653 Value *SrcMediumValue =
3654 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3655 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3656 Builder.CreateBr(W0MergeBB);
3657
3658 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3659 Builder.CreateBr(W0MergeBB);
3660
3661 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3662
3663 if (NumIters > 1) {
3664 Cnt = Builder.CreateNSWAdd(
3665 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3666 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3667
3668 auto *CurFn = Builder.GetInsertBlock()->getParent();
3669 emitBranch(PrecondBB);
3670 emitBlock(ExitBB, CurFn);
3671 }
3672 RealTySize %= TySize;
3673 }
3674 }
3675
3676 Builder.CreateRetVoid();
3677 Builder.restoreIP(SavedIP);
3678
3679 return WcFunc;
3680}
3681
3682Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3683 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3684 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3685 LLVMContext &Ctx = M.getContext();
3686 FunctionType *FuncTy =
3687 FunctionType::get(Builder.getVoidTy(),
3688 {Builder.getPtrTy(), Builder.getInt16Ty(),
3689 Builder.getInt16Ty(), Builder.getInt16Ty()},
3690 /* IsVarArg */ false);
3691 Function *SarFunc =
3693 "_omp_reduction_shuffle_and_reduce_func", &M);
3694 SarFunc->setCallingConv(Config.getRuntimeCC());
3695 SarFunc->setAttributes(FuncAttrs);
3696 SarFunc->addParamAttr(0, Attribute::NoUndef);
3697 SarFunc->addParamAttr(1, Attribute::NoUndef);
3698 SarFunc->addParamAttr(2, Attribute::NoUndef);
3699 SarFunc->addParamAttr(3, Attribute::NoUndef);
3700 SarFunc->addParamAttr(1, Attribute::SExt);
3701 SarFunc->addParamAttr(2, Attribute::SExt);
3702 SarFunc->addParamAttr(3, Attribute::SExt);
3703 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3704 Builder.SetInsertPoint(EntryBB);
3705
3706 // Thread local Reduce list used to host the values of data to be reduced.
3707 Argument *ReduceListArg = SarFunc->getArg(0);
3708 // Current lane id; could be logical.
3709 Argument *LaneIDArg = SarFunc->getArg(1);
3710 // Offset of the remote source lane relative to the current lane.
3711 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3712 // Algorithm version. This is expected to be known at compile time.
3713 Argument *AlgoVerArg = SarFunc->getArg(3);
3714
3715 Type *ReduceListArgType = ReduceListArg->getType();
3716 Type *LaneIDArgType = LaneIDArg->getType();
3717 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3718 Value *ReduceListAlloca = Builder.CreateAlloca(
3719 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3720 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3721 LaneIDArg->getName() + ".addr");
3722 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3723 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3724 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3725 AlgoVerArg->getName() + ".addr");
3726 ArrayType *RedListArrayTy =
3727 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3728
3729 // Create a local thread-private variable to host the Reduce list
3730 // from a remote lane.
3731 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3732 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3733
3734 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3735 ReduceListAlloca, ReduceListArgType,
3736 ReduceListAlloca->getName() + ".ascast");
3737 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3738 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3739 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3740 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3741 RemoteLaneOffsetAlloca->getName() + ".ascast");
3742 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3743 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3744 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3745 RemoteReductionListAlloca, Builder.getPtrTy(),
3746 RemoteReductionListAlloca->getName() + ".ascast");
3747
3748 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3749 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3750 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3751 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3752
3753 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3754 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3755 Value *RemoteLaneOffset =
3756 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3757 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3758
3759 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3760
3761 // This loop iterates through the list of reduce elements and copies,
3762 // element by element, from a remote lane in the warp to RemoteReduceList,
3763 // hosted on the thread's stack.
3764 Error EmitRedLsCpRes = emitReductionListCopy(
3765 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3766 ReduceList, RemoteListAddrCast, IsByRef,
3767 {RemoteLaneOffset, nullptr, nullptr});
3768
3769 if (EmitRedLsCpRes)
3770 return EmitRedLsCpRes;
3771
3772 // The actions to be performed on the Remote Reduce list are dependent
3773 // on the algorithm version.
3774 //
3775 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3776 // LaneId % 2 == 0 && Offset > 0):
3777 // do the reduction value aggregation
3778 //
3779 // The thread local variable Reduce list is mutated in place to host the
3780 // reduced data, which is the aggregated value produced from local and
3781 // remote lanes.
3782 //
3783 // Note that AlgoVer is expected to be a constant integer known at compile
3784 // time.
3785 // When AlgoVer==0, the first conjunction evaluates to true, making
3786 // the entire predicate true during compile time.
3787 // When AlgoVer==1, the second conjunction has only the second part to be
3788 // evaluated during runtime. Other conjunctions evaluate to false
3789 // during compile time.
3790 // When AlgoVer==2, the third conjunction has only the second part to be
3791 // evaluated during runtime. Other conjunctions evaluate to false
3792 // during compile time.
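// For example, with AlgoVer==1 and RemoteLaneOffset==4 in an 8-lane group:
// lanes 0-3 satisfy (LaneId < Offset) and combine their element with the one
// shuffled in from lanes 4-7, while lanes 4-7 fall through to the CondCopy
// branch below and simply adopt the remote Reduce list.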
3793 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3794 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3795 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3796 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3797 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3798 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3799 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3800 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3801 Value *RemoteOffsetComp =
3802 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3803 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3804 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3805 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3806
3807 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3808 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3809 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3810
3811 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3812 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3813 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3814 ReduceList, Builder.getPtrTy());
3815 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3816 RemoteListAddrCast, Builder.getPtrTy());
3817 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3818 ->addFnAttr(Attribute::NoUnwind);
3819 Builder.CreateBr(MergeBB);
3820
3821 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3822 Builder.CreateBr(MergeBB);
3823
3824 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3825
3826 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3827 // Reduce list.
3828 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3829 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3830 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3831
3832 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3833 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3834 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3835 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3836
3837 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3838
3839 EmitRedLsCpRes = emitReductionListCopy(
3840 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3841 RemoteListAddrCast, ReduceList, IsByRef);
3842
3843 if (EmitRedLsCpRes)
3844 return EmitRedLsCpRes;
3845
3846 Builder.CreateBr(CpyMergeBB);
3847
3848 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3849 Builder.CreateBr(CpyMergeBB);
3850
3851 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3852
3853 Builder.CreateRetVoid();
3854
3855 return SarFunc;
3856}
3857
3858OpenMPIRBuilder::InsertPointOrErrorTy
3859OpenMPIRBuilder::generateReductionDescriptor(
3860 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3861 Type *DescriptorType,
3862 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3863 DataPtrPtrGen) {
3864
3865 // Copy the source descriptor to preserve all metadata (rank, extents,
3866 // strides, etc.)
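// In effect: memcpy the whole descriptor from SrcDescriptorAddr to
// DescriptorAddr, then use DataPtrPtrGen to obtain the address of the
// descriptor's base-pointer field and overwrite it with DataPtr.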
3867 Value *DescriptorSize =
3868 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3869 Builder.CreateMemCpy(
3870 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3871 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3872 DescriptorSize);
3873
3874 // Update the base pointer field to point to the local shuffled data
3875 Value *DataPtrField;
3876 InsertPointOrErrorTy GenResult =
3877 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3878
3879 if (!GenResult)
3880 return GenResult.takeError();
3881
3882 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3883 DataPtr, Builder.getPtrTy(), ".ascast"),
3884 DataPtrField);
3885
3886 return Builder.saveIP();
3887}
3888
3889Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3890 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3891 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3892 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3893 LLVMContext &Ctx = M.getContext();
3894 FunctionType *FuncTy = FunctionType::get(
3895 Builder.getVoidTy(),
3896 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3897 /* IsVarArg */ false);
3898 Function *LtGCFunc =
3900 "_omp_reduction_list_to_global_copy_func", &M);
3901 LtGCFunc->setAttributes(FuncAttrs);
3902 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3903 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3904 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3905
3906 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3907 Builder.SetInsertPoint(EntryBlock);
3908
3909 // Buffer: global reduction buffer.
3910 Argument *BufferArg = LtGCFunc->getArg(0);
3911 // Idx: index of the buffer.
3912 Argument *IdxArg = LtGCFunc->getArg(1);
3913 // ReduceList: thread local Reduce list.
3914 Argument *ReduceListArg = LtGCFunc->getArg(2);
3915
3916 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3917 BufferArg->getName() + ".addr");
3918 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3919 IdxArg->getName() + ".addr");
3920 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3921 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3922 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3923 BufferArgAlloca, Builder.getPtrTy(),
3924 BufferArgAlloca->getName() + ".ascast");
3925 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3926 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3927 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3928 ReduceListArgAlloca, Builder.getPtrTy(),
3929 ReduceListArgAlloca->getName() + ".ascast");
3930
3931 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3932 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3933 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3934
3935 Value *LocalReduceList =
3936 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3937 Value *BufferArgVal =
3938 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3939 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3940 Type *IndexTy = Builder.getIndexTy(
3941 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3942 for (auto En : enumerate(ReductionInfos)) {
3943 const ReductionInfo &RI = En.value();
3944 auto *RedListArrayTy =
3945 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3946 // Reduce element = LocalReduceList[i]
3947 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3948 RedListArrayTy, LocalReduceList,
3949 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3950 // elemptr = ((CopyType*)(elemptrptr)) + I
3951 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3952
3953 // Global = Buffer.VD[Idx];
3954 Value *BufferVD =
3955 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3956 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3957 ReductionsBufferTy, BufferVD, 0, En.index());
3958
3959 switch (RI.EvaluationKind) {
3960 case EvalKind::Scalar: {
3961 Value *TargetElement;
3962
3963 if (IsByRef.empty() || !IsByRef[En.index()]) {
3964 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3965 } else {
3966 if (RI.DataPtrPtrGen) {
3967 InsertPointOrErrorTy GenResult =
3968 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3969
3970 if (!GenResult)
3971 return GenResult.takeError();
3972
3973 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3974 }
3975 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3976 }
3977
3978 Builder.CreateStore(TargetElement, GlobVal);
3979 break;
3980 }
3981 case EvalKind::Complex: {
3982 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3983 RI.ElementType, ElemPtr, 0, 0, ".realp");
3984 Value *SrcReal = Builder.CreateLoad(
3985 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3986 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3987 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3988 Value *SrcImg = Builder.CreateLoad(
3989 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3990
3991 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3992 RI.ElementType, GlobVal, 0, 0, ".realp");
3993 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3994 RI.ElementType, GlobVal, 0, 1, ".imagp");
3995 Builder.CreateStore(SrcReal, DestRealPtr);
3996 Builder.CreateStore(SrcImg, DestImgPtr);
3997 break;
3998 }
3999 case EvalKind::Aggregate: {
4000 Value *SizeVal =
4001 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4002 Builder.CreateMemCpy(
4003 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
4004 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
4005 break;
4006 }
4007 }
4008 }
4009
4010 Builder.CreateRetVoid();
4011 Builder.restoreIP(OldIP);
4012 return LtGCFunc;
4013}
4014
4015Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
4016 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4017 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4018 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4019 LLVMContext &Ctx = M.getContext();
4020 FunctionType *FuncTy = FunctionType::get(
4021 Builder.getVoidTy(),
4022 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4023 /* IsVarArg */ false);
4024 Function *LtGRFunc =
4026 "_omp_reduction_list_to_global_reduce_func", &M);
4027 LtGRFunc->setAttributes(FuncAttrs);
4028 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
4029 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
4030 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
4031
4032 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
4033 Builder.SetInsertPoint(EntryBlock);
4034
4035 // Buffer: global reduction buffer.
4036 Argument *BufferArg = LtGRFunc->getArg(0);
4037 // Idx: index of the buffer.
4038 Argument *IdxArg = LtGRFunc->getArg(1);
4039 // ReduceList: thread local Reduce list.
4040 Argument *ReduceListArg = LtGRFunc->getArg(2);
4041
4042 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4043 BufferArg->getName() + ".addr");
4044 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4045 IdxArg->getName() + ".addr");
4046 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4047 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4048 auto *RedListArrayTy =
4049 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4050
4051 // 1. Build a list of reduction variables.
4052 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4053 Value *LocalReduceList =
4054 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4055
4056 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4057
4058 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4059 BufferArgAlloca, Builder.getPtrTy(),
4060 BufferArgAlloca->getName() + ".ascast");
4061 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4062 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4063 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4064 ReduceListArgAlloca, Builder.getPtrTy(),
4065 ReduceListArgAlloca->getName() + ".ascast");
4066 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4067 LocalReduceList, Builder.getPtrTy(),
4068 LocalReduceList->getName() + ".ascast");
4069
4070 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4071 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4072 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4073
4074 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4075 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4076 Type *IndexTy = Builder.getIndexTy(
4077 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4078 for (auto En : enumerate(ReductionInfos)) {
4079 const ReductionInfo &RI = En.value();
4080
4081 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4082 RedListArrayTy, LocalReduceListAddrCast,
4083 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4084 Value *BufferVD =
4085 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4086 // Global = Buffer.VD[Idx];
4087 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4088 ReductionsBufferTy, BufferVD, 0, En.index());
4089
4090 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4091 InsertPointTy OldIP = Builder.saveIP();
4092 Builder.restoreIP(AllocaIP);
4093
4094 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4095 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4096 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4097
4098 Builder.restoreIP(OldIP);
4099
4100 // Get source descriptor from the reduce list argument
4101 Value *ReduceList =
4102 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4103 Value *SrcElementPtrPtr =
4104 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
4105 {ConstantInt::get(IndexTy, 0),
4106 ConstantInt::get(IndexTy, En.index())});
4107 Value *SrcDescriptorAddr =
4108 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4109
4110 // Copy descriptor from source and update base_ptr to global buffer data
4111 InsertPointOrErrorTy GenResult =
4112 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4113 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4114
4115 if (!GenResult)
4116 return GenResult.takeError();
4117
4118 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4119 } else {
4120 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4121 }
4122 }
4123
4124 // Call reduce_function(GlobalReduceList, ReduceList)
4125 Value *ReduceList =
4126 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4127 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
4128 ->addFnAttr(Attribute::NoUnwind);
4129 Builder.CreateRetVoid();
4130 Builder.restoreIP(OldIP);
4131 return LtGRFunc;
4132}
4133
4134Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
4135 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
4136 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4137 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4138 LLVMContext &Ctx = M.getContext();
4139 FunctionType *FuncTy = FunctionType::get(
4140 Builder.getVoidTy(),
4141 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4142 /* IsVarArg */ false);
4143 Function *GtLCFunc =
4145 "_omp_reduction_global_to_list_copy_func", &M);
4146 GtLCFunc->setAttributes(FuncAttrs);
4147 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
4148 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
4149 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
4150
4151 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
4152 Builder.SetInsertPoint(EntryBlock);
4153
4154 // Buffer: global reduction buffer.
4155 Argument *BufferArg = GtLCFunc->getArg(0);
4156 // Idx: index of the buffer.
4157 Argument *IdxArg = GtLCFunc->getArg(1);
4158 // ReduceList: thread local Reduce list.
4159 Argument *ReduceListArg = GtLCFunc->getArg(2);
4160
4161 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4162 BufferArg->getName() + ".addr");
4163 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4164 IdxArg->getName() + ".addr");
4165 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4166 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4167 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4168 BufferArgAlloca, Builder.getPtrTy(),
4169 BufferArgAlloca->getName() + ".ascast");
4170 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4171 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4172 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4173 ReduceListArgAlloca, Builder.getPtrTy(),
4174 ReduceListArgAlloca->getName() + ".ascast");
4175 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4176 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4177 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4178
4179 Value *LocalReduceList =
4180 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4181 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4182 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4183 Type *IndexTy = Builder.getIndexTy(
4184 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4185 for (auto En : enumerate(ReductionInfos)) {
4186 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4187 auto *RedListArrayTy =
4188 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4189 // Reduce element = LocalReduceList[i]
4190 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
4191 RedListArrayTy, LocalReduceList,
4192 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4193 // elemptr = ((CopyType*)(elemptrptr)) + I
4194 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
4195 // Global = Buffer.VD[Idx];
4196 Value *BufferVD =
4197 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4198 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4199 ReductionsBufferTy, BufferVD, 0, En.index());
4200
4201 switch (RI.EvaluationKind) {
4202 case EvalKind::Scalar: {
4203 Type *ElemType = RI.ElementType;
4204
4205 if (!IsByRef.empty() && IsByRef[En.index()]) {
4206 ElemType = RI.ByRefElementType;
4207 if (RI.DataPtrPtrGen) {
4208 InsertPointOrErrorTy GenResult =
4209 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
4210
4211 if (!GenResult)
4212 return GenResult.takeError();
4213
4214 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
4215 }
4216 }
4217
4218 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
4219 Builder.CreateStore(TargetElement, ElemPtr);
4220 break;
4221 }
4222 case EvalKind::Complex: {
4223 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
4224 RI.ElementType, GlobValPtr, 0, 0, ".realp");
4225 Value *SrcReal = Builder.CreateLoad(
4226 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
4227 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
4228 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
4229 Value *SrcImg = Builder.CreateLoad(
4230 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4231
4232 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4233 RI.ElementType, ElemPtr, 0, 0, ".realp");
4234 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4235 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4236 Builder.CreateStore(SrcReal, DestRealPtr);
4237 Builder.CreateStore(SrcImg, DestImgPtr);
4238 break;
4239 }
4240 case EvalKind::Aggregate: {
4241 Value *SizeVal =
4242 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4243 Builder.CreateMemCpy(
4244 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4245 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4246 SizeVal, false);
4247 break;
4248 }
4249 }
4250 }
4251
4252 Builder.CreateRetVoid();
4253 Builder.restoreIP(OldIP);
4254 return GtLCFunc;
4255}
4256
4257Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4258 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4259 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4260 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4261 LLVMContext &Ctx = M.getContext();
4262 auto *FuncTy = FunctionType::get(
4263 Builder.getVoidTy(),
4264 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4265 /* IsVarArg */ false);
4266 Function *GtLRFunc =
4268 "_omp_reduction_global_to_list_reduce_func", &M);
4269 GtLRFunc->setAttributes(FuncAttrs);
4270 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4271 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4272 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4273
4274 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4275 Builder.SetInsertPoint(EntryBlock);
4276
4277 // Buffer: global reduction buffer.
4278 Argument *BufferArg = GtLRFunc->getArg(0);
4279 // Idx: index of the buffer.
4280 Argument *IdxArg = GtLRFunc->getArg(1);
4281 // ReduceList: thread local Reduce list.
4282 Argument *ReduceListArg = GtLRFunc->getArg(2);
4283
4284 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4285 BufferArg->getName() + ".addr");
4286 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4287 IdxArg->getName() + ".addr");
4288 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4289 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4290 ArrayType *RedListArrayTy =
4291 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4292
4293 // 1. Build a list of reduction variables.
4294 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4295 Value *LocalReduceList =
4296 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4297
4298 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4299
4300 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4301 BufferArgAlloca, Builder.getPtrTy(),
4302 BufferArgAlloca->getName() + ".ascast");
4303 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4304 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4305 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4306 ReduceListArgAlloca, Builder.getPtrTy(),
4307 ReduceListArgAlloca->getName() + ".ascast");
4308 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4309 LocalReduceList, Builder.getPtrTy(),
4310 LocalReduceList->getName() + ".ascast");
4311
4312 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4313 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4314 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4315
4316 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4317 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4318 Type *IndexTy = Builder.getIndexTy(
4319 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4320 for (auto En : enumerate(ReductionInfos)) {
4321 const ReductionInfo &RI = En.value();
4322
4323 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4324 RedListArrayTy, ReductionList,
4325 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4326 // Global = Buffer.VD[Idx];
4327 Value *BufferVD =
4328 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4329 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4330 ReductionsBufferTy, BufferVD, 0, En.index());
4331
4332 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4333 InsertPointTy OldIP = Builder.saveIP();
4334 Builder.restoreIP(AllocaIP);
4335
4336 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4337 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4338 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4339
4340 Builder.restoreIP(OldIP);
4341
4342 // Get source descriptor from the reduce list
4343 Value *ReduceListVal =
4344 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4345 Value *SrcElementPtrPtr =
4346 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4347 {ConstantInt::get(IndexTy, 0),
4348 ConstantInt::get(IndexTy, En.index())});
4349 Value *SrcDescriptorAddr =
4350 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4351
4352 // Copy descriptor from source and update base_ptr to global buffer data
4353 InsertPointOrErrorTy GenResult =
4354 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4355 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4356 if (!GenResult)
4357 return GenResult.takeError();
4358
4359 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4360 } else {
4361 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4362 }
4363 }
4364
4365 // Call reduce_function(ReduceList, GlobalReduceList)
4366 Value *ReduceList =
4367 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4368 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4369 ->addFnAttr(Attribute::NoUnwind);
4370 Builder.CreateRetVoid();
4371 Builder.restoreIP(OldIP);
4372 return GtLRFunc;
4373}
4374
4375std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4376 std::string Suffix =
4377 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4378 return (Name + Suffix).str();
4379}
4380
4381Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4382 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4383 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
4384 AttributeList FuncAttrs) {
4385 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4386 {Builder.getPtrTy(), Builder.getPtrTy()},
4387 /* IsVarArg */ false);
4388 std::string Name = getReductionFuncName(ReducerName);
4389 Function *ReductionFunc =
4390 Function::Create(FuncTy, GlobalVariable::InternalLinkage, Name, &M);
4391 ReductionFunc->setCallingConv(Config.getRuntimeCC());
4392 ReductionFunc->setAttributes(FuncAttrs);
4393 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4394 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4395 BasicBlock *EntryBB =
4396 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4397 Builder.SetInsertPoint(EntryBB);
4398
4399 // Need to alloca memory here and deal with the pointers before getting
4400 // LHS/RHS pointers out
4401 Value *LHSArrayPtr = nullptr;
4402 Value *RHSArrayPtr = nullptr;
4403 Argument *Arg0 = ReductionFunc->getArg(0);
4404 Argument *Arg1 = ReductionFunc->getArg(1);
4405 Type *Arg0Type = Arg0->getType();
4406 Type *Arg1Type = Arg1->getType();
4407
4408 Value *LHSAlloca =
4409 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4410 Value *RHSAlloca =
4411 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4412 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4413 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4414 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4415 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4416 Builder.CreateStore(Arg0, LHSAddrCast);
4417 Builder.CreateStore(Arg1, RHSAddrCast);
4418 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4419 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4420
4421 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4422 Type *IndexTy = Builder.getIndexTy(
4423 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4424 SmallVector<Value *> LHSPtrs, RHSPtrs;
4425 for (auto En : enumerate(ReductionInfos)) {
4426 const ReductionInfo &RI = En.value();
4427 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4428 RedArrayTy, RHSArrayPtr,
4429 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4430 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4431 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4432 RHSI8Ptr, RI.PrivateVariable->getType(),
4433 RHSI8Ptr->getName() + ".ascast");
4434
4435 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4436 RedArrayTy, LHSArrayPtr,
4437 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4438 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4439 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4440 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4441
4442 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4443 LHSPtrs.emplace_back(LHSPtr);
4444 RHSPtrs.emplace_back(RHSPtr);
4445 } else {
4446 Value *LHS = LHSPtr;
4447 Value *RHS = RHSPtr;
4448
4449 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4450 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4451 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4452 }
4453
4454 Value *Reduced;
4455 InsertPointOrErrorTy AfterIP =
4456 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4457 if (!AfterIP)
4458 return AfterIP.takeError();
4459 if (!Builder.GetInsertBlock())
4460 return ReductionFunc;
4461
4462 Builder.restoreIP(*AfterIP);
4463
4464 if (!IsByRef.empty() && !IsByRef[En.index()])
4465 Builder.CreateStore(Reduced, LHSPtr);
4466 }
4467 }
4468
4469 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
4470 for (auto En : enumerate(ReductionInfos)) {
4471 unsigned Index = En.index();
4472 const ReductionInfo &RI = En.value();
4473 Value *LHSFixupPtr, *RHSFixupPtr;
4474 Builder.restoreIP(RI.ReductionGenClang(
4475 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4476
4477 // Fix the callback code generated to use the correct Values for the LHS
4478 // and RHS.
4479 LHSFixupPtr->replaceUsesWithIf(
4480 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4481 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4482 ReductionFunc;
4483 });
4484 RHSFixupPtr->replaceUsesWithIf(
4485 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4486 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4487 ReductionFunc;
4488 });
4489 }
4490
4491 Builder.CreateRetVoid();
4492 // When compiling with `-O0`, `alloca`s emitted in non-entry blocks are not
4493 // hoisted to the entry block (this is done for higher opt levels by later
4494 // passes in the pipeline). This has caused issues because non-entry
4495 // `alloca`s force the function to use dynamic stack allocations and we might
4496 // run out of scratch memory.
4497 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4498
4499 return ReductionFunc;
4500}
4501
4502static void
4503checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4504 bool IsGPU) {
4505 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4506 (void)RI;
4507 assert(RI.Variable && "expected non-null variable");
4508 assert(RI.PrivateVariable && "expected non-null private variable");
4509 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4510 "expected non-null reduction generator callback");
4511 if (!IsGPU) {
4512 assert(
4513 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4514 "expected variables and their private equivalents to have the same "
4515 "type");
4516 }
4517 assert(RI.Variable->getType()->isPointerTy() &&
4518 "expected variables to be pointers");
4519 }
4520}
4521
4522OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
4523 const LocationDescription &Loc, InsertPointTy AllocaIP,
4524 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4525 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4526 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4527 unsigned ReductionBufNum, Value *SrcLocInfo) {
4528 if (!updateToLocation(Loc))
4529 return InsertPointTy();
4530 Builder.restoreIP(CodeGenIP);
4531 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4532 LLVMContext &Ctx = M.getContext();
4533
4534 // Source location for the ident struct
4535 if (!SrcLocInfo) {
4536 uint32_t SrcLocStrSize;
4537 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4538 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4539 }
4540
4541 if (ReductionInfos.size() == 0)
4542 return Builder.saveIP();
4543
4544 BasicBlock *ContinuationBlock = nullptr;
4545 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
4546 // Copied code from createReductions
4547 BasicBlock *InsertBlock = Loc.IP.getBlock();
4548 ContinuationBlock =
4549 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4550 InsertBlock->getTerminator()->eraseFromParent();
4551 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4552 }
4553
4554 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4555 AttributeList FuncAttrs;
4556 AttrBuilder AttrBldr(Ctx);
4557 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4558 AttrBldr.addAttribute(Attr);
4559 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4560 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4561
4562 CodeGenIP = Builder.saveIP();
4563 Expected<Function *> ReductionResult = createReductionFunction(
4564 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4565 ReductionGenCBKind, FuncAttrs);
4566 if (!ReductionResult)
4567 return ReductionResult.takeError();
4568 Function *ReductionFunc = *ReductionResult;
4569 Builder.restoreIP(CodeGenIP);
4570
4571 // Set the grid value in the config needed for lowering later on
4572 if (GridValue.has_value())
4573 Config.setGridValue(GridValue.value());
4574 else
4575 Config.setGridValue(getGridValue(T, ReductionFunc));
4576
4577 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4578 // RedList, shuffle_reduce_func, interwarp_copy_func);
4579 // or
4580 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
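// For orientation, the code emitted below roughly has this shape (sketch
// only; argument lists are abbreviated):
//   res = __kmpc_nvptx_parallel_reduce_nowait_v2(loc, size, RedList,
//                                                shuffle_reduce_fn, interwarp_copy_fn);
//   if (res == 1) {
//     <combine the private copies into the original reduction variables>
//   }
//   <continue after the reduction>
// The teams path instead calls __kmpc_nvptx_teams_reduce_nowait_v2 with the
// additional globalization helper functions created below.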
4581 Value *Res;
4582
4583 // 1. Build a list of reduction variables.
4584 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4585 auto Size = ReductionInfos.size();
4586 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4587 Type *FuncPtrTy =
4588 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4589 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4590 CodeGenIP = Builder.saveIP();
4591 Builder.restoreIP(AllocaIP);
4592 Value *ReductionListAlloca =
4593 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4594 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4595 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4596 Builder.restoreIP(CodeGenIP);
4597 Type *IndexTy = Builder.getIndexTy(
4598 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4599 for (auto En : enumerate(ReductionInfos)) {
4600 const ReductionInfo &RI = En.value();
4601 Value *ElemPtr = Builder.CreateInBoundsGEP(
4602 RedArrayTy, ReductionList,
4603 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4604
4605 Value *PrivateVar = RI.PrivateVariable;
4606 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4607 if (IsByRefElem)
4608 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4609
4610 Value *CastElem =
4611 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4612 Builder.CreateStore(CastElem, ElemPtr);
4613 }
4614 CodeGenIP = Builder.saveIP();
4615 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4616 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4617
4618 if (!SarFunc)
4619 return SarFunc.takeError();
4620
4621 Expected<Function *> CopyResult =
4622 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4623 if (!CopyResult)
4624 return CopyResult.takeError();
4625 Function *WcFunc = *CopyResult;
4626 Builder.restoreIP(CodeGenIP);
4627
4628 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4629
4630 // NOTE: ReductionDataSize is passed as the reduce_data_size
4631 // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
4632 // the runtime implementations do not currently use it. The teams
4633 // runtime reads ReductionDataSize from KernelEnvironmentTy instead
4634 // (set separately via TargetKernelDefaultAttrs). It is computed
4635 // here conservatively as max(element sizes) * N rather than the
4636 // exact sum, which over-calculates the size for mixed reduction
4637 // types but is harmless given the argument is unused.
4638 // TODO: Consider dropping this computation if the runtime API is
4639 // ever revised to remove the unused parameter.
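// Worked example of the conservative size (illustrative): for reductions
// over {double, i32} the max element size is 8 bytes, so with N = 2 the
// value below becomes 8 * 2 = 16 bytes, while the exact sum would be
// 8 + 4 = 12 bytes.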
4640 unsigned MaxDataSize = 0;
4641 SmallVector<Type *> ReductionTypeArgs;
4642 for (auto En : enumerate(ReductionInfos)) {
4643 // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
4644 // the actual data size stored in the global reduction buffer, consistent
4645 // with the ReductionsBufferTy struct used for GEP offsets below.
4646 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4647 ? En.value().ByRefElementType
4648 : En.value().ElementType;
4649 auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
4650 if (Size > MaxDataSize)
4651 MaxDataSize = Size;
4652 ReductionTypeArgs.emplace_back(RedTypeArg);
4653 }
4654 Value *ReductionDataSize =
4655 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4656 if (!IsTeamsReduction) {
4657 Value *SarFuncCast =
4658 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4659 Value *WcFuncCast =
4660 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4661 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4662 WcFuncCast};
4663 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
4664 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4665 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4666 } else {
4667 CodeGenIP = Builder.saveIP();
4668 StructType *ReductionsBufferTy = StructType::create(
4669 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4670 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4671 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4672
4673 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4674 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4675 if (!LtGCFunc)
4676 return LtGCFunc.takeError();
4677
4678 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4679 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4680 if (!LtGRFunc)
4681 return LtGRFunc.takeError();
4682
4683 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4684 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4685 if (!GtLCFunc)
4686 return GtLCFunc.takeError();
4687
4688 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4689 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4690 if (!GtLRFunc)
4691 return GtLRFunc.takeError();
4692
4693 Builder.restoreIP(CodeGenIP);
4694
4695 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4696 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4697
4698 Value *Args3[] = {SrcLocInfo,
4699 KernelTeamsReductionPtr,
4700 Builder.getInt32(ReductionBufNum),
4701 ReductionDataSize,
4702 RL,
4703 *SarFunc,
4704 WcFunc,
4705 *LtGCFunc,
4706 *LtGRFunc,
4707 *GtLCFunc,
4708 *GtLRFunc};
4709
4710 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4711 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4712 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4713 }
4714
4715 // 5. Build if (res == 1)
4716 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4717 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4718 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4719 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4720
4721 // 6. Build then branch: where we have reduced values in the master
4722 // thread in each team.
4723 // __kmpc_end_reduce{_nowait}(<gtid>);
4724 // break;
4725 emitBlock(ThenBB, CurFunc);
4726
4727 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4728 for (auto En : enumerate(ReductionInfos)) {
4729 const ReductionInfo &RI = En.value();
4731 Value *RedValue = RI.Variable;
4732 Value *RHS =
4733 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4734
4736 Value *LHSPtr, *RHSPtr;
4737 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4738 &LHSPtr, &RHSPtr, CurFunc));
4739
4740 // Fix the callback code generated to use the correct Values for the LHS
4741 // and RHS. Cast to match types before replacing (necessary to handle
4742 // different address spaces).
4743 if (LHSPtr->getType() != RedValue->getType())
4744 RedValue = Builder.CreatePointerBitCastOrAddrSpaceCast(
4745 RedValue, LHSPtr->getType());
4746 if (RHSPtr->getType() != RHS->getType())
4747 RHS =
4748 Builder.CreatePointerBitCastOrAddrSpaceCast(RHS, RHSPtr->getType());
4749
4750 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4751 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4752 ReductionFunc;
4753 });
4754 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4755 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4756 ReductionFunc;
4757 });
4758 } else {
4759 if (IsByRef.empty() || !IsByRef[En.index()]) {
4760 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4761 "red.value." + Twine(En.index()));
4762 }
4763 Value *PrivateRedValue = Builder.CreateLoad(
4764 ValueType, RHS, "red.private.value" + Twine(En.index()));
4765 Value *Reduced;
4766 InsertPointOrErrorTy AfterIP =
4767 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4768 if (!AfterIP)
4769 return AfterIP.takeError();
4770 Builder.restoreIP(*AfterIP);
4771
4772 if (!IsByRef.empty() && !IsByRef[En.index()])
4773 Builder.CreateStore(Reduced, RI.Variable);
4774 }
4775 }
4776 emitBlock(ExitBB, CurFunc);
4777 if (ContinuationBlock) {
4778 Builder.CreateBr(ContinuationBlock);
4779 Builder.SetInsertPoint(ContinuationBlock);
4780 }
4781 Config.setEmitLLVMUsed();
4782
4783 return Builder.saveIP();
4784}
4785
4786static Function *getFreshReductionFunc(Module &M) {
4787 Type *VoidTy = Type::getVoidTy(M.getContext());
4788 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4789 auto *FuncTy =
4790 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4791 return Function::Create(FuncTy, GlobalValue::InternalLinkage,
4792 ".omp.reduction.func", &M);
4793}
4794
4795static Error populateReductionFunction(
4796 Function *ReductionFunc,
4797 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4798 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4799 Module *Module = ReductionFunc->getParent();
4800 BasicBlock *ReductionFuncBlock =
4801 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4802 Builder.SetInsertPoint(ReductionFuncBlock);
4803 Value *LHSArrayPtr = nullptr;
4804 Value *RHSArrayPtr = nullptr;
4805 if (IsGPU) {
4806 // Need to alloca memory here and deal with the pointers before getting
4807 // LHS/RHS pointers out
4808 //
4809 Argument *Arg0 = ReductionFunc->getArg(0);
4810 Argument *Arg1 = ReductionFunc->getArg(1);
4811 Type *Arg0Type = Arg0->getType();
4812 Type *Arg1Type = Arg1->getType();
4813
4814 Value *LHSAlloca =
4815 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4816 Value *RHSAlloca =
4817 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4818 Value *LHSAddrCast =
4819 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4820 Value *RHSAddrCast =
4821 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4822 Builder.CreateStore(Arg0, LHSAddrCast);
4823 Builder.CreateStore(Arg1, RHSAddrCast);
4824 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4825 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4826 } else {
4827 LHSArrayPtr = ReductionFunc->getArg(0);
4828 RHSArrayPtr = ReductionFunc->getArg(1);
4829 }
4830
4831 unsigned NumReductions = ReductionInfos.size();
4832 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4833
4834 for (auto En : enumerate(ReductionInfos)) {
4835 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4836 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4837 RedArrayTy, LHSArrayPtr, 0, En.index());
4838 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4839 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4840 LHSI8Ptr, RI.Variable->getType());
4841 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4842 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4843 RedArrayTy, RHSArrayPtr, 0, En.index());
4844 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4845 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4846 RHSI8Ptr, RI.PrivateVariable->getType());
4847 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4848 Value *Reduced;
4849 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4850 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4851 if (!AfterIP)
4852 return AfterIP.takeError();
4853
4854 Builder.restoreIP(*AfterIP);
4855 // TODO: Consider flagging an error.
4856 if (!Builder.GetInsertBlock())
4857 return Error::success();
4858
4859 // The store is inside the reduction region when using by-ref.
4860 if (!IsByRef[En.index()])
4861 Builder.CreateStore(Reduced, LHSPtr);
4862 }
4863 Builder.CreateRetVoid();
4864 return Error::success();
4865}
4866
4867OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
4868 const LocationDescription &Loc, InsertPointTy AllocaIP,
4869 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4870 bool IsNoWait, bool IsTeamsReduction) {
4871 assert(ReductionInfos.size() == IsByRef.size());
4872 if (Config.isGPU())
4873 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4874 IsByRef, IsNoWait, IsTeamsReduction);
4875
4876 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4877
4878 if (!updateToLocation(Loc))
4879 return InsertPointTy();
4880
4881 if (ReductionInfos.size() == 0)
4882 return Builder.saveIP();
4883
4884 BasicBlock *InsertBlock = Loc.IP.getBlock();
4885 BasicBlock *ContinuationBlock =
4886 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4887 InsertBlock->getTerminator()->eraseFromParent();
4888
4889 // Create and populate array of type-erased pointers to private reduction
4890 // values.
4891 unsigned NumReductions = ReductionInfos.size();
4892 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4893 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4894 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4895
4896 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4897
4898 for (auto En : enumerate(ReductionInfos)) {
4899 unsigned Index = En.index();
4900 const ReductionInfo &RI = En.value();
4901 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4902 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4903 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4904 }
4905
4906 // Emit a call to the runtime function that orchestrates the reduction.
4907 // Declare the reduction function in the process.
4908 Type *IndexTy = Builder.getIndexTy(
4909 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4910 Function *Func = Builder.GetInsertBlock()->getParent();
4911 Module *Module = Func->getParent();
4912 uint32_t SrcLocStrSize;
4913 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4914 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4915 return RI.AtomicReductionGen;
4916 });
4917 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4918 CanGenerateAtomic
4919 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4920 : IdentFlag(0));
4921 Value *ThreadId = getOrCreateThreadID(Ident);
4922 Constant *NumVariables = Builder.getInt32(NumReductions);
4923 const DataLayout &DL = Module->getDataLayout();
4924 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4925 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4926 Function *ReductionFunc = getFreshReductionFunc(*Module);
4927 Value *Lock = getOMPCriticalRegionLock(".reduction");
4928 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4929 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4930 : RuntimeFunction::OMPRTL___kmpc_reduce);
4931 CallInst *ReduceCall =
4932 createRuntimeFunctionCall(ReduceFunc,
4933 {Ident, ThreadId, NumVariables, RedArraySize,
4934 RedArray, ReductionFunc, Lock},
4935 "reduce");
4936
4937 // Create final reduction entry blocks for the atomic and non-atomic case.
4938 // Emit IR that dispatches control flow to one of the blocks based on the
4939 // reduction supporting the atomic mode.
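// Conceptually, the emitted dispatch looks roughly like this (sketch only):
//   switch (__kmpc_reduce{_nowait}(...)) {
//   case 1: <non-atomic elementwise reduction>;
//           __kmpc_end_reduce{_nowait}(<gtid>); break;
//   case 2: <atomic elementwise reduction>; break;
//   default: break; // nothing to reduce on this thread
//   }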
4940 BasicBlock *NonAtomicRedBlock =
4941 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4942 BasicBlock *AtomicRedBlock =
4943 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4944 SwitchInst *Switch =
4945 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4946 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4947 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4948
4949 // Populate the non-atomic reduction using the elementwise reduction function.
4950 // This loads the elements from the global and private variables and reduces
4951 // them before storing back the result to the global variable.
4952 Builder.SetInsertPoint(NonAtomicRedBlock);
4953 for (auto En : enumerate(ReductionInfos)) {
4954 const ReductionInfo &RI = En.value();
4955 Type *ValueType = RI.ElementType;
4956 // We have one less load for the by-ref case because that load is now
4957 // inside the reduction region.
4958 Value *RedValue = RI.Variable;
4959 if (!IsByRef[En.index()]) {
4960 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4961 "red.value." + Twine(En.index()));
4962 }
4963 Value *PrivateRedValue =
4964 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4965 "red.private.value." + Twine(En.index()));
4966 Value *Reduced;
4967 InsertPointOrErrorTy AfterIP =
4968 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4969 if (!AfterIP)
4970 return AfterIP.takeError();
4971 Builder.restoreIP(*AfterIP);
4972
4973 if (!Builder.GetInsertBlock())
4974 return InsertPointTy();
4975 // For the by-ref case, the store is inside the reduction region.
4976 if (!IsByRef[En.index()])
4977 Builder.CreateStore(Reduced, RI.Variable);
4978 }
4979 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4980 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4981 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4982 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4983 Builder.CreateBr(ContinuationBlock);
4984
4985 // Populate the atomic reduction using the atomic elementwise reduction
4986 // function. There are no loads/stores here because they will be happening
4987 // inside the atomic elementwise reduction.
4988 Builder.SetInsertPoint(AtomicRedBlock);
4989 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4990 for (const ReductionInfo &RI : ReductionInfos) {
4991 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4992 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4993 if (!AfterIP)
4994 return AfterIP.takeError();
4995 Builder.restoreIP(*AfterIP);
4996 if (!Builder.GetInsertBlock())
4997 return InsertPointTy();
4998 }
4999 Builder.CreateBr(ContinuationBlock);
5000 } else {
5001 Builder.CreateUnreachable();
5002 }
5003
5004 // Populate the outlined reduction function using the elementwise reduction
5005 // function. Partial values are extracted from the type-erased array of
5006 // pointers to private variables.
5007 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
5008 IsByRef, /*isGPU=*/false);
5009 if (Err)
5010 return Err;
5011
5012 if (!Builder.GetInsertBlock())
5013 return InsertPointTy();
5014
5015 Builder.SetInsertPoint(ContinuationBlock);
5016 return Builder.saveIP();
5017}
5018
5019OpenMPIRBuilder::InsertPointOrErrorTy
5020OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
5021 BodyGenCallbackTy BodyGenCB,
5022 FinalizeCallbackTy FiniCB) {
5023 if (!updateToLocation(Loc))
5024 return Loc.IP;
5025
5026 Directive OMPD = Directive::OMPD_master;
5027 uint32_t SrcLocStrSize;
5028 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5029 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5030 Value *ThreadId = getOrCreateThreadID(Ident);
5031 Value *Args[] = {Ident, ThreadId};
5032
5033 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
5034 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5035
5036 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
5037 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
5038
5039 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5040 /*Conditional*/ true, /*hasFinalize*/ true);
5041}
5042
5043OpenMPIRBuilder::InsertPointOrErrorTy
5044OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
5045 BodyGenCallbackTy BodyGenCB,
5046 FinalizeCallbackTy FiniCB, Value *Filter) {
5047 if (!updateToLocation(Loc))
5048 return Loc.IP;
5049
5050 Directive OMPD = Directive::OMPD_masked;
5051 uint32_t SrcLocStrSize;
5052 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5053 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5054 Value *ThreadId = getOrCreateThreadID(Ident);
5055 Value *Args[] = {Ident, ThreadId, Filter};
5056 Value *ArgsEnd[] = {Ident, ThreadId};
5057
5058 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
5059 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5060
5061 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
5062 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
5063
5064 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5065 /*Conditional*/ true, /*hasFinalize*/ true);
5066}
5067
5068static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
5069 llvm::FunctionCallee Callee,
5070 ArrayRef<llvm::Value *> Args,
5071 const llvm::Twine &Name) {
5072 llvm::CallInst *Call = Builder.CreateCall(
5073 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
5074 Call->setDoesNotThrow();
5075 return Call;
5076}
5077
5078// Expects the input basic block to be dominated by BeforeScanBB. Once the
5079// scan directive is encountered, the code after it must be dominated by
5080// AfterScanBB. The scan directive splits the code sequence into an input
5081// phase and a scan phase. Depending on whether the inclusive or exclusive
5082// clause is used and on whether the input loop or the scan loop is being
5083// lowered, jumps to the input and scan phases are inserted. The first scan
5084// loop is the input loop and the second is the scan loop. The generated
5085// code currently handles only inclusive scans.
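// Rough sketch of the overall lowering for an inclusive scan (illustrative
// only; the parallel prefix computation over the buffer is emitted
// separately):
//   for (i : 0..<n>) { <input phase>; buffer[i] = red; }  // input loop
//   <prefix-scan the buffer>
//   for (i : 0..<n>) { red = buffer[i]; <scan phase>; }   // scan loop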
5086OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
5087 const LocationDescription &Loc, InsertPointTy AllocaIP,
5088 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
5089 bool IsInclusive, ScanInfo *ScanRedInfo) {
5090 if (ScanRedInfo->OMPFirstScanLoop) {
5091 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
5092 ScanVarsType, ScanRedInfo);
5093 if (Err)
5094 return Err;
5095 }
5096 if (!updateToLocation(Loc))
5097 return Loc.IP;
5098
5099 llvm::Value *IV = ScanRedInfo->IV;
5100
5101 if (ScanRedInfo->OMPFirstScanLoop) {
5102 // Emit buffer[i] = red; at the end of the input phase.
5103 for (size_t i = 0; i < ScanVars.size(); i++) {
5104 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5105 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5106 Type *DestTy = ScanVarsType[i];
5107 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5108 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
5109
5110 Builder.CreateStore(Src, Val);
5111 }
5112 }
5113 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5114 emitBlock(ScanRedInfo->OMPScanDispatch,
5115 Builder.GetInsertBlock()->getParent());
5116
5117 if (!ScanRedInfo->OMPFirstScanLoop) {
5118 IV = ScanRedInfo->IV;
5119 // Emit red = buffer[i]; at the entrance to the scan phase.
5120 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
5121 for (size_t i = 0; i < ScanVars.size(); i++) {
5122 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5123 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5124 Type *DestTy = ScanVarsType[i];
5125 Value *SrcPtr =
5126 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5127 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
5128 Builder.CreateStore(Src, ScanVars[i]);
5129 }
5130 }
5131
5132 // TODO: Update it to CreateBr and remove dead blocks
5133 llvm::Value *CmpI = Builder.getInt1(true);
5134 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
5135 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
5136 ScanRedInfo->OMPAfterScanBlock);
5137 } else {
5138 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
5139 ScanRedInfo->OMPBeforeScanBlock);
5140 }
5141 emitBlock(ScanRedInfo->OMPAfterScanBlock,
5142 Builder.GetInsertBlock()->getParent());
5143 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
5144 return Builder.saveIP();
5145}
5146
5147Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
5148 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
5149 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
5150
5151 Builder.restoreIP(AllocaIP);
5152 // Create the shared pointer at alloca IP.
5153 for (size_t i = 0; i < ScanVars.size(); i++) {
5154 llvm::Value *BuffPtr =
5155 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
5156 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
5157 }
5158
5159 // Allocate temporary buffer by master thread
5160 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5161 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5162 Builder.restoreIP(CodeGenIP);
5163 Value *AllocSpan =
5164 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
5165 for (size_t i = 0; i < ScanVars.size(); i++) {
5166 Type *IntPtrTy = Builder.getInt32Ty();
5167 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
5168 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
5169 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
5170 AllocSpan, nullptr, "arr");
5171 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
5172 }
5173 return Error::success();
5174 };
5175 // TODO: Perform finalization actions for variables. This has to be
5176 // called for variables which have destructors/finalizers.
5177 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5178
5179 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
5180 llvm::Value *FilterVal = Builder.getInt32(0);
5181 InsertPointOrErrorTy AfterIP =
5182 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5183
5184 if (!AfterIP)
5185 return AfterIP.takeError();
5186 Builder.restoreIP(*AfterIP);
5187 BasicBlock *InputBB = Builder.GetInsertBlock();
5188 if (InputBB->hasTerminator())
5189 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5190 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5191 if (!AfterIP)
5192 return AfterIP.takeError();
5193 Builder.restoreIP(*AfterIP);
5194
5195 return Error::success();
5196}
5197
5198Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
5199 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
5200 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5201 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5202 Builder.restoreIP(CodeGenIP);
5203 for (ReductionInfo RedInfo : ReductionInfos) {
5204 Value *PrivateVar = RedInfo.PrivateVariable;
5205 Value *OrigVar = RedInfo.Variable;
5206 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
5207 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5208
5209 Type *SrcTy = RedInfo.ElementType;
5210 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
5211 "arrayOffset");
5212 Value *Src = Builder.CreateLoad(SrcTy, Val);
5213
5214 Builder.CreateStore(Src, OrigVar);
5215 Builder.CreateFree(Buff);
5216 }
5217 return Error::success();
5218 };
5219 // TODO: Perform finalization actions for variables. This has to be
5220 // called for variables which have destructors/finalizers.
5221 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5222
5223 if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
5224 Builder.SetInsertPoint(TI);
5225 else
5226 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
5227
5228 llvm::Value *FilterVal = Builder.getInt32(0);
5229 InsertPointOrErrorTy AfterIP =
5230 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5231
5232 if (!AfterIP)
5233 return AfterIP.takeError();
5234 Builder.restoreIP(*AfterIP);
5235 BasicBlock *InputBB = Builder.GetInsertBlock();
5236 if (InputBB->hasTerminator())
5237 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5238 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5239 if (!AfterIP)
5240 return AfterIP.takeError();
5241 Builder.restoreIP(*AfterIP);
5242 return Error::success();
5243}
5244
5246 const LocationDescription &Loc,
5248 ScanInfo *ScanRedInfo) {
5249
5250 if (!updateToLocation(Loc))
5251 return Loc.IP;
5252 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5253 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5254 Builder.restoreIP(CodeGenIP);
5255 Function *CurFn = Builder.GetInsertBlock()->getParent();
5256 // for (int k = 0; k <= ceil(log2(n)); ++k)
5257 llvm::BasicBlock *LoopBB =
5258 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
5259 llvm::BasicBlock *ExitBB =
5260 splitBB(Builder, false, "omp.outer.log.scan.exit");
5261 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
5262 Builder.GetInsertBlock()->getModule(),
5263 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5264 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
5265 llvm::Value *Arg =
5266 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5267 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
5268 F = llvm::Intrinsic::getOrInsertDeclaration(
5269 Builder.GetInsertBlock()->getModule(),
5270 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5271 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5272 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5273 llvm::Value *NMin1 = Builder.CreateNUWSub(
5274 ScanRedInfo->Span,
5275 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5276 Builder.SetInsertPoint(InputBB);
5277 Builder.CreateBr(LoopBB);
5278 emitBlock(LoopBB, CurFn);
5279 Builder.SetInsertPoint(LoopBB);
5280
5281 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5282 // size pow2k = 1;
5283 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5284 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5285 InputBB);
5286 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5287 InputBB);
5288 // for (size i = n - 1; i >= 2 ^ k; --i)
5289 // tmp[i] op= tmp[i-pow2k];
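// Worked example (illustrative, op = +, n = 4, tmp = [a, b, c, d]):
//   k = 0, pow2k = 1: tmp = [a, a+b, b+c, c+d]
//   k = 1, pow2k = 2: tmp = [a, a+b, a+b+c, a+b+c+d]
// After ceil(log2(n)) rounds tmp holds the inclusive prefix scan.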
5290 llvm::BasicBlock *InnerLoopBB =
5291 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5292 llvm::BasicBlock *InnerExitBB =
5293 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5294 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5295 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5296 emitBlock(InnerLoopBB, CurFn);
5297 Builder.SetInsertPoint(InnerLoopBB);
5298 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5299 IVal->addIncoming(NMin1, LoopBB);
5300 for (ReductionInfo RedInfo : ReductionInfos) {
5301 Value *ReductionVal = RedInfo.PrivateVariable;
5302 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5303 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5304 Type *DestTy = RedInfo.ElementType;
5305 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5306 Value *LHSPtr =
5307 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5308 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5309 Value *RHSPtr =
5310 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5311 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5312 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5313 llvm::Value *Result;
5314 InsertPointOrErrorTy AfterIP =
5315 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5316 if (!AfterIP)
5317 return AfterIP.takeError();
5318 Builder.CreateStore(Result, LHSPtr);
5319 }
5320 llvm::Value *NextIVal = Builder.CreateNUWSub(
5321 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5322 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5323 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5324 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5325 emitBlock(InnerExitBB, CurFn);
5326 llvm::Value *Next = Builder.CreateNUWAdd(
5327 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5328 Counter->addIncoming(Next, Builder.GetInsertBlock());
5329 // pow2k <<= 1;
5330 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5331 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5332 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5333 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5334 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5335 return Error::success();
5336 };
5337
5338 // TODO: Perform finalization actions for variables. This has to be
5339 // called for variables which have destructors/finalizers.
5340 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5341
5342 llvm::Value *FilterVal = Builder.getInt32(0);
5343 InsertPointOrErrorTy AfterIP =
5344 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5345
5346 if (!AfterIP)
5347 return AfterIP.takeError();
5348 Builder.restoreIP(*AfterIP);
5349 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5350
5351 if (!AfterIP)
5352 return AfterIP.takeError();
5353 Builder.restoreIP(*AfterIP);
5354 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5355 if (Err)
5356 return Err;
5357
5358 return AfterIP;
5359}
5360
5361Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5362 llvm::function_ref<Error()> InputLoopGen,
5363 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5364 ScanInfo *ScanRedInfo) {
5365
5366 {
5367 // Emit loop with input phase:
5368 // for (i: 0..<num_iters>) {
5369 // <input phase>;
5370 // buffer[i] = red;
5371 // }
5372 ScanRedInfo->OMPFirstScanLoop = true;
5373 Error Err = InputLoopGen();
5374 if (Err)
5375 return Err;
5376 }
5377 {
5378 // Emit loop with scan phase:
5379 // for (i: 0..<num_iters>) {
5380 // red = buffer[i];
5381 // <scan phase>;
5382 // }
5383 ScanRedInfo->OMPFirstScanLoop = false;
5384 Error Err = ScanLoopGen(Builder.saveIP());
5385 if (Err)
5386 return Err;
5387 }
5388 return Error::success();
5389}
5390
5391void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5392 Function *Fun = Builder.GetInsertBlock()->getParent();
5393 ScanRedInfo->OMPScanDispatch =
5394 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5395 ScanRedInfo->OMPAfterScanBlock =
5396 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5397 ScanRedInfo->OMPBeforeScanBlock =
5398 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5399 ScanRedInfo->OMPScanLoopExit =
5400 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5401}
5402CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
5403 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5404 BasicBlock *PostInsertBefore, const Twine &Name) {
5405 Module *M = F->getParent();
5406 LLVMContext &Ctx = M->getContext();
5407 Type *IndVarTy = TripCount->getType();
5408
5409 // Create the basic block structure.
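// The resulting skeleton, roughly (the body is left empty for callers to
// fill in):
//   preheader -> header -> cond --(iv < tripcount)--> body -> latch -> header
//                           `--(otherwise)----------> exit -> after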
5410 BasicBlock *Preheader =
5411 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5412 BasicBlock *Header =
5413 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5414 BasicBlock *Cond =
5415 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5416 BasicBlock *Body =
5417 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5418 BasicBlock *Latch =
5419 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5420 BasicBlock *Exit =
5421 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5422 BasicBlock *After =
5423 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5424
5425 // Use specified DebugLoc for new instructions.
5426 Builder.SetCurrentDebugLocation(DL);
5427
5428 Builder.SetInsertPoint(Preheader);
5429 Builder.CreateBr(Header);
5430
5431 Builder.SetInsertPoint(Header);
5432 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5433 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5434 Builder.CreateBr(Cond);
5435
5436 Builder.SetInsertPoint(Cond);
5437 Value *Cmp =
5438 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5439 Builder.CreateCondBr(Cmp, Body, Exit);
5440
5441 Builder.SetInsertPoint(Body);
5442 Builder.CreateBr(Latch);
5443
5444 Builder.SetInsertPoint(Latch);
5445 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5446 "omp_" + Name + ".next", /*HasNUW=*/true);
5447 Builder.CreateBr(Header);
5448 IndVarPHI->addIncoming(Next, Latch);
5449
5450 Builder.SetInsertPoint(Exit);
5451 Builder.CreateBr(After);
5452
5453 // Remember and return the canonical control flow.
5454 LoopInfos.emplace_front();
5455 CanonicalLoopInfo *CL = &LoopInfos.front();
5456
5457 CL->Header = Header;
5458 CL->Cond = Cond;
5459 CL->Latch = Latch;
5460 CL->Exit = Exit;
5461
5462#ifndef NDEBUG
5463 CL->assertOK();
5464#endif
5465 return CL;
5466}
5467
5468Expected<CanonicalLoopInfo *>
5469OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
5470 LoopBodyGenCallbackTy BodyGenCB,
5471 Value *TripCount, const Twine &Name) {
5472 BasicBlock *BB = Loc.IP.getBlock();
5473 BasicBlock *NextBB = BB->getNextNode();
5474
5475 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5476 NextBB, NextBB, Name);
5477 BasicBlock *After = CL->getAfter();
5478
5479 // If location is not set, don't connect the loop.
5480 if (updateToLocation(Loc)) {
5481 // Split the loop at the insertion point: Branch to the preheader and move
5482 // every following instruction to after the loop (the After BB). Also, the
5483 // new successor is the loop's after block.
5484 spliceBB(Builder, After, /*CreateBranch=*/false);
5485 Builder.CreateBr(CL->getPreheader());
5486 }
5487
5488 // Emit the body content. We do it after connecting the loop to the CFG to
5489 // avoid that the callback encounters degenerate BBs.
5490 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5491 return Err;
5492
5493#ifndef NDEBUG
5494 CL->assertOK();
5495#endif
5496 return CL;
5497}
5498
5500 ScanInfos.emplace_front();
5501 ScanInfo *Result = &ScanInfos.front();
5502 return Result;
5503}
5504
5508 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5509 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5510 LocationDescription ComputeLoc =
5511 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5512 updateToLocation(ComputeLoc);
5513
5515
5517 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5518 ScanRedInfo->Span = TripCount;
5519 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5520 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5521
5522 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5523 Builder.restoreIP(CodeGenIP);
5524 ScanRedInfo->IV = IV;
5525 createScanBBs(ScanRedInfo);
5526 BasicBlock *InputBlock = Builder.GetInsertBlock();
5527 Instruction *Terminator = InputBlock->getTerminator();
5528 assert(Terminator->getNumSuccessors() == 1);
5529 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5530 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5531 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5532 Builder.GetInsertBlock()->getParent());
5533 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5534 emitBlock(ScanRedInfo->OMPScanLoopExit,
5535 Builder.GetInsertBlock()->getParent());
5536 Builder.CreateBr(ContinueBlock);
5537 Builder.SetInsertPoint(
5538 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5539 return BodyGenCB(Builder.saveIP(), IV);
5540 };
5541
5542 const auto &&InputLoopGen = [&]() -> Error {
5543 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
5544 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5545 ComputeIP, Name, true, ScanRedInfo);
5546 if (!LoopInfo)
5547 return LoopInfo.takeError();
5548 Result.push_back(*LoopInfo);
5549 Builder.restoreIP((*LoopInfo)->getAfterIP());
5550 return Error::success();
5551 };
5552 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5553 Expected<CanonicalLoopInfo *> LoopInfo =
5554 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5555 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5556 if (!LoopInfo)
5557 return LoopInfo.takeError();
5558 Result.push_back(*LoopInfo);
5559 Builder.restoreIP((*LoopInfo)->getAfterIP());
5560 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5561 return Error::success();
5562 };
5563 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5564 if (Err)
5565 return Err;
5566 return Result;
5567}
5568
5569Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
5570 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5571 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5572
5573 // Consider the following difficulties (assuming 8-bit signed integers):
5574 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5575 // DO I = 1, 100, 50
5576 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
5577 // DO I = 100, 0, -128
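// Worked example for the computation below (illustrative): for
// DO I = 1, 100, 50 with an inclusive stop, Span = 99 and Incr = 50, so
// CountIfLooping = 99 / 50 + 1 = 2, matching the two iterations I = 1, 51.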
5578
5579 // Start, Stop and Step must be of the same integer type.
5580 auto *IndVarTy = cast<IntegerType>(Start->getType());
5581 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5582 assert(IndVarTy == Step->getType() && "Step type mismatch");
5583
5584 updateToLocation(Loc);
5585
5586 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5587 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5588
5589 // Like Step, but always positive.
5590 Value *Incr = Step;
5591
5592 // Distance between Start and Stop; always positive.
5593 Value *Span;
5594
5595 // Condition indicating whether no iterations are executed at all, e.g.
5596 // because UB < LB.
5597 Value *ZeroCmp;
5598
5599 if (IsSigned) {
5600 // Ensure that increment is positive. If not, negate and invert LB and UB.
5601 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5602 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5603 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5604 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5605 Span = Builder.CreateSub(UB, LB, "", false, true);
5606 ZeroCmp = Builder.CreateICmp(
5607 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5608 } else {
5609 Span = Builder.CreateSub(Stop, Start, "", true);
5610 ZeroCmp = Builder.CreateICmp(
5611 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5612 }
5613
5614 Value *CountIfLooping;
5615 if (InclusiveStop) {
5616 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5617 } else {
5618 // Avoid incrementing past stop since it could overflow.
5619 Value *CountIfTwo = Builder.CreateAdd(
5620 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5621 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5622 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5623 }
5624
5625 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5626 "omp_" + Name + ".tripcount");
5627}
5628
5629Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
5630 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5631 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5632 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5633 ScanInfo *ScanRedInfo) {
5634 LocationDescription ComputeLoc =
5635 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5636
5637 Value *TripCount = calculateCanonicalLoopTripCount(
5638 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5639
5640 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5641 Builder.restoreIP(CodeGenIP);
5642 Value *Span = Builder.CreateMul(IV, Step);
5643 Value *IndVar = Builder.CreateAdd(Span, Start);
5644 if (InScan)
5645 ScanRedInfo->IV = IndVar;
5646 return BodyGenCB(Builder.saveIP(), IndVar);
5647 };
5648 LocationDescription LoopLoc =
5649 ComputeIP.isSet()
5650 ? Loc
5651 : LocationDescription(Builder.saveIP(),
5652 Builder.getCurrentDebugLocation());
5653 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5654}
5655
5656// Returns an LLVM function to call for initializing loop bounds using OpenMP
5657// static scheduling for composite `distribute parallel for` depending on
5658// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5659// integers as unsigned similarly to CanonicalLoopInfo.
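// For reference (simplified sketch, not the authoritative runtime
// prototype), the 32-bit entry point takes roughly:
//   __kmpc_dist_for_static_init_4u(loc, gtid, schedtype, plastiter,
//                                  plower, pupper, pupperDist, pstride,
//                                  incr, chunk);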
5660static FunctionCallee
5661getKmpcDistForStaticInitForType(Type *Ty, Module &M,
5662 OpenMPIRBuilder &OMPBuilder) {
5663 unsigned Bitwidth = Ty->getIntegerBitWidth();
5664 if (Bitwidth == 32)
5665 return OMPBuilder.getOrCreateRuntimeFunction(
5666 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5667 if (Bitwidth == 64)
5668 return OMPBuilder.getOrCreateRuntimeFunction(
5669 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5670 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5671}
5672
5673// Returns an LLVM function to call for initializing loop bounds using OpenMP
5674// static scheduling depending on `type`. Only i32 and i64 are supported by the
5675// runtime. Always interpret integers as unsigned similarly to
5676// CanonicalLoopInfo.
5677static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
5678 OpenMPIRBuilder &OMPBuilder) {
5679 unsigned Bitwidth = Ty->getIntegerBitWidth();
5680 if (Bitwidth == 32)
5681 return OMPBuilder.getOrCreateRuntimeFunction(
5682 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5683 if (Bitwidth == 64)
5684 return OMPBuilder.getOrCreateRuntimeFunction(
5685 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5686 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5687}
5688
5689OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5690 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5691 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5692 OMPScheduleType DistScheduleSchedType) {
5693 assert(CLI->isValid() && "Requires a valid canonical loop");
5694 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5695 "Require dedicated allocate IP");
5696
5697 // Set up the source location value for OpenMP runtime.
5698 Builder.restoreIP(CLI->getPreheaderIP());
5699 Builder.SetCurrentDebugLocation(DL);
5700
5701 uint32_t SrcLocStrSize;
5702 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5703 IdentFlag Flag = IdentFlag(0);
5704 switch (LoopType) {
5705 case WorksharingLoopType::ForStaticLoop:
5706 Flag = OMP_IDENT_FLAG_WORK_LOOP;
5707 break;
5708 case WorksharingLoopType::DistributeStaticLoop:
5709 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5710 break;
5711 case WorksharingLoopType::DistributeForStaticLoop:
5712 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
5713 break;
5714 }
5715 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5716
5717 // Declare useful OpenMP runtime functions.
5718 Value *IV = CLI->getIndVar();
5719 Type *IVTy = IV->getType();
5720 FunctionCallee StaticInit =
5721 LoopType == WorksharingLoopType::DistributeForStaticLoop
5722 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5723 : getKmpcForStaticInitForType(IVTy, M, *this);
5724 FunctionCallee StaticFini =
5725 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5726
5727 // Allocate space for computed loop bounds as expected by the "init" function.
5728 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5729
5730 Type *I32Type = Type::getInt32Ty(M.getContext());
5731 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5732 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5733 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5734 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5735 CLI->setLastIter(PLastIter);
5736
5737 // At the end of the preheader, prepare for calling the "init" function by
5738 // storing the current loop bounds into the allocated space. A canonical loop
5739 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5740 // and produces an inclusive upper bound.
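// For example (illustrative): with a trip count of 10 the stores below pass
// lower bound 0 and inclusive upper bound 9 to the "init" call; the runtime
// then narrows [lb, ub] to this thread's chunk, and the loop trip count is
// recomputed as ub - lb + 1 further down.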
5741 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5742 Constant *Zero = ConstantInt::get(IVTy, 0);
5743 Constant *One = ConstantInt::get(IVTy, 1);
5744 Builder.CreateStore(Zero, PLowerBound);
5745 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5746 Builder.CreateStore(UpperBound, PUpperBound);
5747 Builder.CreateStore(One, PStride);
5748
5749 Value *ThreadNum =
5750 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5751
5752 OMPScheduleType SchedType =
5753 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5754 ? OMPScheduleType::OrderedDistribute
5755 : OMPScheduleType::UnorderedStatic;
5756 Constant *SchedulingType =
5757 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5758
5759 // Call the "init" function and update the trip count of the loop with the
5760 // value it produced.
5761 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5762 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5763 this](Value *SchedulingType, auto &Builder) {
5764 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5765 PLowerBound, PUpperBound});
5766 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5767 Value *PDistUpperBound =
5768 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5769 Args.push_back(PDistUpperBound);
5770 }
5771 Args.append({PStride, One, Zero});
5772 createRuntimeFunctionCall(StaticInit, Args);
5773 };
5774 BuildInitCall(SchedulingType, Builder);
5775 if (HasDistSchedule &&
5776 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5777 Constant *DistScheduleSchedType = ConstantInt::get(
5778 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5779 // We want to emit a second init function call for the dist_schedule clause
5780 // of the Distribute construct. This should only be done, however, if a
5781 // worksharing loop is nested within a distribute construct.
5782 BuildInitCall(DistScheduleSchedType, Builder);
5783 }
5784 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5785 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5786 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5787 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5788 CLI->setTripCount(TripCount);
5789
5790 // Update all uses of the induction variable except the one in the condition
5791 // block that compares it with the actual upper bound, and the increment in
5792 // the latch block.
5793
5794 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5795 Builder.SetInsertPoint(CLI->getBody(),
5796 CLI->getBody()->getFirstInsertionPt());
5797 Builder.SetCurrentDebugLocation(DL);
5798 return Builder.CreateAdd(OldIV, LowerBound);
5799 });
5800
5801 // In the "exit" block, call the "fini" function.
5802 Builder.SetInsertPoint(CLI->getExit(),
5803 CLI->getExit()->getTerminator()->getIterator());
5804 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5805
5806 // Add the barrier if requested.
5807 if (NeedsBarrier) {
5808 InsertPointOrErrorTy BarrierIP =
5809 createBarrier(LocationDescription(Builder.saveIP(), DL),
5810 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5811 /* CheckCancelFlag */ false);
5812 if (!BarrierIP)
5813 return BarrierIP.takeError();
5814 }
5815
5816 InsertPointTy AfterIP = CLI->getAfterIP();
5817 CLI->invalidate();
5818
5819 return AfterIP;
5820}
5821
5822static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5823 LoopInfo &LI);
5824static void addLoopMetadata(CanonicalLoopInfo *Loop,
5825 ArrayRef<Metadata *> Properties);
5826
5827static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
5828 LLVMContext &Ctx, Loop *Loop,
5829 LoopInfo &LoopInfo,
5830 SmallVector<Metadata *> &LoopMDList) {
5831 SmallSet<BasicBlock *, 8> Reachable;
5832
5833 // Get the basic blocks from the loop in which memref instructions
5834 // can be found.
5835 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5836 // preferably without running any passes.
5837 for (BasicBlock *Block : Loop->getBlocks()) {
5838 if (Block == CLI->getCond() || Block == CLI->getHeader())
5839 continue;
5840 Reachable.insert(Block);
5841 }
5842
5843 // Add access group metadata to memory-access instructions.
5844 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5845 for (BasicBlock *BB : Reachable)
5846 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5847 // TODO: If the loop has existing parallel access metadata, have
5848 // to combine two lists.
5849 LoopMDList.push_back(MDNode::get(
5850 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5851}
5852
5854OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5855 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5856 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5857 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5858 assert(CLI->isValid() && "Requires a valid canonical loop");
5859 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5860
5861 LLVMContext &Ctx = CLI->getFunction()->getContext();
5862 Value *IV = CLI->getIndVar();
5863 Value *OrigTripCount = CLI->getTripCount();
5864 Type *IVTy = IV->getType();
5865 assert(IVTy->getIntegerBitWidth() <= 64 &&
5866 "Max supported tripcount bitwidth is 64 bits");
5867 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5868 : Type::getInt64Ty(Ctx);
5869 Type *I32Type = Type::getInt32Ty(M.getContext());
5870 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5871 Constant *One = ConstantInt::get(InternalIVTy, 1);
5872
5873 Function *F = CLI->getFunction();
5874 // Blocks must have terminators.
5875 // FIXME: Don't run analyses on incomplete/invalid IR.
5876 SmallVector<Instruction *> UIs;
5877 for (BasicBlock &BB : *F)
5878 if (!BB.hasTerminator())
5879 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5880  FunctionAnalysisManager FAM;
5881  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5882 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5883 LoopAnalysis LIA;
5884 LoopInfo &&LI = LIA.run(*F, FAM);
5885 for (Instruction *I : UIs)
5886 I->eraseFromParent();
5887 Loop *L = LI.getLoopFor(CLI->getHeader());
5888 SmallVector<Metadata *> LoopMDList;
5889 if (ChunkSize || DistScheduleChunkSize)
5890 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5891 addLoopMetadata(CLI, LoopMDList);
5892
5893 // Declare useful OpenMP runtime functions.
5894 FunctionCallee StaticInit =
5895 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5896 FunctionCallee StaticFini =
5897 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5898
5899 // Allocate space for computed loop bounds as expected by the "init" function.
5900 Builder.restoreIP(AllocaIP);
5901 Builder.SetCurrentDebugLocation(DL);
5902 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5903 Value *PLowerBound =
5904 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5905 Value *PUpperBound =
5906 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5907 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5908 CLI->setLastIter(PLastIter);
5909
5910 // Set up the source location value for the OpenMP runtime.
5911 Builder.restoreIP(CLI->getPreheaderIP());
5912 Builder.SetCurrentDebugLocation(DL);
5913
5914 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5915 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5916 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5917 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5918 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5919 "distschedulechunksize");
5920 Value *CastedTripCount =
5921 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5922
5923 Constant *SchedulingType =
5924 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5925 Constant *DistSchedulingType =
5926 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5927 Builder.CreateStore(Zero, PLowerBound);
5928 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5929 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5930 Value *UpperBound =
5931 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5932 Builder.CreateStore(UpperBound, PUpperBound);
5933 Builder.CreateStore(One, PStride);
5934
5935 // Call the "init" function and update the trip count of the loop with the
5936 // value it produced.
5937 uint32_t SrcLocStrSize;
5938 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5939 IdentFlag Flag = OMP_IDENT_FLAG_WORK_LOOP;
5940 if (DistScheduleSchedType != OMPScheduleType::None) {
5941 Flag |= OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5942 }
5943 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5944 Value *ThreadNum =
5945 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5946 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5947 PUpperBound, PStride, One,
5948 this](Value *SchedulingType, Value *ChunkSize,
5949 auto &Builder) {
5950    createRuntimeFunctionCall(
5951        StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5952 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5953 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5954 /*pstride=*/PStride, /*incr=*/One,
5955 /*chunk=*/ChunkSize});
5956 };
5957 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5958 if (DistScheduleSchedType != OMPScheduleType::None &&
5959 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5960 SchedType != OMPScheduleType::OrderedDistribute) {
5961    // We want to emit a second init call for the dist_schedule clause on the
5962    // Distribute construct. However, this should only be done if a worksharing
5963    // loop is nested within a Distribute construct.
5964 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5965 }
5966
5967 // Load values written by the "init" function.
5968 Value *FirstChunkStart =
5969 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5970 Value *FirstChunkStop =
5971 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5972 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5973 Value *ChunkRange =
5974 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5975 Value *NextChunkStride =
5976 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5977
5978 // Create outer "dispatch" loop for enumerating the chunks.
5979 BasicBlock *DispatchEnter = splitBB(Builder, true);
5980 Value *DispatchCounter;
5981
5982 // It is safe to assume this didn't return an error because the callback
5983 // passed into createCanonicalLoop is the only possible error source, and it
5984 // always returns success.
5985 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5986 {Builder.saveIP(), DL},
5987 [&](InsertPointTy BodyIP, Value *Counter) {
5988 DispatchCounter = Counter;
5989 return Error::success();
5990 },
5991 FirstChunkStart, CastedTripCount, NextChunkStride,
5992 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5993 "dispatch"));
5994
5995 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5996 // not have to preserve the canonical invariant.
5997 BasicBlock *DispatchBody = DispatchCLI->getBody();
5998 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5999 BasicBlock *DispatchExit = DispatchCLI->getExit();
6000 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
6001 DispatchCLI->invalidate();
6002
6003 // Rewire the original loop to become the chunk loop inside the dispatch loop.
6004 redirectTo(DispatchAfter, CLI->getAfter(), DL);
6005 redirectTo(CLI->getExit(), DispatchLatch, DL);
6006 redirectTo(DispatchBody, DispatchEnter, DL);
6007
6008 // Prepare the prolog of the chunk loop.
6009 Builder.restoreIP(CLI->getPreheaderIP());
6010 Builder.SetCurrentDebugLocation(DL);
6011
6012 // Compute the number of iterations of the chunk loop.
6013 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
6014 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
6015 Value *IsLastChunk =
6016 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
6017 Value *CountUntilOrigTripCount =
6018 Builder.CreateSub(CastedTripCount, DispatchCounter);
6019 Value *ChunkTripCount = Builder.CreateSelect(
6020 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
6021 Value *BackcastedChunkTC =
6022 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
6023 CLI->setTripCount(BackcastedChunkTC);
6024
6025 // Update all uses of the induction variable except the one in the condition
6026 // block that compares it with the actual upper bound, and the increment in
6027 // the latch block.
6028 Value *BackcastedDispatchCounter =
6029 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
6030 CLI->mapIndVar([&](Instruction *) -> Value * {
6031 Builder.restoreIP(CLI->getBodyIP());
6032 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
6033 });
6034
6035 // In the "exit" block, call the "fini" function.
6036 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
6037 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
6038
6039 // Add the barrier if requested.
6040 if (NeedsBarrier) {
6041 InsertPointOrErrorTy AfterIP =
6042 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
6043 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
6044 if (!AfterIP)
6045 return AfterIP.takeError();
6046 }
6047
6048#ifndef NDEBUG
6049 // Even though we currently do not support applying additional methods to it,
6050 // the chunk loop should remain a canonical loop.
6051 CLI->assertOK();
6052#endif
6053
6054 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
6055}
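// Illustrative sketch (assumption, not part of the original source): ignoring
// the dist_schedule variant, the chunked schedule built above corresponds
// roughly to
//
//   __kmpc_for_static_init_*(loc, tid, schedtype, &last, &lb, &ub, &stride, 1, chunk);
//   chunk_range = ub - lb + 1;
//   for (dispatch = lb; dispatch < tripcount; dispatch += stride) {   // dispatch loop
//     chunk_tc = min(chunk_range, tripcount - dispatch);
//     for (iv = 0; iv < chunk_tc; ++iv)                               // original loop, rewired
//       body(dispatch + iv);
//   }
//   __kmpc_for_static_fini(loc, tid);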
6056
6057// Returns an LLVM function to call for executing an OpenMP static worksharing
6058// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
6059// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
6060static FunctionCallee
6061getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
6062                            WorksharingLoopType LoopType) {
6063 unsigned Bitwidth = Ty->getIntegerBitWidth();
6064 Module &M = OMPBuilder->M;
6065 switch (LoopType) {
6066 case WorksharingLoopType::ForStaticLoop:
6067 if (Bitwidth == 32)
6068 return OMPBuilder->getOrCreateRuntimeFunction(
6069 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
6070 if (Bitwidth == 64)
6071 return OMPBuilder->getOrCreateRuntimeFunction(
6072 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
6073 break;
6074 case WorksharingLoopType::DistributeStaticLoop:
6075 if (Bitwidth == 32)
6076 return OMPBuilder->getOrCreateRuntimeFunction(
6077 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
6078 if (Bitwidth == 64)
6079 return OMPBuilder->getOrCreateRuntimeFunction(
6080 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
6081 break;
6082 case WorksharingLoopType::DistributeForStaticLoop:
6083 if (Bitwidth == 32)
6084 return OMPBuilder->getOrCreateRuntimeFunction(
6085 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
6086 if (Bitwidth == 64)
6087 return OMPBuilder->getOrCreateRuntimeFunction(
6088 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
6089 break;
6090 }
6091 if (Bitwidth != 32 && Bitwidth != 64) {
6092 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
6093 }
6094 llvm_unreachable("Unknown type of OpenMP worksharing loop");
6095}
6096
6097// Inserts a call to proper OpenMP Device RTL function which handles
6098// loop worksharing.
6099static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
6100                                          WorksharingLoopType LoopType,
6101 BasicBlock *InsertBlock, Value *Ident,
6102 Value *LoopBodyArg, Value *TripCount,
6103 Function &LoopBodyFn, bool NoLoop) {
6104 Type *TripCountTy = TripCount->getType();
6105 Module &M = OMPBuilder->M;
6106 IRBuilder<> &Builder = OMPBuilder->Builder;
6107 FunctionCallee RTLFn =
6108 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
6109 SmallVector<Value *, 8> RealArgs;
6110 RealArgs.push_back(Ident);
6111 RealArgs.push_back(&LoopBodyFn);
6112 RealArgs.push_back(LoopBodyArg);
6113 RealArgs.push_back(TripCount);
6114 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
6115 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6116 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6117 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6118 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6119 return;
6120 }
6121 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
6122 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
6123 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6124 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
6125
6126 RealArgs.push_back(
6127 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
6128 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6129 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
6130 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6131 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
6132 } else {
6133 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6134 }
6135
6136 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6137}
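// Illustrative sketch (assumption, value names invented for the example): for
// a 32-bit trip count and a combined distribute+for construct, the call
// emitted above is expected to look roughly like
//
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_distribute_for_static_loop_4u(
//       ptr %ident, ptr @outlined.body, ptr %body.args, i32 %tripcount,
//       i32 %nt.cast, i32 0, i32 0, i8 %noloop)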
6138
6139static void workshareLoopTargetCallback(
6140    OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
6141 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
6142 WorksharingLoopType LoopType, bool NoLoop) {
6143 IRBuilder<> &Builder = OMPIRBuilder->Builder;
6144 BasicBlock *Preheader = CLI->getPreheader();
6145 Value *TripCount = CLI->getTripCount();
6146
6147  // After loop body outlining, the loop body contains only the setup of the
6148  // loop body argument structure and the call to the outlined loop body
6149  // function. First, move the setup of the loop body arguments into the loop
6150  // preheader.
6151 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
6152 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
6153
6154  // The next step is to remove the whole loop; we do not need it anymore.
6155  // That's why we make an unconditional branch from the loop preheader to the
6156  // loop exit block.
6157 Builder.restoreIP({Preheader, Preheader->end()});
6158 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
6159 Preheader->getTerminator()->eraseFromParent();
6160 Builder.CreateBr(CLI->getExit());
6161
6162 // Delete dead loop blocks
6163 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
6164 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
6165 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
6166 CleanUpInfo.EntryBB = CLI->getHeader();
6167 CleanUpInfo.ExitBB = CLI->getExit();
6168 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
6169 DeleteDeadBlocks(BlocksToBeRemoved);
6170
6171 // Find the instruction which corresponds to loop body argument structure
6172 // and remove the call to loop body function instruction.
6173 Value *LoopBodyArg;
6174 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
6175 assert(OutlinedFnUser &&
6176 "Expected unique undroppable user of outlined function");
6177 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
6178 assert(OutlinedFnCallInstruction && "Expected outlined function call");
6179 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
6180 "Expected outlined function call to be located in loop preheader");
6181 // Check in case no argument structure has been passed.
6182 if (OutlinedFnCallInstruction->arg_size() > 1)
6183 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
6184 else
6185 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
6186 OutlinedFnCallInstruction->eraseFromParent();
6187
6188 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
6189 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
6190
6191 for (auto &ToBeDeletedItem : ToBeDeleted)
6192 ToBeDeletedItem->eraseFromParent();
6193 CLI->invalidate();
6194}
6195
6196OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
6197 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
6198 WorksharingLoopType LoopType, bool NoLoop) {
6199 uint32_t SrcLocStrSize;
6200 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6202 switch (LoopType) {
6203 case WorksharingLoopType::ForStaticLoop:
6204 Flag = OMP_IDENT_FLAG_WORK_LOOP;
6205 break;
6206 case WorksharingLoopType::DistributeStaticLoop:
6207 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
6208 break;
6209 case WorksharingLoopType::DistributeForStaticLoop:
6210 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
6211 break;
6212 }
6213 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
6214
6215 auto OI = std::make_unique<OutlineInfo>();
6216 OI->OuterAllocBB = CLI->getPreheader();
6217 Function *OuterFn = CLI->getPreheader()->getParent();
6218
6219 // Instructions which need to be deleted at the end of code generation
6220 SmallVector<Instruction *, 4> ToBeDeleted;
6221
6222 OI->OuterAllocBB = AllocaIP.getBlock();
6223
6224 // Mark the body loop as region which needs to be extracted
6225 OI->EntryBB = CLI->getBody();
6226 OI->ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
6227 "omp.prelatch");
6228
6229 // Prepare loop body for extraction
6230 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
6231
6232 // Insert new loop counter variable which will be used only in loop
6233 // body.
6234 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
6235 Instruction *NewLoopCntLoad =
6236 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
6237  // The new loop counter instructions are redundant in the loop preheader once
6238  // code generation for the workshare loop is finished. That's why we mark them
6239  // as ready for deletion.
6240 ToBeDeleted.push_back(NewLoopCntLoad);
6241 ToBeDeleted.push_back(NewLoopCnt);
6242
6243 // Analyse loop body region. Find all input variables which are used inside
6244 // loop body region.
6245 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
6246  SmallVector<BasicBlock *, 32> Blocks;
6247  OI->collectBlocks(ParallelRegionBlockSet, Blocks);
6248
6249 CodeExtractorAnalysisCache CEAC(*OuterFn);
6250 CodeExtractor Extractor(Blocks,
6251 /* DominatorTree */ nullptr,
6252 /* AggregateArgs */ true,
6253 /* BlockFrequencyInfo */ nullptr,
6254 /* BranchProbabilityInfo */ nullptr,
6255 /* AssumptionCache */ nullptr,
6256 /* AllowVarArgs */ true,
6257 /* AllowAlloca */ true,
6258 /* AllocationBlock */ CLI->getPreheader(),
6259 /* DeallocationBlocks */ {},
6260 /* Suffix */ ".omp_wsloop",
6261 /* AggrArgsIn0AddrSpace */ true);
6262
6263 BasicBlock *CommonExit = nullptr;
6264 SetVector<Value *> SinkingCands, HoistingCands;
6265
6266 // Find allocas outside the loop body region which are used inside loop
6267 // body
6268 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
6269
6270  // We need to model the loop body region as the function f(cnt, loop_arg).
6271  // That's why we replace the loop induction variable with the new counter,
6272  // which will be one of the loop body function's arguments.
6273  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
6274                            CLI->getIndVar()->user_end());
6275 for (auto Use : Users) {
6276 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
6277 if (ParallelRegionBlockSet.count(Inst->getParent())) {
6278 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
6279 }
6280 }
6281 }
6282  // Make sure that the loop counter variable is not merged into the loop body
6283  // function argument structure and is passed as a separate variable.
6284 OI->ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
6285
6286 // PostOutline CB is invoked when loop body function is outlined and
6287 // loop body is replaced by call to outlined function. We need to add
6288 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
6289 // function will handle loop control logic.
6290 //
6291 OI->PostOutlineCB = [=, ToBeDeletedVec =
6292 std::move(ToBeDeleted)](Function &OutlinedFn) {
6293 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6294 LoopType, NoLoop);
6295 };
6296 addOutlineInfo(std::move(OI));
6297 return CLI->getAfterIP();
6298}
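// Illustrative sketch (assumption): on the device path the canonical loop
//
//   for (iv = 0; iv < tc; ++iv) body(iv);
//
// is conceptually rewritten into an outlined body function plus a single
// runtime call in the former preheader,
//
//   void @body.outlined(IV cnt, ptr %args) { body(cnt); }   // via CodeExtractor
//   call @__kmpc_{for,distribute,distribute_for}_static_loop_*(
//       ident, @body.outlined, %args, tc, ...)
//
// with the device runtime driving the iteration space; the original loop
// blocks are deleted in workshareLoopTargetCallback above.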
6299
6300OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
6301    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
6302    bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6303 bool HasSimdModifier, bool HasMonotonicModifier,
6304 bool HasNonmonotonicModifier, bool HasOrderedClause,
6305 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6306 Value *DistScheduleChunkSize) {
6307 if (Config.isTargetDevice())
6308 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6309 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6310 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6311 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6312
6313 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6314 OMPScheduleType::ModifierOrdered;
6315 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6316 if (HasDistSchedule) {
6317 DistScheduleSchedType = DistScheduleChunkSize
6318 ? OMPScheduleType::OrderedDistributeChunked
6319 : OMPScheduleType::OrderedDistribute;
6320 }
6321 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6322 case OMPScheduleType::BaseStatic:
6323 case OMPScheduleType::BaseDistribute:
6324 assert((!ChunkSize || !DistScheduleChunkSize) &&
6325 "No chunk size with static-chunked schedule");
6326 if (IsOrdered && !HasDistSchedule)
6327 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6328 NeedsBarrier, ChunkSize);
6329 // FIXME: Monotonicity ignored?
6330 if (DistScheduleChunkSize)
6331 return applyStaticChunkedWorkshareLoop(
6332 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6333 DistScheduleChunkSize, DistScheduleSchedType);
6334 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6335 HasDistSchedule);
6336
6337 case OMPScheduleType::BaseStaticChunked:
6338 case OMPScheduleType::BaseDistributeChunked:
6339 if (IsOrdered && !HasDistSchedule)
6340 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6341 NeedsBarrier, ChunkSize);
6342 // FIXME: Monotonicity ignored?
6343 return applyStaticChunkedWorkshareLoop(
6344 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6345 DistScheduleChunkSize, DistScheduleSchedType);
6346
6347 case OMPScheduleType::BaseRuntime:
6348 case OMPScheduleType::BaseAuto:
6349 case OMPScheduleType::BaseGreedy:
6350 case OMPScheduleType::BaseBalanced:
6351 case OMPScheduleType::BaseSteal:
6352 case OMPScheduleType::BaseRuntimeSimd:
6353 assert(!ChunkSize &&
6354 "schedule type does not support user-defined chunk sizes");
6355 [[fallthrough]];
6356 case OMPScheduleType::BaseGuidedSimd:
6357 case OMPScheduleType::BaseDynamicChunked:
6358 case OMPScheduleType::BaseGuidedChunked:
6359 case OMPScheduleType::BaseGuidedIterativeChunked:
6360 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6361 case OMPScheduleType::BaseStaticBalancedChunked:
6362 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6363 NeedsBarrier, ChunkSize);
6364
6365 default:
6366 llvm_unreachable("Unknown/unimplemented schedule kind");
6367 }
6368}
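// Minimal usage sketch (assumption, illustration only): a frontend lowering
// `#pragma omp for schedule(static, 4)` over an existing canonical loop `CLI`
// might invoke this roughly as
//
//   Value *Chunk = ConstantInt::get(CLI->getIndVarType(), 4);
//   omp::ScheduleKind Sched = /* static, from the schedule clause */;
//   InsertPointOrErrorTy AfterIP = OMPBuilder.applyWorkshareLoop(
//       DL, CLI, AllocaIP, /*NeedsBarrier=*/true, Sched, Chunk,
//       /*HasSimdModifier=*/false, /*HasMonotonicModifier=*/false,
//       /*HasNonmonotonicModifier=*/false, /*HasOrderedClause=*/false,
//       WorksharingLoopType::ForStaticLoop, /*NoLoop=*/false,
//       /*HasDistSchedule=*/false, /*DistScheduleChunkSize=*/nullptr);
//   if (!AfterIP)
//     return AfterIP.takeError();
//   Builder.restoreIP(*AfterIP);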
6369
6370/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6371/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6372/// the runtime. Always interpret integers as unsigned similarly to
6373/// CanonicalLoopInfo.
6374static FunctionCallee
6375getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6376  unsigned Bitwidth = Ty->getIntegerBitWidth();
6377 if (Bitwidth == 32)
6378 return OMPBuilder.getOrCreateRuntimeFunction(
6379 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6380 if (Bitwidth == 64)
6381 return OMPBuilder.getOrCreateRuntimeFunction(
6382 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6383 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6384}
6385
6386/// Returns an LLVM function to call for updating the next loop using OpenMP
6387/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6388/// the runtime. Always interpret integers as unsigned similarly to
6389/// CanonicalLoopInfo.
6390static FunctionCallee
6391getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6392  unsigned Bitwidth = Ty->getIntegerBitWidth();
6393 if (Bitwidth == 32)
6394 return OMPBuilder.getOrCreateRuntimeFunction(
6395 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6396 if (Bitwidth == 64)
6397 return OMPBuilder.getOrCreateRuntimeFunction(
6398 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6399 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6400}
6401
6402/// Returns an LLVM function to call for finalizing the dynamic loop,
6403/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6404/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6405static FunctionCallee
6406getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6407  unsigned Bitwidth = Ty->getIntegerBitWidth();
6408 if (Bitwidth == 32)
6409 return OMPBuilder.getOrCreateRuntimeFunction(
6410 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6411 if (Bitwidth == 64)
6412 return OMPBuilder.getOrCreateRuntimeFunction(
6413 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6414 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6415}
6416
6417OpenMPIRBuilder::InsertPointOrErrorTy
6418OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6419 InsertPointTy AllocaIP,
6420 OMPScheduleType SchedType,
6421 bool NeedsBarrier, Value *Chunk) {
6422 assert(CLI->isValid() && "Requires a valid canonical loop");
6423 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6424 "Require dedicated allocate IP");
6425  assert(isValidWorkshareLoopScheduleType(SchedType) &&
6426         "Require valid schedule type");
6427
6428 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6429 OMPScheduleType::ModifierOrdered;
6430
6431 // Set up the source location value for OpenMP runtime.
6432 Builder.SetCurrentDebugLocation(DL);
6433
6434 uint32_t SrcLocStrSize;
6435 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6436 Value *SrcLoc =
6437 getOrCreateIdent(SrcLocStr, SrcLocStrSize, OMP_IDENT_FLAG_WORK_LOOP);
6438
6439 // Declare useful OpenMP runtime functions.
6440 Value *IV = CLI->getIndVar();
6441 Type *IVTy = IV->getType();
6442 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6443 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6444
6445 // Allocate space for computed loop bounds as expected by the "init" function.
6446 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6447 Type *I32Type = Type::getInt32Ty(M.getContext());
6448 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6449 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6450 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6451 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6452 CLI->setLastIter(PLastIter);
6453
6454 // At the end of the preheader, prepare for calling the "init" function by
6455 // storing the current loop bounds into the allocated space. A canonical loop
6456 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6457 // and produces an inclusive upper bound.
6458 BasicBlock *PreHeader = CLI->getPreheader();
6459 Builder.SetInsertPoint(PreHeader->getTerminator());
6460 Constant *One = ConstantInt::get(IVTy, 1);
6461 Builder.CreateStore(One, PLowerBound);
6462 Value *UpperBound = CLI->getTripCount();
6463 Builder.CreateStore(UpperBound, PUpperBound);
6464 Builder.CreateStore(One, PStride);
6465
6466 BasicBlock *Header = CLI->getHeader();
6467 BasicBlock *Exit = CLI->getExit();
6468 BasicBlock *Cond = CLI->getCond();
6469 BasicBlock *Latch = CLI->getLatch();
6470 InsertPointTy AfterIP = CLI->getAfterIP();
6471
6472 // The CLI will be "broken" in the code below, as the loop is no longer
6473 // a valid canonical loop.
6474
6475 if (!Chunk)
6476 Chunk = One;
6477
6478 Value *ThreadNum =
6479 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
6480
6481 Constant *SchedulingType =
6482 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6483
6484 // Call the "init" function.
6485 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6486 /* LowerBound */ One, UpperBound,
6487 /* step */ One, Chunk});
6488
6489 // An outer loop around the existing one.
6490 BasicBlock *OuterCond = BasicBlock::Create(
6491 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6492 PreHeader->getParent());
6493 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6494 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6495  Value *Res = createRuntimeFunctionCall(
6496      DynamicNext,
6497 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6498 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6499 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6500 Value *LowerBound =
6501 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6502 Builder.CreateCondBr(MoreWork, Header, Exit);
6503
6504 // Change PHI-node in loop header to use outer cond rather than preheader,
6505 // and set IV to the LowerBound.
6506 Instruction *Phi = &Header->front();
6507 auto *PI = cast<PHINode>(Phi);
6508 PI->setIncomingBlock(0, OuterCond);
6509 PI->setIncomingValue(0, LowerBound);
6510
6511 // Then set the pre-header to jump to the OuterCond
6512 Instruction *Term = PreHeader->getTerminator();
6513  auto *Br = cast<BranchInst>(Term);
6514  Br->setSuccessor(0, OuterCond);
6515
6516 // Modify the inner condition:
6517 // * Use the UpperBound returned from the DynamicNext call.
6518 // * jump to the loop outer loop when done with one of the inner loops.
6519 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6520 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6521 Instruction *Comp = &*Builder.GetInsertPoint();
6522 auto *CI = cast<CmpInst>(Comp);
6523 CI->setOperand(1, UpperBound);
6524 // Redirect the inner exit to branch to outer condition.
6525 Instruction *Branch = &Cond->back();
6526  auto *BI = cast<BranchInst>(Branch);
6527 assert(BI->getSuccessor(1) == Exit);
6528 BI->setSuccessor(1, OuterCond);
6529
6530 // Call the "fini" function if "ordered" is present in wsloop directive.
6531 if (Ordered) {
6532 Builder.SetInsertPoint(&Latch->back());
6533 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6534 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6535 }
6536
6537 // Add the barrier if requested.
6538 if (NeedsBarrier) {
6539 Builder.SetInsertPoint(&Exit->back());
6540 InsertPointOrErrorTy BarrierIP =
6541        createBarrier(LocationDescription(Builder.saveIP(), DL),
6542                      omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6543 /* CheckCancelFlag */ false);
6544 if (!BarrierIP)
6545 return BarrierIP.takeError();
6546 }
6547
6548 CLI->invalidate();
6549 return AfterIP;
6550}
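// Illustrative sketch (assumption): the control flow produced above corresponds
// roughly to
//
//   __kmpc_dispatch_init_*(loc, tid, schedtype, /*lb=*/1, /*ub=*/tc, /*st=*/1, chunk);
//   while (__kmpc_dispatch_next_*(loc, tid, &last, &lb, &ub, &stride)) {
//     for (iv = lb - 1; iv < ub; ++iv)     // original loop, bounds patched above
//       body(iv);
//     // with the ordered modifier: __kmpc_dispatch_fini_*(loc, tid) in the latch
//   }
//   // optional __kmpc_barrier if NeedsBarrier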
6551
6552/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6553/// after this \p OldTarget will be orphaned.
6554static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
6555                                      BasicBlock *NewTarget, DebugLoc DL) {
6556 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6557 redirectTo(Pred, NewTarget, DL);
6558}
6559
6560/// Determine which blocks in \p BBs are reachable from outside and remove the
6561/// ones that are not reachable from the function.
6562static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
6563  SmallPtrSet<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
6564  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6565 for (Use &U : BB->uses()) {
6566 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6567 if (!UseInst)
6568 continue;
6569 if (BBsToErase.count(UseInst->getParent()))
6570 continue;
6571 return true;
6572 }
6573 return false;
6574 };
6575
6576 while (BBsToErase.remove_if(HasRemainingUses)) {
6577 // Try again if anything was removed.
6578 }
6579
6580 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6581 DeleteDeadBlocks(BBVec);
6582}
6583
6584CanonicalLoopInfo *
6585OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6586                               InsertPointTy ComputeIP) {
6587 assert(Loops.size() >= 1 && "At least one loop required");
6588 size_t NumLoops = Loops.size();
6589
6590 // Nothing to do if there is already just one loop.
6591 if (NumLoops == 1)
6592 return Loops.front();
6593
6594 CanonicalLoopInfo *Outermost = Loops.front();
6595 CanonicalLoopInfo *Innermost = Loops.back();
6596 BasicBlock *OrigPreheader = Outermost->getPreheader();
6597 BasicBlock *OrigAfter = Outermost->getAfter();
6598 Function *F = OrigPreheader->getParent();
6599
6600 // Loop control blocks that may become orphaned later.
6601 SmallVector<BasicBlock *, 12> OldControlBBs;
6602 OldControlBBs.reserve(6 * Loops.size());
6603  for (CanonicalLoopInfo *Loop : Loops)
6604    Loop->collectControlBlocks(OldControlBBs);
6605
6606 // Setup the IRBuilder for inserting the trip count computation.
6607 Builder.SetCurrentDebugLocation(DL);
6608 if (ComputeIP.isSet())
6609 Builder.restoreIP(ComputeIP);
6610 else
6611 Builder.restoreIP(Outermost->getPreheaderIP());
6612
6613  // Derive the collapsed loop's trip count.
6614 // TODO: Find common/largest indvar type.
6615 Value *CollapsedTripCount = nullptr;
6616 for (CanonicalLoopInfo *L : Loops) {
6617 assert(L->isValid() &&
6618 "All loops to collapse must be valid canonical loops");
6619 Value *OrigTripCount = L->getTripCount();
6620 if (!CollapsedTripCount) {
6621 CollapsedTripCount = OrigTripCount;
6622 continue;
6623 }
6624
6625 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6626 CollapsedTripCount =
6627 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6628 }
6629
6630 // Create the collapsed loop control flow.
6631 CanonicalLoopInfo *Result =
6632 createLoopSkeleton(DL, CollapsedTripCount, F,
6633 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6634
6635 // Build the collapsed loop body code.
6636 // Start with deriving the input loop induction variables from the collapsed
6637  // one, using a divmod scheme. To preserve the original loops' order, the
6638  // innermost loop uses the least significant bits.
6639 Builder.restoreIP(Result->getBodyIP());
6640
6641 Value *Leftover = Result->getIndVar();
6642 SmallVector<Value *> NewIndVars;
6643 NewIndVars.resize(NumLoops);
6644 for (int i = NumLoops - 1; i >= 1; --i) {
6645 Value *OrigTripCount = Loops[i]->getTripCount();
6646
6647 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6648 NewIndVars[i] = NewIndVar;
6649
6650 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6651 }
6652 // Outermost loop gets all the remaining bits.
6653 NewIndVars[0] = Leftover;
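  // Worked example (illustration only): collapsing three loops with trip
  // counts TC0, TC1 and TC2 yields a collapsed trip count TC0*TC1*TC2, and for
  // a collapsed induction variable C the original indvars are recovered as
  //   iv2 = C % TC2;   C' = C / TC2;
  //   iv1 = C' % TC1;  iv0 = C' / TC1;
  // so the innermost loop varies fastest, matching the original iteration order.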
6654
6655 // Construct the loop body control flow.
6656  // We progressively construct the branch structure following the direction of
6657  // control flow: the leading in-between code, the loop nest body, the trailing
6658  // in-between code, and finally rejoining the collapsed loop's latch.
6659 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6660 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6661 // its predecessors as sources.
6662 BasicBlock *ContinueBlock = Result->getBody();
6663 BasicBlock *ContinuePred = nullptr;
6664 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6665 BasicBlock *NextSrc) {
6666 if (ContinueBlock)
6667 redirectTo(ContinueBlock, Dest, DL);
6668 else
6669 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6670
6671 ContinueBlock = nullptr;
6672 ContinuePred = NextSrc;
6673 };
6674
6675 // The code before the nested loop of each level.
6676  // Because we are sinking it into the nest, it will be executed more often
6677  // than in the original loop. More sophisticated schemes could keep track of what
6678 // the in-between code is and instantiate it only once per thread.
6679 for (size_t i = 0; i < NumLoops - 1; ++i)
6680 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6681
6682 // Connect the loop nest body.
6683 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6684
6685 // The code after the nested loop at each level.
6686 for (size_t i = NumLoops - 1; i > 0; --i)
6687 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6688
6689 // Connect the finished loop to the collapsed loop latch.
6690 ContinueWith(Result->getLatch(), nullptr);
6691
6692 // Replace the input loops with the new collapsed loop.
6693 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6694 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6695
6696 // Replace the input loop indvars with the derived ones.
6697 for (size_t i = 0; i < NumLoops; ++i)
6698 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6699
6700 // Remove unused parts of the input loops.
6701 removeUnusedBlocksFromParent(OldControlBBs);
6702
6703 for (CanonicalLoopInfo *L : Loops)
6704 L->invalidate();
6705
6706#ifndef NDEBUG
6707 Result->assertOK();
6708#endif
6709 return Result;
6710}
6711
6712std::vector<CanonicalLoopInfo *>
6713OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6714                           ArrayRef<Value *> TileSizes) {
6715 assert(TileSizes.size() == Loops.size() &&
6716 "Must pass as many tile sizes as there are loops");
6717 int NumLoops = Loops.size();
6718 assert(NumLoops >= 1 && "At least one loop to tile required");
6719
6720 CanonicalLoopInfo *OutermostLoop = Loops.front();
6721 CanonicalLoopInfo *InnermostLoop = Loops.back();
6722 Function *F = OutermostLoop->getBody()->getParent();
6723 BasicBlock *InnerEnter = InnermostLoop->getBody();
6724 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6725
6726 // Loop control blocks that may become orphaned later.
6727 SmallVector<BasicBlock *, 12> OldControlBBs;
6728 OldControlBBs.reserve(6 * Loops.size());
6729  for (CanonicalLoopInfo *Loop : Loops)
6730    Loop->collectControlBlocks(OldControlBBs);
6731
6732 // Collect original trip counts and induction variable to be accessible by
6733 // index. Also, the structure of the original loops is not preserved during
6734 // the construction of the tiled loops, so do it before we scavenge the BBs of
6735 // any original CanonicalLoopInfo.
6736 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6737 for (CanonicalLoopInfo *L : Loops) {
6738 assert(L->isValid() && "All input loops must be valid canonical loops");
6739 OrigTripCounts.push_back(L->getTripCount());
6740 OrigIndVars.push_back(L->getIndVar());
6741 }
6742
6743 // Collect the code between loop headers. These may contain SSA definitions
6744  // that are used in the loop nest body. To be usable within the innermost
6745 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6746 // these instructions may be executed more often than before the tiling.
6747 // TODO: It would be sufficient to only sink them into body of the
6748 // corresponding tile loop.
6749  SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
6750  for (int i = 0; i < NumLoops - 1; ++i) {
6751 CanonicalLoopInfo *Surrounding = Loops[i];
6752 CanonicalLoopInfo *Nested = Loops[i + 1];
6753
6754 BasicBlock *EnterBB = Surrounding->getBody();
6755 BasicBlock *ExitBB = Nested->getHeader();
6756 InbetweenCode.emplace_back(EnterBB, ExitBB);
6757 }
6758
6759 // Compute the trip counts of the floor loops.
6760 Builder.SetCurrentDebugLocation(DL);
6761 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6762 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6763 for (int i = 0; i < NumLoops; ++i) {
6764 Value *TileSize = TileSizes[i];
6765 Value *OrigTripCount = OrigTripCounts[i];
6766 Type *IVType = OrigTripCount->getType();
6767
6768 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6769 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6770
6771 // 0 if tripcount divides the tilesize, 1 otherwise.
6772 // 1 means we need an additional iteration for a partial tile.
6773 //
6774 // Unfortunately we cannot just use the roundup-formula
6775 // (tripcount + tilesize - 1)/tilesize
6776    // because the summation might overflow. We do not want to introduce undefined
6777 // behavior when the untiled loop nest did not.
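    // Worked example (illustration only): for an original trip count of 10 and
    // a tile size of 4, FloorCompleteTripCount = 10/4 = 2, FloorTripRem = 10%4
    // = 2, the overflow bit is 1, and the floor loop therefore runs 2 + 1 = 3
    // times: two full tiles of 4 iterations and one partial tile of 2.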
6778 Value *FloorTripOverflow =
6779 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6780
6781 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6782 Value *FloorTripCount =
6783 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6784 "omp_floor" + Twine(i) + ".tripcount", true);
6785
6786 // Remember some values for later use.
6787 FloorCompleteCount.push_back(FloorCompleteTripCount);
6788 FloorCount.push_back(FloorTripCount);
6789 FloorRems.push_back(FloorTripRem);
6790 }
6791
6792 // Generate the new loop nest, from the outermost to the innermost.
6793 std::vector<CanonicalLoopInfo *> Result;
6794 Result.reserve(NumLoops * 2);
6795
6796 // The basic block of the surrounding loop that enters the nest generated
6797 // loop.
6798 BasicBlock *Enter = OutermostLoop->getPreheader();
6799
6800 // The basic block of the surrounding loop where the inner code should
6801 // continue.
6802 BasicBlock *Continue = OutermostLoop->getAfter();
6803
6804 // Where the next loop basic block should be inserted.
6805 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6806
6807 auto EmbeddNewLoop =
6808 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6809 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6810 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6811 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6812 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6813 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6814
6815 // Setup the position where the next embedded loop connects to this loop.
6816 Enter = EmbeddedLoop->getBody();
6817 Continue = EmbeddedLoop->getLatch();
6818 OutroInsertBefore = EmbeddedLoop->getLatch();
6819 return EmbeddedLoop;
6820 };
6821
6822 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6823 const Twine &NameBase) {
6824 for (auto P : enumerate(TripCounts)) {
6825 CanonicalLoopInfo *EmbeddedLoop =
6826 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6827 Result.push_back(EmbeddedLoop);
6828 }
6829 };
6830
6831 EmbeddNewLoops(FloorCount, "floor");
6832
6833 // Within the innermost floor loop, emit the code that computes the tile
6834 // sizes.
6835 Builder.SetInsertPoint(Enter->getTerminator());
6836 SmallVector<Value *, 4> TileCounts;
6837 for (int i = 0; i < NumLoops; ++i) {
6838 CanonicalLoopInfo *FloorLoop = Result[i];
6839 Value *TileSize = TileSizes[i];
6840
6841 Value *FloorIsEpilogue =
6842 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6843 Value *TileTripCount =
6844 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6845
6846 TileCounts.push_back(TileTripCount);
6847 }
6848
6849 // Create the tile loops.
6850 EmbeddNewLoops(TileCounts, "tile");
6851
6852 // Insert the inbetween code into the body.
6853 BasicBlock *BodyEnter = Enter;
6854 BasicBlock *BodyEntered = nullptr;
6855 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6856 BasicBlock *EnterBB = P.first;
6857 BasicBlock *ExitBB = P.second;
6858
6859 if (BodyEnter)
6860 redirectTo(BodyEnter, EnterBB, DL);
6861 else
6862 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6863
6864 BodyEnter = nullptr;
6865 BodyEntered = ExitBB;
6866 }
6867
6868 // Append the original loop nest body into the generated loop nest body.
6869 if (BodyEnter)
6870 redirectTo(BodyEnter, InnerEnter, DL);
6871 else
6872 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6874
6875 // Replace the original induction variable with an induction variable computed
6876 // from the tile and floor induction variables.
6877 Builder.restoreIP(Result.back()->getBodyIP());
6878 for (int i = 0; i < NumLoops; ++i) {
6879 CanonicalLoopInfo *FloorLoop = Result[i];
6880 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6881 Value *OrigIndVar = OrigIndVars[i];
6882 Value *Size = TileSizes[i];
6883
6884 Value *Scale =
6885 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6886 Value *Shift =
6887 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6888 OrigIndVar->replaceAllUsesWith(Shift);
6889 }
6890
6891 // Remove unused parts of the original loops.
6892 removeUnusedBlocksFromParent(OldControlBBs);
6893
6894 for (CanonicalLoopInfo *L : Loops)
6895 L->invalidate();
6896
6897#ifndef NDEBUG
6898 for (CanonicalLoopInfo *GenL : Result)
6899 GenL->assertOK();
6900#endif
6901 return Result;
6902}
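// Illustrative sketch (assumption): tiling a single canonical loop
//   for (i = 0; i < TC; ++i) body(i);
// with tile size TS conceptually produces
//   for (f = 0; f < ceil(TC / TS); ++f)                    // "floor" loop
//     for (t = 0; t < (f == TC/TS ? TC%TS : TS); ++t)      // "tile" loop
//       body(f * TS + t);
// and the returned vector holds the floor loops followed by the tile loops.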
6903
6904/// Attach metadata \p Properties to the basic block described by \p BB. If the
6905/// basic block already has metadata, the basic block properties are appended.
6906static void addBasicBlockMetadata(BasicBlock *BB,
6907                                  ArrayRef<Metadata *> Properties) {
6908 // Nothing to do if no property to attach.
6909 if (Properties.empty())
6910 return;
6911
6912 LLVMContext &Ctx = BB->getContext();
6913 SmallVector<Metadata *> NewProperties;
6914 NewProperties.push_back(nullptr);
6915
6916 // If the basic block already has metadata, prepend it to the new metadata.
6917 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6918 if (Existing)
6919 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6920
6921 append_range(NewProperties, Properties);
6922 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6923 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6924
6925 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6926}
6927
6928/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6929/// loop already has metadata, the loop properties are appended.
6930static void addLoopMetadata(CanonicalLoopInfo *Loop,
6931                            ArrayRef<Metadata *> Properties) {
6932 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6933
6934 // Attach metadata to the loop's latch
6935 BasicBlock *Latch = Loop->getLatch();
6936 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6937 addBasicBlockMetadata(Latch, Properties);
6938}
6939
6940/// Attach llvm.access.group metadata to the memref instructions of \p Block
6941static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
6942                                   LoopInfo &LI) {
6943 for (Instruction &I : *Block) {
6944 if (I.mayReadOrWriteMemory()) {
6945 // TODO: This instruction may already have access group from
6946 // other pragmas e.g. #pragma clang loop vectorize. Append
6947 // so that the existing metadata is not overwritten.
6948 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6949 }
6950 }
6951}
6952
6953CanonicalLoopInfo *
6954OpenMPIRBuilder::fuseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops) {
6955  CanonicalLoopInfo *firstLoop = Loops.front();
6956 CanonicalLoopInfo *lastLoop = Loops.back();
6957 Function *F = firstLoop->getPreheader()->getParent();
6958
6959 // Loop control blocks that will become orphaned later
6960 SmallVector<BasicBlock *> oldControlBBs;
6961  for (CanonicalLoopInfo *Loop : Loops)
6962    Loop->collectControlBlocks(oldControlBBs);
6963
6964 // Collect original trip counts
6965 SmallVector<Value *> origTripCounts;
6966 for (CanonicalLoopInfo *L : Loops) {
6967 assert(L->isValid() && "All input loops must be valid canonical loops");
6968 origTripCounts.push_back(L->getTripCount());
6969 }
6970
6971 Builder.SetCurrentDebugLocation(DL);
6972
6973 // Compute max trip count.
6974 // The fused loop will be from 0 to max(origTripCounts)
6975 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6976 F, firstLoop->getHeader());
6977 Builder.SetInsertPoint(TCBlock);
6978 Value *fusedTripCount = nullptr;
6979 for (CanonicalLoopInfo *L : Loops) {
6980 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6981 Value *origTripCount = L->getTripCount();
6982 if (!fusedTripCount) {
6983 fusedTripCount = origTripCount;
6984 continue;
6985 }
6986 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6987 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6988 ".omp.fuse.tc");
6989 }
6990
6991 // Generate new loop
6992 CanonicalLoopInfo *fused =
6993 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6994 lastLoop->getLatch(), "fused");
6995
6996 // Replace original loops with the fused loop
6997 // Preheader and After are not considered inside the CLI.
6998 // These are used to compute the individual TCs of the loops
6999 // so they have to be put before the resulting fused loop.
7000 // Moving them up for readability.
7001 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7002 Loops[i]->getPreheader()->moveBefore(TCBlock);
7003 Loops[i]->getAfter()->moveBefore(TCBlock);
7004 }
7005 lastLoop->getPreheader()->moveBefore(TCBlock);
7006
7007 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7008 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
7009 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
7010 }
7011 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
7012 redirectTo(TCBlock, fused->getPreheader(), DL);
7013 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
7014
7015 // Build the fused body
7016 // Create new Blocks with conditions that jump to the original loop bodies
7017  SmallVector<BasicBlock *> condBBs;
7018  SmallVector<Value *> condValues;
7019 for (size_t i = 0; i < Loops.size(); ++i) {
7020 BasicBlock *condBlock = BasicBlock::Create(
7021 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
7022 Builder.SetInsertPoint(condBlock);
7023 Value *condValue =
7024 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
7025 condBBs.push_back(condBlock);
7026 condValues.push_back(condValue);
7027 }
7028 // Join the condition blocks with the bodies of the original loops
7029 redirectTo(fused->getBody(), condBBs[0], DL);
7030 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7031 Builder.SetInsertPoint(condBBs[i]);
7032 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
7033 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
7034 // Replace the IV with the fused IV
7035 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7036 }
7037 // Last body jumps to the created end body block
7038 Builder.SetInsertPoint(condBBs.back());
7039 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
7040 fused->getLatch());
7041 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
7042 // Replace the IV with the fused IV
7043 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7044
7045 // The loop latch must have only one predecessor. Currently it is branched to
7046 // from both the last condition block and the last loop body
7047 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
7048 "omp.fused.pre_latch");
7049
7050 // Remove unused parts
7051 removeUnusedBlocksFromParent(oldControlBBs);
7052
7053 // Invalidate old CLIs
7054 for (CanonicalLoopInfo *L : Loops)
7055 L->invalidate();
7056
7057#ifndef NDEBUG
7058 fused->assertOK();
7059#endif
7060 return fused;
7061}
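// Illustrative sketch (assumption): fusing two canonical loops with trip counts
// TC0 and TC1 conceptually produces
//   for (iv = 0; iv < max(TC0, TC1); ++iv) {
//     if (iv < TC0) body0(iv);
//     if (iv < TC1) body1(iv);
//   }
// built from the guard blocks above. Note that the code here performs no
// dependence or legality checking; the caller must ensure fusion is safe.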
7062
7063void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
7064  LLVMContext &Ctx = Builder.getContext();
7065  addLoopMetadata(
7066      Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7067 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
7068}
7069
7070void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
7071  LLVMContext &Ctx = Builder.getContext();
7072  addLoopMetadata(
7073      Loop, {
7074 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7075 });
7076}
7077
7078void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
7079 Value *IfCond, ValueToValueMapTy &VMap,
7080 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
7081 const Twine &NamePrefix) {
7082 Function *F = CanonicalLoop->getFunction();
7083
7084 // We can't do
7085 // if (cond) {
7086 // simd_loop;
7087 // } else {
7088 // non_simd_loop;
7089 // }
7090  // because then the CanonicalLoopInfo would only point to one of the loops,
7091  // causing other constructs operating on the same loop to malfunction.
7092 // Instead generate
7093 // while (...) {
7094 // if (cond) {
7095 // simd_body;
7096 // } else {
7097 // not_simd_body;
7098 // }
7099 // }
7100 // At least for simple loops, LLVM seems able to hoist the if out of the loop
7101 // body at -O3
7102
7103 // Define where if branch should be inserted
7104 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
7105
7106 // Create additional blocks for the if statement
7107 BasicBlock *Cond = SplitBeforeIt->getParent();
7108 llvm::LLVMContext &C = Cond->getContext();
7109  BasicBlock *ThenBlock = BasicBlock::Create(
7110      C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
7111  BasicBlock *ElseBlock = BasicBlock::Create(
7112      C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
7113
7114 // Create if condition branch.
7115 Builder.SetInsertPoint(SplitBeforeIt);
7116 Instruction *BrInstr =
7117 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
7118 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
7119 // Then block contains branch to omp loop body which needs to be vectorized
7120 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
7121 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
7122
7123 Builder.SetInsertPoint(ElseBlock);
7124
7125 // Clone loop for the else branch
7126  SmallVector<BasicBlock *, 8> NewBlocks;
7127
7128 SmallVector<BasicBlock *, 8> ExistingBlocks;
7129 ExistingBlocks.reserve(L->getNumBlocks() + 1);
7130 ExistingBlocks.push_back(ThenBlock);
7131 ExistingBlocks.append(L->block_begin(), L->block_end());
7132 // Cond is the block that has the if clause condition
7133 // LoopCond is omp_loop.cond
7134 // LoopHeader is omp_loop.header
7135 BasicBlock *LoopCond = Cond->getUniquePredecessor();
7136 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
7137 assert(LoopCond && LoopHeader && "Invalid loop structure");
7138 for (BasicBlock *Block : ExistingBlocks) {
7139 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
7140 Block == LoopHeader || Block == LoopCond || Block == Cond) {
7141 continue;
7142 }
7143 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
7144
7145 // fix name not to be omp.if.then
7146 if (Block == ThenBlock)
7147 NewBB->setName(NamePrefix + ".if.else");
7148
7149 NewBB->moveBefore(CanonicalLoop->getExit());
7150 VMap[Block] = NewBB;
7151 NewBlocks.push_back(NewBB);
7152 }
7153 remapInstructionsInBlocks(NewBlocks, VMap);
7154 Builder.CreateBr(NewBlocks.front());
7155
7156 // The loop latch must have only one predecessor. Currently it is branched to
7157 // from both the 'then' and 'else' branches.
7158 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
7159 NamePrefix + ".pre_latch");
7160
7161 // Ensure that the then block is added to the loop so we add the attributes in
7162 // the next step
7163 L->addBasicBlockToLoop(ThenBlock, LI);
7164}
7165
7166unsigned
7167OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
7168                                           const StringMap<bool> &Features) {
7169 if (TargetTriple.isX86()) {
7170 if (Features.lookup("avx512f"))
7171 return 512;
7172 else if (Features.lookup("avx"))
7173 return 256;
7174 return 128;
7175 }
7176 if (TargetTriple.isPPC())
7177 return 128;
7178 if (TargetTriple.isWasm())
7179 return 128;
7180 return 0;
7181}
7182
7183void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
7184                                MapVector<Value *, Value *> AlignedVars,
7185 Value *IfCond, OrderKind Order,
7186 ConstantInt *Simdlen, ConstantInt *Safelen) {
7187 LLVMContext &Ctx = Builder.getContext();
7188
7189 Function *F = CanonicalLoop->getFunction();
7190
7191 // Blocks must have terminators.
7192 // FIXME: Don't run analyses on incomplete/invalid IR.
7193  SmallVector<Instruction *> UIs;
7194  for (BasicBlock &BB : *F)
7195 if (!BB.hasTerminator())
7196 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7197
7198 // TODO: We should not rely on pass manager. Currently we use pass manager
7199 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
7200 // object. We should have a method which returns all blocks between
7201 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
7202  FunctionAnalysisManager FAM;
7203  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7204 FAM.registerPass([]() { return LoopAnalysis(); });
7205 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7206
7207 LoopAnalysis LIA;
7208 LoopInfo &&LI = LIA.run(*F, FAM);
7209
7210 for (Instruction *I : UIs)
7211 I->eraseFromParent();
7212
7213 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
7214 if (AlignedVars.size()) {
7215 InsertPointTy IP = Builder.saveIP();
7216 for (auto &AlignedItem : AlignedVars) {
7217 Value *AlignedPtr = AlignedItem.first;
7218 Value *Alignment = AlignedItem.second;
7219 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
7220 Builder.SetInsertPoint(loadInst->getNextNode());
7221 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
7222 Alignment);
7223 }
7224 Builder.restoreIP(IP);
7225 }
7226
7227 if (IfCond) {
7228 ValueToValueMapTy VMap;
7229 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
7230 }
7231
7232  SmallSet<BasicBlock *, 8> Reachable;
7233
7234 // Get the basic blocks from the loop in which memref instructions
7235 // can be found.
7236 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
7237 // preferably without running any passes.
7238 for (BasicBlock *Block : L->getBlocks()) {
7239 if (Block == CanonicalLoop->getCond() ||
7240 Block == CanonicalLoop->getHeader())
7241 continue;
7242 Reachable.insert(Block);
7243 }
7244
7245 SmallVector<Metadata *> LoopMDList;
7246
7247 // In presence of finite 'safelen', it may be unsafe to mark all
7248 // the memory instructions parallel, because loop-carried
7249 // dependences of 'safelen' iterations are possible.
7250 // If clause order(concurrent) is specified then the memory instructions
7251 // are marked parallel even if 'safelen' is finite.
7252 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
7253 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
7254
7255 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
7256 // versions so we can't add the loop attributes in that case.
7257 if (IfCond) {
7258 // we can still add llvm.loop.parallel_access
7259 addLoopMetadata(CanonicalLoop, LoopMDList);
7260 return;
7261 }
7262
7263 // Use the above access group metadata to create loop level
7264 // metadata, which should be distinct for each loop.
7265 ConstantAsMetadata *BoolConst =
7266      ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
7267  LoopMDList.push_back(MDNode::get(
7268 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7269
7270 if (Simdlen || Safelen) {
7271 // If both simdlen and safelen clauses are specified, the value of the
7272 // simdlen parameter must be less than or equal to the value of the safelen
7273 // parameter. Therefore, use safelen only in the absence of simdlen.
7274 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7275 LoopMDList.push_back(
7276 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7277 ConstantAsMetadata::get(VectorizeWidth)}));
7278 }
7279
7280 addLoopMetadata(CanonicalLoop, LoopMDList);
7281}
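// Illustrative sketch (assumption, metadata ids invented): for
// `#pragma omp simd simdlen(8)` without an if clause, the loop latch ends up
// with metadata roughly like
//
//   br label %omp_loop.header, !llvm.loop !9
//   !9  = distinct !{!9, !10, !11, !12}
//   !10 = !{!"llvm.loop.parallel_accesses", !AG}
//   !11 = !{!"llvm.loop.vectorize.enable", i1 true}
//   !12 = !{!"llvm.loop.vectorize.width", i32 8}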
7282
7283/// Create the TargetMachine object to query the backend for optimization
7284/// preferences.
7285///
7286/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7287/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7288/// needed for the LLVM pass pipeline. We use some default options to avoid
7289/// having to pass too many settings from the frontend that probably do not
7290/// matter.
7291///
7292/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7293/// method. If we are going to use TargetMachine for more purposes, especially
7294/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7295 /// might become worth requiring front-ends to pass on their TargetMachine,
7296 /// or at least cache it between methods. Note that while front-ends such as Clang
7297/// have just a single main TargetMachine per translation unit, "target-cpu" and
7298/// "target-features" that determine the TargetMachine are per-function and can
7299 /// be overridden using __attribute__((target("OPTIONS"))).
7300static std::unique_ptr<TargetMachine>
7301 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
7302 Module *M = F->getParent();
7303
7304 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7305 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7306 const llvm::Triple &Triple = M->getTargetTriple();
7307
7308 std::string Error;
7309 const Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
7310 if (!TheTarget)
7311 return {};
7312
7313 llvm::TargetOptions Options;
7314 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7315 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7316 /*CodeModel=*/std::nullopt, OptLevel));
7317}
7318
7319/// Heuristically determine the best-performant unroll factor for \p CLI. This
7320/// depends on the target processor. We are re-using the same heuristics as the
7321/// LoopUnrollPass.
7322 static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
7323 Function *F = CLI->getFunction();
7324
7325 // Assume the user requests the most aggressive unrolling, even if the rest of
7326 // the code is optimized using a lower setting.
7327 CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
7328 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7329
7330 // Blocks must have terminators.
7331 // FIXME: Don't run analyses on incomplete/invalid IR.
7332 SmallVector<Instruction *> UIs;
7333 for (BasicBlock &BB : *F)
7334 if (!BB.hasTerminator())
7335 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7336
7337 FunctionAnalysisManager FAM;
7338 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7339 FAM.registerPass([]() { return AssumptionAnalysis(); });
7340 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7341 FAM.registerPass([]() { return LoopAnalysis(); });
7342 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7343 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7344 TargetIRAnalysis TIRA;
7345 if (TM)
7346 TIRA = TargetIRAnalysis(
7347 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7348 FAM.registerPass([&]() { return TIRA; });
7349
7350 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7351 ScalarEvolutionAnalysis SEA;
7352 ScalarEvolution &&SE = SEA.run(*F, FAM);
7353 DominatorTreeAnalysis DTA;
7354 DominatorTree &&DT = DTA.run(*F, FAM);
7355 LoopAnalysis LIA;
7356 LoopInfo &&LI = LIA.run(*F, FAM);
7357 AssumptionAnalysis ACT;
7358 AssumptionCache &&AC = ACT.run(*F, FAM);
7359 OptimizationRemarkEmitter ORE{F};
7360
7361 for (Instruction *I : UIs)
7362 I->eraseFromParent();
7363
7364 Loop *L = LI.getLoopFor(CLI->getHeader());
7365 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7366
7367 TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
7368 L, SE, TTI,
7369 /*BlockFrequencyInfo=*/nullptr,
7370 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7371 /*UserThreshold=*/std::nullopt,
7372 /*UserCount=*/std::nullopt,
7373 /*UserAllowPartial=*/true,
7374 /*UserAllowRuntime=*/true,
7375 /*UserUpperBound=*/std::nullopt,
7376 /*UserFullUnrollMaxCount=*/std::nullopt);
7377
7378 UP.Force = true;
7379
7380 // Account for additional optimizations taking place before the LoopUnrollPass
7381 // would unroll the loop.
7382 UP.Threshold *= UnrollThresholdFactor;
7383 UP.PartialThreshold *= UnrollThresholdFactor;
7384
7385 // Use normal unroll factors even if the rest of the code is optimized for
7386 // size.
7387 UP.OptSizeThreshold = UP.Threshold;
7388 UP.PartialOptSizeThreshold = UP.PartialThreshold;
7389
7390 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7391 << " Threshold=" << UP.Threshold << "\n"
7392 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7393 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7394 << " PartialOptSizeThreshold="
7395 << UP.PartialOptSizeThreshold << "\n");
7396
7397 // Disable peeling.
7398 TargetTransformInfo::PeelingPreferences PP =
7399 gatherPeelingPreferences(L, SE, TTI,
7400 /*UserAllowPeeling=*/false,
7401 /*UserAllowProfileBasedPeeling=*/false,
7402 /*UnrollingSpecficValues=*/false);
7403
7404 SmallPtrSet<const Value *, 32> EphValues;
7405 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7406
7407 // Assume that reads and writes to stack variables can be eliminated by
7408 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7409 // size.
7410 for (BasicBlock *BB : L->blocks()) {
7411 for (Instruction &I : *BB) {
7412 Value *Ptr;
7413 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7414 Ptr = Load->getPointerOperand();
7415 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7416 Ptr = Store->getPointerOperand();
7417 } else
7418 continue;
7419
7420 Ptr = Ptr->stripPointerCasts();
7421
7422 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7423 if (Alloca->getParent() == &F->getEntryBlock())
7424 EphValues.insert(&I);
7425 }
7426 }
7427 }
7428
7429 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7430
7431 // Loop is not unrollable if the loop contains certain instructions.
7432 if (!UCE.canUnroll()) {
7433 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7434 return 1;
7435 }
7436
7437 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7438 << "\n");
7439
7440 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7441 // be able to use it.
7442 int TripCount = 0;
7443 int MaxTripCount = 0;
7444 bool MaxOrZero = false;
7445 unsigned TripMultiple = 0;
7446
7447 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7448 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7449 unsigned Factor = UP.Count;
7450 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7451
7452 // This function returns 1 to signal that the loop should not be unrolled.
7453 if (Factor == 0)
7454 return 1;
7455 return Factor;
7456}
7457
7458 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
7459 int32_t Factor,
7460 CanonicalLoopInfo **UnrolledCLI) {
7461 assert(Factor >= 0 && "Unroll factor must not be negative");
7462
7463 Function *F = Loop->getFunction();
7464 LLVMContext &Ctx = F->getContext();
7465
7466 // If the unrolled loop is not used for another loop-associated directive, it
7467 // is sufficient to add metadata for the LoopUnrollPass.
7468 if (!UnrolledCLI) {
7469 SmallVector<Metadata *, 2> LoopMetadata;
7470 LoopMetadata.push_back(
7471 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7472
7473 if (Factor >= 1) {
7474 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
7475 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7476 LoopMetadata.push_back(MDNode::get(
7477 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7478 }
7479
7480 addLoopMetadata(Loop, LoopMetadata);
7481 return;
7482 }
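// For instance, "#pragma omp unroll partial(4)" with no consumer of the
// unrolled loop attaches metadata roughly like (illustrative only):
//   !llvm.loop !1
//   !1 = distinct !{!1, !{!"llvm.loop.unroll.enable"},
//                   !{!"llvm.loop.unroll.count", i32 4}}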
7483
7484 // Heuristically determine the unroll factor.
7485 if (Factor == 0)
7486 Factor = computeHeuristicUnrollFactor(Loop);
7487
7488 // No change required with unroll factor 1.
7489 if (Factor == 1) {
7490 *UnrolledCLI = Loop;
7491 return;
7492 }
7493
7494 assert(Factor >= 2 &&
7495 "unrolling only makes sense with a factor of 2 or larger");
7496
7497 Type *IndVarTy = Loop->getIndVarType();
7498
7499 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7500 // unroll the inner loop.
7501 Value *FactorVal =
7502 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7503 /*isSigned=*/false));
7504 std::vector<CanonicalLoopInfo *> LoopNest =
7505 tileLoops(DL, {Loop}, {FactorVal});
7506 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7507 *UnrolledCLI = LoopNest[0];
7508 CanonicalLoopInfo *InnerLoop = LoopNest[1];
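// Conceptually the transformation is (illustrative pseudo-code; the actual
// tile loops are generated by tileLoops):
//   for (i = 0; i < N; ++i) body(i);
// becomes
//   for (i = 0; i < N; i += Factor)              // *UnrolledCLI (outer loop)
//     for (j = i; j < min(i + Factor, N); ++j)   // InnerLoop, fully unrolled
//       body(j);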
7509
7510 // LoopUnrollPass can only fully unroll loops with constant trip count.
7511 // Unroll by the unroll factor with a fallback epilog for the remainder
7512 // iterations if necessary.
7513 ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
7514 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7515 addLoopMetadata(
7516 InnerLoop,
7517 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7518 MDNode::get(
7519 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7520
7521#ifndef NDEBUG
7522 (*UnrolledCLI)->assertOK();
7523#endif
7524}
7525
7528 llvm::Value *BufSize, llvm::Value *CpyBuf,
7529 llvm::Value *CpyFn, llvm::Value *DidIt) {
7530 if (!updateToLocation(Loc))
7531 return Loc.IP;
7532
7533 uint32_t SrcLocStrSize;
7534 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7535 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7536 Value *ThreadId = getOrCreateThreadID(Ident);
7537
7538 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7539
7540 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7541
7542 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7543 createRuntimeFunctionCall(Fn, Args);
7544
7545 return Builder.saveIP();
7546}
7547
7548 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
7549 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7550 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7551 ArrayRef<llvm::Function *> CPFuncs) {
7552
7553 if (!updateToLocation(Loc))
7554 return Loc.IP;
7555
7556 // If needed allocate and initialize `DidIt` with 0.
7557 // DidIt: flag variable: 1=single thread; 0=not single thread.
7558 llvm::Value *DidIt = nullptr;
7559 if (!CPVars.empty()) {
7560 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7561 Builder.CreateStore(Builder.getInt32(0), DidIt);
7562 }
7563
7564 Directive OMPD = Directive::OMPD_single;
7565 uint32_t SrcLocStrSize;
7566 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7567 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7568 Value *ThreadId = getOrCreateThreadID(Ident);
7569 Value *Args[] = {Ident, ThreadId};
7570
7571 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7572 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7573
7574 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7575 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7576
7577 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7578 if (Error Err = FiniCB(IP))
7579 return Err;
7580
7581 // The thread that executes the single region must set `DidIt` to 1.
7582 // This is used by __kmpc_copyprivate, to know if the caller is the
7583 // single thread or not.
7584 if (DidIt)
7585 Builder.CreateStore(Builder.getInt32(1), DidIt);
7586
7587 return Error::success();
7588 };
7589
7590 // generates the following:
7591 // if (__kmpc_single()) {
7592 // .... single region ...
7593 // __kmpc_end_single
7594 // }
7595 // __kmpc_copyprivate
7596 // __kmpc_barrier
7597
7598 InsertPointOrErrorTy AfterIP =
7599 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7600 /*Conditional*/ true,
7601 /*hasFinalize*/ true);
7602 if (!AfterIP)
7603 return AfterIP.takeError();
7604
7605 if (DidIt) {
7606 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7607 // NOTE BufSize is currently unused, so just pass 0.
7608 createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
7609 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7610 CPFuncs[I], DidIt);
7611 // NOTE __kmpc_copyprivate already inserts a barrier
7612 } else if (!IsNowait) {
7613 InsertPointOrErrorTy AfterIP =
7614 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7615 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7616 /* CheckCancelFlag */ false);
7617 if (!AfterIP)
7618 return AfterIP.takeError();
7619 }
7620 return Builder.saveIP();
7621}
7622
7625 BodyGenCallbackTy BodyGenCB,
7626 FinalizeCallbackTy FiniCB, bool IsNowait) {
7627
7628 if (!updateToLocation(Loc))
7629 return Loc.IP;
7630
7631 // All threads execute the scope body; there is no conditional entry.
7632 InsertPointOrErrorTy AfterIP = EmitOMPInlinedRegion(
7633 Directive::OMPD_scope, /*EntryCall=*/nullptr, /*ExitCall=*/nullptr,
7634 BodyGenCB, FiniCB, /*Conditional=*/false, /*HasFinalize=*/true,
7635 /*IsCancellable=*/false);
7636 if (!AfterIP)
7637 return AfterIP.takeError();
7638
7639 Builder.restoreIP(*AfterIP);
7640 if (!IsNowait) {
7641 AfterIP = createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7642 omp::Directive::OMPD_unknown,
7643 /*ForceSimpleCall=*/false,
7644 /*CheckCancelFlag=*/false);
7645 if (!AfterIP)
7646 return AfterIP.takeError();
7647 }
7648 return Builder.saveIP();
7649}
7650
7652 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7653 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7654
7655 if (!updateToLocation(Loc))
7656 return Loc.IP;
7657
7658 Directive OMPD = Directive::OMPD_critical;
7659 uint32_t SrcLocStrSize;
7660 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7661 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7662 Value *ThreadId = getOrCreateThreadID(Ident);
7663 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7664 Value *Args[] = {Ident, ThreadId, LockVar};
7665
7666 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7667 Function *RTFn = nullptr;
7668 if (HintInst) {
7669 // Add Hint to entry Args and create call
7670 EnterArgs.push_back(HintInst);
7671 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7672 } else {
7673 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7674 }
7675 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7676
7677 Function *ExitRTLFn =
7678 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7679 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7680
7681 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7682 /*Conditional*/ false, /*hasFinalize*/ true);
7683}
7684
7687 InsertPointTy AllocaIP, unsigned NumLoops,
7688 ArrayRef<llvm::Value *> StoreValues,
7689 const Twine &Name, bool IsDependSource) {
7690 assert(
7691 llvm::all_of(StoreValues,
7692 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7693 "OpenMP runtime requires depend vec with i64 type");
7694
7695 if (!updateToLocation(Loc))
7696 return Loc.IP;
7697
7698 // Allocate space for vector and generate alloc instruction.
7699 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7700 Builder.restoreIP(AllocaIP);
7701 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7702 ArgsBase->setAlignment(Align(8));
7704
7705 // Store the index value with offset in depend vector.
7706 for (unsigned I = 0; I < NumLoops; ++I) {
7707 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7708 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7709 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7710 STInst->setAlignment(Align(8));
7711 }
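// E.g. for "ordered depend(sink: i-1, j)" over two loops the vector holds
// {i-1, j} and is handed to __kmpc_doacross_wait below, while
// "ordered depend(source)" stores the current iteration vector and calls
// __kmpc_doacross_post (rough sketch of the intended use).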
7712
7713 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7714 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7715
7716 uint32_t SrcLocStrSize;
7717 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7718 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7719 Value *ThreadId = getOrCreateThreadID(Ident);
7720 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7721
7722 Function *RTLFn = nullptr;
7723 if (IsDependSource)
7724 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7725 else
7726 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7727 createRuntimeFunctionCall(RTLFn, Args);
7728
7729 return Builder.saveIP();
7730}
7731
7733 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7734 FinalizeCallbackTy FiniCB, bool IsThreads) {
7735 if (!updateToLocation(Loc))
7736 return Loc.IP;
7737
7738 Directive OMPD = Directive::OMPD_ordered;
7739 Instruction *EntryCall = nullptr;
7740 Instruction *ExitCall = nullptr;
7741
7742 if (IsThreads) {
7743 uint32_t SrcLocStrSize;
7744 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7745 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7746 Value *ThreadId = getOrCreateThreadID(Ident);
7747 Value *Args[] = {Ident, ThreadId};
7748
7749 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7750 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7751
7752 Function *ExitRTLFn =
7753 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7754 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7755 }
7756
7757 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7758 /*Conditional*/ false, /*hasFinalize*/ true);
7759}
7760
7761OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7762 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7763 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7764 bool HasFinalize, bool IsCancellable) {
7765
7766 if (HasFinalize)
7767 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7768
7769 // Create inlined region's entry and body blocks, in preparation
7770 // for conditional creation
7771 BasicBlock *EntryBB = Builder.GetInsertBlock();
7772 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
7773 if (!isa_and_nonnull<BranchInst>(SplitPos))
7774 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7775 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7776 BasicBlock *FiniBB =
7777 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7778
7779 Builder.SetInsertPoint(EntryBB->getTerminator());
7780 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7781
7782 // generate body
7783 if (Error Err =
7784 BodyGenCB(/* AllocaIP */ InsertPointTy(),
7785 /* CodeGenIP */ Builder.saveIP(), /* DeallocBlocks */ {}))
7786 return Err;
7787
7788 // emit exit call and do any needed finalization.
7789 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7790 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7791 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7792 "Unexpected control flow graph state!!");
7793 InsertPointOrErrorTy AfterIP =
7794 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7795 if (!AfterIP)
7796 return AfterIP.takeError();
7797
7798 // If we are skipping the region of a non conditional, remove the exit
7799 // block, and clear the builder's insertion point.
7800 assert(SplitPos->getParent() == ExitBB &&
7801 "Unexpected Insertion point location!");
7802 auto merged = MergeBlockIntoPredecessor(ExitBB);
7803 BasicBlock *ExitPredBB = SplitPos->getParent();
7804 auto InsertBB = merged ? ExitPredBB : ExitBB;
7805 if (!isa_and_nonnull<BranchInst>(SplitPos))
7806 SplitPos->eraseFromParent();
7807 Builder.SetInsertPoint(InsertBB);
7808
7809 return Builder.saveIP();
7810}
7811
7812OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7813 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7814 // If there is nothing to do, return the current insertion point.
7815 if (!Conditional || !EntryCall)
7816 return Builder.saveIP();
7817
7818 BasicBlock *EntryBB = Builder.GetInsertBlock();
7819 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7820 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7821 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7822
7823 // Emit thenBB and set the Builder's insertion point there for
7824 // body generation next. Place the block after the current block.
7825 Function *CurFn = EntryBB->getParent();
7826 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7827
7828 // Move Entry branch to end of ThenBB, and replace with conditional
7829 // branch (If-stmt)
7830 Instruction *EntryBBTI = EntryBB->getTerminator();
7831 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7832 EntryBBTI->removeFromParent();
7833 Builder.SetInsertPoint(UI);
7834 Builder.Insert(EntryBBTI);
7835 UI->eraseFromParent();
7836 Builder.SetInsertPoint(ThenBB->getTerminator());
7837
7838 // return an insertion point to ExitBB.
7839 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7840}
7841
7842OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7843 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7844 bool HasFinalize) {
7845
7846 Builder.restoreIP(FinIP);
7847
7848 // If there is finalization to do, emit it before the exit call
7849 if (HasFinalize) {
7850 assert(!FinalizationStack.empty() &&
7851 "Unexpected finalization stack state!");
7852
7853 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7854 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7855
7856 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7857 return std::move(Err);
7858
7859 // Exit condition: insertion point is before the terminator of the new Fini
7860 // block
7861 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7862 }
7863
7864 if (!ExitCall)
7865 return Builder.saveIP();
7866
7867 // Place the exit call as the last instruction before the finalization block terminator.
7868 ExitCall->removeFromParent();
7869 Builder.Insert(ExitCall);
7870
7871 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7872 ExitCall->getIterator());
7873}
7874
7876 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7877 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7878 if (!IP.isSet())
7879 return IP;
7880
7882
7883 // creates the following CFG structure
7884 // OMP_Entry : (MasterAddr != PrivateAddr)?
7885 // F T
7886 // | \
7887 // | copyin.not.master
7888 // | /
7889 // v /
7890 // copyin.not.master.end
7891 // |
7892 // v
7893 // OMP.Entry.Next
7894
7895 BasicBlock *OMP_Entry = IP.getBlock();
7896 Function *CurFn = OMP_Entry->getParent();
7897 BasicBlock *CopyBegin =
7898 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7899 BasicBlock *CopyEnd = nullptr;
7900
7901 // If entry block is terminated, split to preserve the branch to following
7902 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7903 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
7904 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7905 "copyin.not.master.end");
7906 OMP_Entry->getTerminator()->eraseFromParent();
7907 } else {
7908 CopyEnd =
7909 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7910 }
7911
7912 Builder.SetInsertPoint(OMP_Entry);
7913 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7914 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7915 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7916 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7917
7918 Builder.SetInsertPoint(CopyBegin);
7919 if (BranchtoEnd)
7920 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7921
7922 return Builder.saveIP();
7923}
7924
7926 Value *Size, Value *Allocator,
7927 std::string Name) {
7929 if (!updateToLocation(Loc))
7930 return nullptr;
7931
7932 uint32_t SrcLocStrSize;
7933 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7934 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7935 Value *ThreadId = getOrCreateThreadID(Ident);
7936 Value *Args[] = {ThreadId, Size, Allocator};
7937
7938 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7939
7940 return createRuntimeFunctionCall(Fn, Args, Name);
7941}
7942
7944 Value *Align, Value *Size,
7945 Value *Allocator,
7946 std::string Name) {
7948 if (!updateToLocation(Loc))
7949 return nullptr;
7950
7951 uint32_t SrcLocStrSize;
7952 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7953 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7954 Value *ThreadId = getOrCreateThreadID(Ident);
7955 Value *Args[] = {ThreadId, Align, Size, Allocator};
7956
7957 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_aligned_alloc);
7958
7959 return Builder.CreateCall(Fn, Args, Name);
7960}
7961
7963 Value *Addr, Value *Allocator,
7964 std::string Name) {
7966 if (!updateToLocation(Loc))
7967 return nullptr;
7968
7969 uint32_t SrcLocStrSize;
7970 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7971 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7972 Value *ThreadId = getOrCreateThreadID(Ident);
7973 Value *Args[] = {ThreadId, Addr, Allocator};
7974 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7975 return createRuntimeFunctionCall(Fn, Args, Name);
7976}
7977
7979 Value *Size,
7980 const Twine &Name) {
7983
7984 Value *Args[] = {Size};
7985 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared);
7986 CallInst *Call = Builder.CreateCall(Fn, Args, Name);
7987 Call->addRetAttr(Attribute::getWithAlignment(
7988 M.getContext(), M.getDataLayout().getPrefTypeAlign(Int64)));
7989 return Call;
7990}
7991
7993 Type *VarType,
7994 const Twine &Name) {
7995 return createOMPAllocShared(
7996 Loc, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)), Name);
7997}
7998
8000 Value *Addr, Value *Size,
8001 const Twine &Name) {
8004
8005 Value *Args[] = {Addr, Size};
8006 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared);
8007 return Builder.CreateCall(Fn, Args, Name);
8008}
8009
8011 Value *Addr, Type *VarType,
8012 const Twine &Name) {
8013 return createOMPFreeShared(
8014 Loc, Addr, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)),
8015 Name);
8016}
8017
8019 const LocationDescription &Loc, Value *InteropVar,
8020 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
8021 Value *DependenceAddress, bool HaveNowaitClause) {
8024
8025 uint32_t SrcLocStrSize;
8026 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8027 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8028 Value *ThreadId = getOrCreateThreadID(Ident);
8029 if (Device == nullptr)
8030 Device = Constant::getAllOnesValue(Int32);
8031 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
8032 if (NumDependences == nullptr) {
8033 NumDependences = ConstantInt::get(Int32, 0);
8034 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8035 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8036 }
8037 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8038 Value *Args[] = {
8039 Ident, ThreadId, InteropVar, InteropTypeVal,
8040 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
8041
8042 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
8043
8044 return createRuntimeFunctionCall(Fn, Args);
8045}
8046
8048 const LocationDescription &Loc, Value *InteropVar, Value *Device,
8049 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
8052
8053 uint32_t SrcLocStrSize;
8054 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8055 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8056 Value *ThreadId = getOrCreateThreadID(Ident);
8057 if (Device == nullptr)
8058 Device = Constant::getAllOnesValue(Int32);
8059 if (NumDependences == nullptr) {
8060 NumDependences = ConstantInt::get(Int32, 0);
8061 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8062 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8063 }
8064 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8065 Value *Args[] = {
8066 Ident, ThreadId, InteropVar, Device,
8067 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8068
8069 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
8070
8071 return createRuntimeFunctionCall(Fn, Args);
8072}
8073
8075 Value *InteropVar, Value *Device,
8076 Value *NumDependences,
8077 Value *DependenceAddress,
8078 bool HaveNowaitClause) {
8081 uint32_t SrcLocStrSize;
8082 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8083 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8084 Value *ThreadId = getOrCreateThreadID(Ident);
8085 if (Device == nullptr)
8086 Device = Constant::getAllOnesValue(Int32);
8087 if (NumDependences == nullptr) {
8088 NumDependences = ConstantInt::get(Int32, 0);
8089 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8090 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8091 }
8092 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8093 Value *Args[] = {
8094 Ident, ThreadId, InteropVar, Device,
8095 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8096
8097 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
8098
8099 return createRuntimeFunctionCall(Fn, Args);
8100}
8101
8104 llvm::ConstantInt *Size, const llvm::Twine &Name) {
8107
8108 uint32_t SrcLocStrSize;
8109 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8110 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8111 Value *ThreadId = getOrCreateThreadID(Ident);
8112 Constant *ThreadPrivateCache =
8113 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
8114 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
8115
8116 Function *Fn =
8117 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
8118
8119 return createRuntimeFunctionCall(Fn, Args);
8120}
8121
8123 const LocationDescription &Loc,
8125 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
8126 "expected num_threads and num_teams to be specified");
8127
8128 if (!updateToLocation(Loc))
8129 return Loc.IP;
8130
8131 uint32_t SrcLocStrSize;
8132 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8133 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8134 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
8135 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
8136 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD &&
8137 Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP);
8138 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
8139 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
8140
8141 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
8142 Function *Kernel = DebugKernelWrapper;
8143
8144 // We need to strip the debug prefix to get the correct kernel name.
8145 StringRef KernelName = Kernel->getName();
8146 const std::string DebugPrefix = "_debug__";
8147 if (KernelName.ends_with(DebugPrefix)) {
8148 KernelName = KernelName.drop_back(DebugPrefix.length());
8149 Kernel = M.getFunction(KernelName);
8150 assert(Kernel && "Expected the real kernel to exist");
8151 }
8152
8153 // Manifest the launch configuration in the metadata matching the kernel
8154 // environment.
8155 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
8156 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
8157
8158 // If MaxThreads not set, select the maximum between the default workgroup
8159 // size and the MinThreads value.
8160 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
8161 if (MaxThreadsVal < 0) {
8162 if (hasGridValue(T)) {
8163 MaxThreadsVal =
8164 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
8165 Attrs.MinThreads);
8166 } else {
8167 MaxThreadsVal = Attrs.MinThreads;
8168 }
8169 }
8170
8171 if (MaxThreadsVal > 0)
8172 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
8173
8174 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
8175 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
8176 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
8177 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
8178 Constant *ReductionDataSize =
8179 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
8180 Constant *ReductionBufferLength =
8181 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
8182
8183 Function *Fn = getOrCreateRuntimeFunctionPtr(
8184 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
8185 const DataLayout &DL = Fn->getDataLayout();
8186
8187 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
8188 Constant *DynamicEnvironmentInitializer =
8189 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
8190 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
8191 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
8192 DynamicEnvironmentInitializer, DynamicEnvironmentName,
8193 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8194 DL.getDefaultGlobalsAddressSpace());
8195 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8196
8197 Constant *DynamicEnvironment =
8198 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
8199 ? DynamicEnvironmentGV
8200 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
8201 DynamicEnvironmentPtr);
8202
8203 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
8204 ConfigurationEnvironment, {
8205 UseGenericStateMachineVal,
8206 MayUseNestedParallelismVal,
8207 IsSPMDVal,
8208 MinThreads,
8209 MaxThreads,
8210 MinTeams,
8211 MaxTeams,
8212 ReductionDataSize,
8213 ReductionBufferLength,
8214 });
8215 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
8216 KernelEnvironment, {
8217 ConfigurationEnvironmentInitializer,
8218 Ident,
8219 DynamicEnvironment,
8220 });
8221 std::string KernelEnvironmentName =
8222 (KernelName + "_kernel_environment").str();
8223 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
8224 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
8225 KernelEnvironmentInitializer, KernelEnvironmentName,
8226 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8227 DL.getDefaultGlobalsAddressSpace());
8228 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8229
8230 Constant *KernelEnvironment =
8231 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
8232 ? KernelEnvironmentGV
8233 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
8234 KernelEnvironmentPtr);
8235 Value *KernelLaunchEnvironment =
8236 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
8237 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
8238 KernelLaunchEnvironment =
8239 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
8240 ? KernelLaunchEnvironment
8241 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
8242 KernelLaunchEnvParamTy);
8243 CallInst *ThreadKind = createRuntimeFunctionCall(
8244 Fn, {KernelEnvironment, KernelLaunchEnvironment});
8245
8246 Value *ExecUserCode = Builder.CreateICmpEQ(
8247 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
8248 "exec_user_code");
8249
8250 // ThreadKind = __kmpc_target_init(...)
8251 // if (ThreadKind == -1)
8252 // user_code
8253 // else
8254 // return;
8255
8256 auto *UI = Builder.CreateUnreachable();
8257 BasicBlock *CheckBB = UI->getParent();
8258 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
8259
8260 BasicBlock *WorkerExitBB = BasicBlock::Create(
8261 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
8262 Builder.SetInsertPoint(WorkerExitBB);
8263 Builder.CreateRetVoid();
8264
8265 auto *CheckBBTI = CheckBB->getTerminator();
8266 Builder.SetInsertPoint(CheckBBTI);
8267 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
8268
8269 CheckBBTI->eraseFromParent();
8270 UI->eraseFromParent();
8271
8272 // Continue in the "user_code" block, see diagram above and in
8273 // openmp/libomptarget/deviceRTLs/common/include/target.h .
8274 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
8275}
8276
8278 int32_t TeamsReductionDataSize,
8279 int32_t TeamsReductionBufferLength) {
8280 if (!updateToLocation(Loc))
8281 return;
8282
8283 Function *Fn = getOrCreateRuntimeFunctionPtr(
8284 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
8285
8286 createRuntimeFunctionCall(Fn, {});
8287
8288 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
8289 return;
8290
8291 Function *Kernel = Builder.GetInsertBlock()->getParent();
8292 // We need to strip the debug prefix to get the correct kernel name.
8293 StringRef KernelName = Kernel->getName();
8294 const std::string DebugPrefix = "_debug__";
8295 if (KernelName.ends_with(DebugPrefix))
8296 KernelName = KernelName.drop_back(DebugPrefix.length());
8297 auto *KernelEnvironmentGV =
8298 M.getNamedGlobal((KernelName + "_kernel_environment").str());
8299 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
8300 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
8301 auto *NewInitializer = ConstantFoldInsertValueInstruction(
8302 KernelEnvironmentInitializer,
8303 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
8304 NewInitializer = ConstantFoldInsertValueInstruction(
8305 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
8306 {0, 8});
8307 KernelEnvironmentGV->setInitializer(NewInitializer);
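// Note: the indices {0, 7} and {0, 8} address the ReductionDataSize and
// ReductionBufferLength fields of the ConfigurationEnvironment member, which
// is the first field of the kernel environment struct set up in
// createTargetInit.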
8308}
8309
8310static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
8311 bool Min) {
8312 if (Kernel.hasFnAttribute(Name)) {
8313 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
8314 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
8315 }
8316 Kernel.addFnAttr(Name, llvm::utostr(Value));
8317}
8318
8319std::pair<int32_t, int32_t>
8321 int32_t ThreadLimit =
8322 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
8323
8324 if (T.isAMDGPU()) {
8325 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
8326 if (!Attr.isValid() || !Attr.isStringAttribute())
8327 return {0, ThreadLimit};
8328 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
8329 int32_t LB, UB;
8330 if (!llvm::to_integer(UBStr, UB, 10))
8331 return {0, ThreadLimit};
8332 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
8333 if (!llvm::to_integer(LBStr, LB, 10))
8334 return {0, UB};
8335 return {LB, UB};
8336 }
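// E.g. "amdgpu-flat-work-group-size"="1,1024" combined with
// "omp_target_thread_limit"=256 yields {1, 256} here (illustrative values).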
8337
8338 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
8339 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
8340 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
8341 }
8342 return {0, ThreadLimit};
8343}
8344
8346 Function &Kernel, int32_t LB,
8347 int32_t UB) {
8348 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
8349
8350 if (T.isAMDGPU()) {
8351 Kernel.addFnAttr("amdgpu-flat-work-group-size",
8352 llvm::utostr(LB) + "," + llvm::utostr(UB));
8353 return;
8354 }
8355
8356 updateNVPTXAttr(Kernel, NVVMAttr::MaxNTID, UB, /*Min=*/true);
8357 }
8358
8359std::pair<int32_t, int32_t>
8361 // TODO: Read from backend annotations if available.
8362 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8363}
8364
8366 int32_t LB, int32_t UB) {
8367 if (UB > 0) {
8368 if (T.isNVPTX())
8370 if (T.isAMDGPU())
8371 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(UB) + ",1,1");
8372 }
8373
8374 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8375}
8376
8377void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8378 Function *OutlinedFn) {
8379 if (Config.isTargetDevice()) {
8380 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
8381 // TODO: Determine if DSO local can be set to true.
8382 OutlinedFn->setDSOLocal(false);
8383 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
8384 if (T.isAMDGCN())
8385 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
8386 else if (T.isNVPTX())
8387 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
8388 else if (T.isSPIRV())
8389 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
8390 }
8391}
8392
8393Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8394 StringRef EntryFnIDName) {
8395 if (Config.isTargetDevice()) {
8396 assert(OutlinedFn && "The outlined function must exist if embedded");
8397 return OutlinedFn;
8398 }
8399
8400 return new GlobalVariable(
8401 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8402 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8403}
8404
8405Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8406 StringRef EntryFnName) {
8407 if (OutlinedFn)
8408 return OutlinedFn;
8409
8410 assert(!M.getGlobalVariable(EntryFnName, true) &&
8411 "Named kernel already exists?");
8412 return new GlobalVariable(
8413 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8414 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8415}
8416
8418 TargetRegionEntryInfo &EntryInfo,
8419 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8420 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8421
8422 SmallString<64> EntryFnName;
8423 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8424
8425 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8426 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8427 if (!CBResult)
8428 return CBResult.takeError();
8429 OutlinedFn = *CBResult;
8430 } else {
8431 OutlinedFn = nullptr;
8432 }
8433
8434 // If this target outline function is not an offload entry, we don't need to
8435 // register it. This may be in the case of a false if clause, or if there are
8436 // no OpenMP targets.
8437 if (!IsOffloadEntry)
8438 return Error::success();
8439
8440 std::string EntryFnIDName =
8441 Config.isTargetDevice()
8442 ? std::string(EntryFnName)
8443 : createPlatformSpecificName({EntryFnName, "region_id"});
8444
8445 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8446 EntryFnName, EntryFnIDName);
8447 return Error::success();
8448}
8449
8451 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8452 StringRef EntryFnName, StringRef EntryFnIDName) {
8453 if (OutlinedFn)
8454 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8455 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8456 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8457 OffloadInfoManager.registerTargetRegionEntryInfo(
8458 EntryInfo, EntryAddr, OutlinedFnID,
8459 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
8460 return OutlinedFnID;
8461}
8462
8464 const LocationDescription &Loc, InsertPointTy AllocaIP,
8465 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
8466 Value *DeviceID, Value *IfCond, TargetDataInfo &Info,
8467 GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB,
8468 omp::RuntimeFunction *MapperFunc,
8470 BodyGenTy BodyGenType)>
8471 BodyGenCB,
8472 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8473 if (!updateToLocation(Loc))
8474 return InsertPointTy();
8475
8476 Builder.restoreIP(CodeGenIP);
8477
8478 bool IsStandAlone = !BodyGenCB;
8479 MapInfosTy *MapInfo;
8480 // Generate the code for the opening of the data environment. Capture all the
8481 // arguments of the runtime call by reference because they are used in the
8482 // closing of the region.
8483 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8484 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8485 MapInfo = &GenMapInfoCB(Builder.saveIP());
8486 if (Error Err = emitOffloadingArrays(
8487 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8488 /*IsNonContiguous=*/true, DeviceAddrCB))
8489 return Err;
8490
8491 TargetDataRTArgs RTArgs;
8492 emitOffloadingArraysArgument(Builder, RTArgs, Info);
8493
8494 // Emit the number of elements in the offloading arrays.
8495 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8496
8497 // Source location for the ident struct
8498 if (!SrcLocInfo) {
8499 uint32_t SrcLocStrSize;
8500 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8501 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8502 }
8503
8504 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8505 SrcLocInfo, DeviceID,
8506 PointerNum, RTArgs.BasePointersArray,
8507 RTArgs.PointersArray, RTArgs.SizesArray,
8508 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8509 RTArgs.MappersArray};
8510
8511 if (IsStandAlone) {
8512 assert(MapperFunc && "MapperFunc missing for standalone target data");
8513
8514 auto TaskBodyCB = [&](Value *, Value *,
8516 if (Info.HasNoWait) {
8517 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8521 }
8522
8524 OffloadingArgs);
8525
8526 if (Info.HasNoWait) {
8527 BasicBlock *OffloadContBlock =
8528 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8529 Function *CurFn = Builder.GetInsertBlock()->getParent();
8530 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8531 Builder.restoreIP(Builder.saveIP());
8532 }
8533 return Error::success();
8534 };
8535
8536 bool RequiresOuterTargetTask = Info.HasNoWait;
8537 if (!RequiresOuterTargetTask)
8538 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8539 /*TargetTaskAllocaIP=*/{}));
8540 else
8541 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8542 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8543 } else {
8544 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8545 omp::OMPRTL___tgt_target_data_begin_mapper);
8546
8547 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8548
8549 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8550 if (isa<AllocaInst>(DeviceMap.second.second)) {
8551 auto *LI =
8552 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8553 Builder.CreateStore(LI, DeviceMap.second.second);
8554 }
8555 }
8556
8557 // If device pointer privatization is required, emit the body of the
8558 // region here. It will have to be duplicated: with and without
8559 // privatization.
8560 InsertPointOrErrorTy AfterIP =
8561 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8562 if (!AfterIP)
8563 return AfterIP.takeError();
8564 Builder.restoreIP(*AfterIP);
8565 }
8566 return Error::success();
8567 };
8568
8569 // If we need device pointer privatization, we need to emit the body of the
8570 // region with no privatization in the 'else' branch of the conditional.
8571 // Otherwise, we don't have to do anything.
8572 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8573 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8574 InsertPointOrErrorTy AfterIP =
8575 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8576 if (!AfterIP)
8577 return AfterIP.takeError();
8578 Builder.restoreIP(*AfterIP);
8579 return Error::success();
8580 };
8581
8582 // Generate code for the closing of the data region.
8583 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8584 ArrayRef<BasicBlock *> DeallocBlocks) {
8585 TargetDataRTArgs RTArgs;
8586 Info.EmitDebug = !MapInfo->Names.empty();
8587 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8588
8589 // Emit the number of elements in the offloading arrays.
8590 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8591
8592 // Source location for the ident struct
8593 if (!SrcLocInfo) {
8594 uint32_t SrcLocStrSize;
8595 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8596 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8597 }
8598
8599 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8600 PointerNum, RTArgs.BasePointersArray,
8601 RTArgs.PointersArray, RTArgs.SizesArray,
8602 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8603 RTArgs.MappersArray};
8604 Function *EndMapperFunc =
8605 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8606
8607 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8608 return Error::success();
8609 };
8610
8611 // We don't have to do anything to close the region if the if clause evaluates
8612 // to false.
8613 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8614 ArrayRef<BasicBlock *> DeallocBlocks) {
8615 return Error::success();
8616 };
8617
8618 Error Err = [&]() -> Error {
8619 if (BodyGenCB) {
8620 Error Err = [&]() {
8621 if (IfCond)
8622 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8623 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8624 }();
8625
8626 if (Err)
8627 return Err;
8628
8629 // If we don't require privatization of device pointers, we emit the body
8630 // in between the runtime calls. This avoids duplicating the body code.
8631 InsertPointOrErrorTy AfterIP =
8632 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8633 if (!AfterIP)
8634 return AfterIP.takeError();
8635 restoreIPandDebugLoc(Builder, *AfterIP);
8636
8637 if (IfCond)
8638 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8639 return EndThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8640 }
8641 if (IfCond)
8642 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8643 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8644 }();
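// With a body callback and an if clause, the emitted structure is roughly
// (sketch):
//   if (cond) { __tgt_target_data_begin_mapper(...); body<Priv>; }
//   else      { body<DupNoPriv>; }
//   body<NoPriv>;
//   if (cond) { __tgt_target_data_end_mapper(...); }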
8645
8646 if (Err)
8647 return Err;
8648
8649 return Builder.saveIP();
8650}
8651
8654 bool IsGPUDistribute) {
8655 assert((IVSize == 32 || IVSize == 64) &&
8656 "IV size is not compatible with the omp runtime");
8657 RuntimeFunction Name;
8658 if (IsGPUDistribute)
8659 Name = IVSize == 32
8660 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8661 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8662 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8663 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8664 else
8665 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8666 : omp::OMPRTL___kmpc_for_static_init_4u)
8667 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8668 : omp::OMPRTL___kmpc_for_static_init_8u);
8669
8670 return getOrCreateRuntimeFunction(M, Name);
8671}
8672
8674 bool IVSigned) {
8675 assert((IVSize == 32 || IVSize == 64) &&
8676 "IV size is not compatible with the omp runtime");
8677 RuntimeFunction Name = IVSize == 32
8678 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8679 : omp::OMPRTL___kmpc_dispatch_init_4u)
8680 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8681 : omp::OMPRTL___kmpc_dispatch_init_8u);
8682
8683 return getOrCreateRuntimeFunction(M, Name);
8684}
8685
8687 bool IVSigned) {
8688 assert((IVSize == 32 || IVSize == 64) &&
8689 "IV size is not compatible with the omp runtime");
8690 RuntimeFunction Name = IVSize == 32
8691 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8692 : omp::OMPRTL___kmpc_dispatch_next_4u)
8693 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8694 : omp::OMPRTL___kmpc_dispatch_next_8u);
8695
8696 return getOrCreateRuntimeFunction(M, Name);
8697}
8698
8700 bool IVSigned) {
8701 assert((IVSize == 32 || IVSize == 64) &&
8702 "IV size is not compatible with the omp runtime");
8703 RuntimeFunction Name = IVSize == 32
8704 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8705 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8706 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8707 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8708
8709 return getOrCreateRuntimeFunction(M, Name);
8710}
8711
8713 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8714}
8715
8717 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8718 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8719
8720 DISubprogram *NewSP = Func->getSubprogram();
8721 if (!NewSP)
8722 return;
8723
8724 DenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
8725
8726 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8727 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8728 // Only use cached variable if the arg number matches. This is important
8729 // so that DIVariable created for privatized variables are not discarded.
8730 if (NewVar && (arg == NewVar->getArg()))
8731 return NewVar;
8732
8733 NewVar = llvm::DILocalVariable::get(
8734 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8735 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8736 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8737 return NewVar;
8738 };
8739
8740 auto UpdateDebugRecord = [&](auto *DR) {
8741 DILocalVariable *OldVar = DR->getVariable();
8742 unsigned ArgNo = 0;
8743 for (auto Loc : DR->location_ops()) {
8744 auto Iter = ValueReplacementMap.find(Loc);
8745 if (Iter != ValueReplacementMap.end()) {
8746 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8747 ArgNo = std::get<1>(Iter->second) + 1;
8748 }
8749 }
8750 if (ArgNo != 0)
8751 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8752 };
8753
8754 SmallVector<DbgVariableRecord *> DVRsToDelete;
8755 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8756 if (DVR->getNumVariableLocationOps() != 1u) {
8757 DVR->setKillLocation();
8758 return;
8759 }
8760 Value *Loc = DVR->getVariableLocationOp(0u);
8761 BasicBlock *CurBB = DVR->getParent();
8762 BasicBlock *RequiredBB = nullptr;
8763
8764 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8765 RequiredBB = LocInst->getParent();
8766 else if (isa<llvm::Argument>(Loc))
8767 RequiredBB = &DVR->getFunction()->getEntryBlock();
8768
8769 if (RequiredBB && RequiredBB != CurBB) {
8770 assert(!RequiredBB->empty());
8771 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8772 RequiredBB->back().getIterator());
8773 DVRsToDelete.push_back(DVR);
8774 }
8775 };
8776
8777 // The location and scope of variable intrinsics and records still point to
8778 // the parent function of the target region. Update them.
8779 for (Instruction &I : instructions(Func)) {
8781 "Unexpected debug intrinsic");
8782 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8783 UpdateDebugRecord(&DVR);
8784 MoveDebugRecordToCorrectBlock(&DVR);
8785 }
8786 }
8787 for (auto *DVR : DVRsToDelete)
8788 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8789 // An extra argument is passed to the device. Create the debug data for it.
8790 if (OMPBuilder.Config.isTargetDevice()) {
8791 DICompileUnit *CU = NewSP->getUnit();
8792 Module *M = Func->getParent();
8793 DIBuilder DB(*M, true, CU);
8794 DIType *VoidPtrTy =
8795 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8796 unsigned ArgNo = Func->arg_size();
8797 DILocalVariable *Var = DB.createParameterVariable(
8798 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8799 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8800 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8801 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8802 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8803 &(*Func->begin()));
8804 }
8805}
8806
8808 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8809 return cast<Operator>(V)->getOperand(0);
8810 return V;
8811}
8812
8814 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8816 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8819 SmallVector<Type *> ParameterTypes;
8820 if (OMPBuilder.Config.isTargetDevice()) {
8821 // All parameters to target devices are passed as pointers
8822 // or i64. This assumes 64-bit address spaces/pointers.
8823 for (auto &Arg : Inputs)
8824 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8825 ? Arg->getType()
8826 : Type::getInt64Ty(Builder.getContext()));
8827 } else {
8828 for (auto &Arg : Inputs)
8829 ParameterTypes.push_back(Arg->getType());
8830 }
8831
8832 // The implicit dyn_ptr argument is always the last parameter on both host
8833 // and device so the argument counts match without runtime manipulation.
8834 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8835 ParameterTypes.push_back(PtrTy);
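// As a rough sketch (names illustrative): for inputs (i32 %x by value, ptr %p)
// the device-side outlined function gets a signature like
//   define internal void @<entry_name>(i64 %x, ptr %p, ptr %dyn_ptr)
// while the host-side variant keeps the original parameter types plus the
// trailing dyn_ptr pointer.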
8836
8837 auto BB = Builder.GetInsertBlock();
8838 auto M = BB->getModule();
8839 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8840 /*isVarArg*/ false);
8841 auto Func =
8842 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8843
8844 // Forward target-cpu and target-features function attributes from the
8845 // original function to the new outlined function.
8846 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8847
8848 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8849 if (TargetCpuAttr.isStringAttribute())
8850 Func->addFnAttr(TargetCpuAttr);
8851
8852 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8853 if (TargetFeaturesAttr.isStringAttribute())
8854 Func->addFnAttr(TargetFeaturesAttr);
8855
8856 if (OMPBuilder.Config.isTargetDevice()) {
8857 Value *ExecMode =
8858 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8859 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8860 }
8861
8862 // Save insert point.
8863 IRBuilder<>::InsertPointGuard IPG(Builder);
8864 // We will generate the entries in the outlined function but the debug
8865 // location may still be pointing to the parent function. Reset it now.
8866 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8867
8868 // Generate the region into the function.
8869 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8870 Builder.SetInsertPoint(EntryBB);
8871
8872 // Insert target init call in the device compilation pass.
8873 if (OMPBuilder.Config.isTargetDevice())
8874 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8875
8876 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8877
8878 // As we embed the user code in the middle of our target region after we
8879 // generate entry code, we must move what allocas we can into the entry
8880 // block to avoid possibly breaking optimisations for the device.
8881 if (OMPBuilder.Config.isTargetDevice())
8882 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
8883
8884 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "target.exit");
8885 BasicBlock *OutlinedBodyBB =
8886 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8888 Builder.saveIP(),
8889 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()),
8890 ExitBB);
8891 if (!AfterIP)
8892 return AfterIP.takeError();
8893 Builder.SetInsertPoint(ExitBB);
8894
8895 // Insert target deinit call in the device compilation pass.
8896 if (OMPBuilder.Config.isTargetDevice())
8897 OMPBuilder.createTargetDeinit(Builder);
8898
8899 // Insert return instruction.
8900 Builder.CreateRetVoid();
8901
8902 // New Alloca IP at entry point of created device function.
8903 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8904 auto AllocaIP = Builder.saveIP();
8905
8906 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8907
8908 // Do not include the artificial dyn_ptr argument.
8909 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8910
8911  DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
8912
8913 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8914    // Things like GEPs can come in the form of Constants. Constants and
8915    // ConstantExprs do not know what they are contained in, so we must dig a
8916    // little to find an instruction in order to tell whether they are used
8917    // inside the function we are outlining. We also replace the original
8918    // constant expression with an equivalent new instruction; an instruction
8919    // allows easy modification in the following loop, because we now know the
8920    // constant (now an instruction) is owned by our target function and
8921    // replaceUsesOfWith can be invoked on it (which is not possible with
8922    // constants). A brand new instruction also lets us be cautious, since the
8923    // old expression may have been used inside the function while also being
8924    // used externally (unlikely by the nature of a Constant, but still
8925    // possible).
8926    // NOTE: We cannot remove dead constants that have been rewritten to
8927    // instructions at this stage; doing so risks breaking later lowering,
8928    // since we could still be in the process of lowering the module from MLIR
8929    // to LLVM-IR and the MLIR lowering may still require the original
8930    // constants for which we have created rewritten versions.
8931 if (auto *Const = dyn_cast<Constant>(Input))
8932 convertUsersOfConstantsToInstructions(Const, Func, false);
8933
8934 // Collect users before iterating over them to avoid invalidating the
8935 // iteration in case a user uses Input more than once (e.g. a call
8936 // instruction).
8937 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8938    // Replace the uses of Input that are inside the outlined function.
8939    for (User *User : make_early_inc_range(Users))
8940 if (auto *Instr = dyn_cast<Instruction>(User))
8941 if (Instr->getFunction() == Func)
8942 Instr->replaceUsesOfWith(Input, InputCopy);
8943 };
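// For example, if Func uses the folded constant expression
//   getelementptr inbounds ([4 x i32], ptr @g, i64 0, i64 2)
// convertUsersOfConstantsToInstructions first materialises it as a regular
// getelementptr instruction inside Func, so the loop above can rewrite its
// @g operand to the corresponding kernel argument copy.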
8944
8945 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8946
8947  // Rewrite uses of input values to parameters.
8948 for (auto InArg : zip(Inputs, ArgRange)) {
8949 Value *Input = std::get<0>(InArg);
8950 Argument &Arg = std::get<1>(InArg);
8951 Value *InputCopy = nullptr;
8952
8953 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
8954 Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
8955 OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
8956 if (!AfterIP)
8957 return AfterIP.takeError();
8958 Builder.restoreIP(*AfterIP);
8959 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8960
8961    // In certain cases a Global may be set up for replacement; however, this
8962    // Global may be used by multiple arguments to the kernel, just segmented
8963    // apart. For example, if we have a global array that is sectioned into
8964    // multiple mappings (technically not legal in OpenMP, but there is a case
8965    // in Fortran for Common Blocks where this is necessary), we will end up
8966    // with GEPs into this array inside the kernel that refer to the Global
8967    // but are, for all intents and purposes, separate arguments to the
8968    // kernel. If we have mapped a segment that requires a GEP into the 0-th
8969    // index, the GEP folds into a direct reference to the Global. If we then
8970    // encounter this folded GEP during replacement, all of the references to
8971    // the Global in the kernel will be replaced with the argument we have
8972    // generated for it, including any other GEPs that refer to the Global but
8973    // belong to other arguments. This would invalidate all of the other
8974    // preceding mapped arguments that refer to the same Global as separate
8975    // segments. To prevent this, we defer processing of Globals until all
8976    // other processing has been performed.
8977    if (llvm::isa<llvm::GlobalValue>(Input) ||
8978        llvm::isa<llvm::GlobalVariable>(Input)) {
8979      DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8980 continue;
8981 }
8982
8983    if (isa<ConstantData>(Input))
8984      continue;
8985
8986 ReplaceValue(Input, InputCopy, Func);
8987 }
8988
8989 // Replace all of our deferred Input values, currently just Globals.
8990 for (auto Deferred : DeferredReplacement)
8991 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8992
8993 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8994 ValueReplacementMap);
8995 return Func;
8996}
8997/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8998/// of pointers containing shared data between the parent task and the created
8999/// task.
9000static Value *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
9001                                               IRBuilderBase &Builder,
9002 Value *TaskWithPrivates,
9003 Type *TaskWithPrivatesTy) {
9004
9005 Type *TaskTy = OMPIRBuilder.Task;
9006 LLVMContext &Ctx = Builder.getContext();
9007 Value *TaskT =
9008 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
9009 Value *Shareds = TaskT;
9010 // TaskWithPrivatesTy can be one of the following
9011 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9012 // %struct.privates }
9013 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
9014 //
9015 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
9016 // its first member has to be the task descriptor. TaskTy is the type of the
9017 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
9018  // first member of TaskT gives us the pointer to shared data.
9019 if (TaskWithPrivatesTy != TaskTy)
9020 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
9021 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
9022}
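// Rough sketch of the IR emitted for the wrapped layout (case 1 above):
//   %task         = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %shareds.addr = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task, i32 0, i32 0
//   %shareds      = load ptr, ptr %shareds.addr
// In case 2 the second GEP is skipped and the load reads the first member of
// the task descriptor directly.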
9023/// Create an entry point for a target task. It will have the following
9024/// signature:
9025/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
9026/// This function is called from emitTargetTask once the
9027/// code to launch the target kernel has been outlined already.
9028/// NumOffloadingArrays is the number of offloading arrays that we need to copy
9029/// into the task structure so that the deferred target task can access this
9030/// data even after the stack frame of the generating task has been rolled
9031/// back. Offloading arrays contain base pointers, pointers, sizes, etc.
9032/// of the data that the target kernel will access. These in effect are the
9033/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
9034static Function *emitTargetTaskProxyFunction(
9035    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
9036 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
9037 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
9038
9039 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
9040 // This is because PrivatesTy is the type of the structure in which
9041 // we pass the offloading arrays to the deferred target task.
9042 assert((!NumOffloadingArrays || PrivatesTy) &&
9043 "PrivatesTy cannot be nullptr when there are offloadingArrays"
9044 "to privatize");
9045
9046 Module &M = OMPBuilder.M;
9047 // KernelLaunchFunction is the target launch function, i.e.
9048 // the function that sets up kernel arguments and calls
9049 // __tgt_target_kernel to launch the kernel on the device.
9050 //
9051 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
9052
9053 // StaleCI is the CallInst which is the call to the outlined
9054 // target kernel launch function. If there are local live-in values
9055 // that the outlined function uses then these are aggregated into a structure
9056 // which is passed as the second argument. If there are no local live-in
9057 // values or if all values used by the outlined kernel are global variables,
9058 // then there's only one argument, the threadID. So, StaleCI can be
9059 //
9060 // %structArg = alloca { ptr, ptr }, align 8
9061 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
9062 // store ptr %20, ptr %gep_, align 8
9063 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
9064 // store ptr %21, ptr %gep_8, align 8
9065 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
9066 //
9067 // OR
9068 //
9069 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
9071 StaleCI->getIterator());
9072
9073 LLVMContext &Ctx = StaleCI->getParent()->getContext();
9074
9075 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
9076 Type *TaskPtrTy = OMPBuilder.TaskPtr;
9077 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
9078
9079 auto ProxyFnTy =
9080 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
9081 /* isVarArg */ false);
9082 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
9083 ".omp_target_task_proxy_func",
9084 Builder.GetInsertBlock()->getModule());
9085 Value *ThreadId = ProxyFn->getArg(0);
9086 Value *TaskWithPrivates = ProxyFn->getArg(1);
9087 ThreadId->setName("thread.id");
9088 TaskWithPrivates->setName("task");
9089
9090 bool HasShareds = SharedArgsOperandNo > 0;
9091 bool HasOffloadingArrays = NumOffloadingArrays > 0;
9092 BasicBlock *EntryBB =
9093 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
9094 Builder.SetInsertPoint(EntryBB);
9095
9096 SmallVector<Value *> KernelLaunchArgs;
9097 KernelLaunchArgs.reserve(StaleCI->arg_size());
9098 KernelLaunchArgs.push_back(ThreadId);
9099
9100 if (HasOffloadingArrays) {
9101 assert(TaskTy != TaskWithPrivatesTy &&
9102 "If there are offloading arrays to pass to the target"
9103 "TaskTy cannot be the same as TaskWithPrivatesTy");
9104 (void)TaskTy;
9105 Value *Privates =
9106 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
9107 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
9108 KernelLaunchArgs.push_back(
9109 Builder.CreateStructGEP(PrivatesTy, Privates, i));
9110 }
9111
9112 if (HasShareds) {
9113 auto *ArgStructAlloca =
9114 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
9115 assert(ArgStructAlloca &&
9116 "Unable to find the alloca instruction corresponding to arguments "
9117 "for extracted function");
9118 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
9119 std::optional<TypeSize> ArgAllocSize =
9120 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9121 assert(ArgStructType && ArgAllocSize &&
9122 "Unable to determine size of arguments for extracted function");
9123 uint64_t StructSize = ArgAllocSize->getFixedValue();
9124
9125 AllocaInst *NewArgStructAlloca =
9126 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
9127
9128 Value *SharedsSize = Builder.getInt64(StructSize);
9129
9130    Value *LoadShared = loadSharedDataFromTaskDescriptor(
9131        OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
9132
9133 Builder.CreateMemCpy(
9134 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
9135 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
9136 KernelLaunchArgs.push_back(NewArgStructAlloca);
9137 }
9138 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
9139 Builder.CreateRetVoid();
9140 return ProxyFn;
9141}
9142static Type *getOffloadingArrayType(Value *V) {
9143
9144 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
9145 return GEP->getSourceElementType();
9146 if (auto *Alloca = dyn_cast<AllocaInst>(V))
9147 return Alloca->getAllocatedType();
9148
9149 llvm_unreachable("Unhandled Instruction type");
9150 return nullptr;
9151}
9152// This function returns a struct that has at most two members.
9153// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
9154// descriptor. The second member, if needed, is a struct containing arrays
9155// that need to be passed to the offloaded target kernel. For example,
9156// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
9157// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
9158// respectively, then the types created by this function are
9159//
9160// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
9161// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9162// %struct.privates }
9163// %struct.task_with_privates is returned by this function.
9164// If there aren't any offloading arrays to pass to the target kernel,
9165// %struct.kmp_task_ompbuilder_t is returned.
9166static StructType *
9167createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
9168                         ArrayRef<Value *> OffloadingArraysToPrivatize) {
9169
9170 if (OffloadingArraysToPrivatize.empty())
9171 return OMPIRBuilder.Task;
9172
9173 SmallVector<Type *, 4> StructFieldTypes;
9174 for (Value *V : OffloadingArraysToPrivatize) {
9175 assert(V->getType()->isPointerTy() &&
9176 "Expected pointer to array to privatize. Got a non-pointer value "
9177 "instead");
9178 Type *ArrayTy = getOffloadingArrayType(V);
9179 assert(ArrayTy && "ArrayType cannot be nullptr");
9180 StructFieldTypes.push_back(ArrayTy);
9181 }
9182 StructType *PrivatesStructTy =
9183 StructType::create(StructFieldTypes, "struct.privates");
9184 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
9185 "struct.task_with_privates");
9186}
9187static Error emitTargetOutlinedFunction(
9188    OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
9189    TargetRegionEntryInfo &EntryInfo,
9190    const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
9191    Function *&OutlinedFn, Constant *&OutlinedFnID,
9192    SmallVectorImpl<Value *> &Inputs,
9193    OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
9194    OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
9195
9196 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
9197 [&](StringRef EntryFnName) {
9198 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
9199 EntryFnName, Inputs, CBFunc,
9200 ArgAccessorFuncCB);
9201 };
9202
9203 return OMPBuilder.emitTargetRegionFunction(
9204 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
9205 OutlinedFnID);
9206}
9207
9208OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
9209    TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
9210    OpenMPIRBuilder::InsertPointTy AllocaIP,
9211 const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs,
9212 bool HasNoWait) {
9213
9214 // The following explains the code-gen scenario for the `target` directive. A
9215  // similar scenario is followed for other device-related directives (e.g.
9216  // `target enter data`), since in those cases we only need to emit a task
9217  // that encapsulates the proper runtime call.
9218 //
9219 // When we arrive at this function, the target region itself has been
9220 // outlined into the function OutlinedFn.
9221  // So at this point, for
9222 // --------------------------------------------------------------
9223 // void user_code_that_offloads(...) {
9224 // omp target depend(..) map(from:a) map(to:b) private(i)
9225 // do i = 1, 10
9226 // a(i) = b(i) + n
9227 // }
9228 //
9229 // --------------------------------------------------------------
9230 //
9231 // we have
9232 //
9233 // --------------------------------------------------------------
9234 //
9235 // void user_code_that_offloads(...) {
9236 // %.offload_baseptrs = alloca [2 x ptr], align 8
9237 // %.offload_ptrs = alloca [2 x ptr], align 8
9238 // %.offload_mappers = alloca [2 x ptr], align 8
9239 // ;; target region has been outlined and now we need to
9240 // ;; offload to it via a target task.
9241 // }
9242 // void outlined_device_function(ptr a, ptr b, ptr n) {
9243 // n = *n_ptr;
9244 // do i = 1, 10
9245 // a(i) = b(i) + n
9246 // }
9247 //
9248 // We have to now do the following
9249 // (i) Make an offloading call to outlined_device_function using the OpenMP
9250 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
9251 // emitted by emitKernelLaunch
9252 // (ii) Create a task entry point function that calls kernel_launch_function
9253 // and is the entry point for the target task. See
9254 // '@.omp_target_task_proxy_func in the pseudocode below.
9255 // (iii) Create a task with the task entry point created in (ii)
9256 //
9257 // That is we create the following
9258 // struct task_with_privates {
9259 // struct kmp_task_ompbuilder_t task_struct;
9260 // struct privates {
9261 // [2 x ptr] ; baseptrs
9262 // [2 x ptr] ; ptrs
9263 // [2 x i64] ; sizes
9264 // }
9265 // }
9266 // void user_code_that_offloads(...) {
9267 // %.offload_baseptrs = alloca [2 x ptr], align 8
9268 // %.offload_ptrs = alloca [2 x ptr], align 8
9269 // %.offload_sizes = alloca [2 x i64], align 8
9270 //
9271 // %structArg = alloca { ptr, ptr, ptr }, align 8
9272  //   %structArg[0] = a
9273  //   %structArg[1] = b
9274  //   %structArg[2] = &n
9275 //
9276 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
9277 // sizeof(kmp_task_ompbuilder_t),
9278 // sizeof(structArg),
9279 // @.omp_target_task_proxy_func,
9280 // ...)
9281 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
9282 // sizeof(structArg))
9283 // memcpy(target_task_with_privates->privates->baseptrs,
9284 // offload_baseptrs, sizeof(offload_baseptrs)
9285 // memcpy(target_task_with_privates->privates->ptrs,
9286 // offload_ptrs, sizeof(offload_ptrs)
9287 // memcpy(target_task_with_privates->privates->sizes,
9288 // offload_sizes, sizeof(offload_sizes)
9289 // dependencies_array = ...
9290 // ;; if nowait not present
9291 // call @__kmpc_omp_wait_deps(..., dependencies_array)
9292 // call @__kmpc_omp_task_begin_if0(...)
9293  //    call @.omp_target_task_proxy_func(i32 thread_id, ptr
9294 // %target_task_with_privates)
9295 // call @__kmpc_omp_task_complete_if0(...)
9296 // }
9297 //
9298 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
9299 // ptr %task) {
9300 // %structArg = alloca {ptr, ptr, ptr}
9301 // %task_ptr = getelementptr(%task, 0, 0)
9302 // %shared_data = load (getelementptr %task_ptr, 0, 0)
9303  //   memcpy(%structArg, %shared_data, sizeof(%structArg))
9304 //
9305 // %offloading_arrays = getelementptr(%task, 0, 1)
9306 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
9307 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
9308 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
9309 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
9310 // %offload_sizes, %structArg)
9311 // }
9312 //
9313 // We need the proxy function because the signature of the task entry point
9314 // expected by kmpc_omp_task is always the same and will be different from
9315 // that of the kernel_launch function.
9316 //
9317 // kernel_launch_function is generated by emitKernelLaunch and has the
9318 // always_inline attribute. For this example, it'll look like so:
9319 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
9320 // %offload_sizes, %structArg) alwaysinline {
9321 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
9322 // ; load aggregated data from %structArg
9323 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
9324 // ; offload_sizes
9325 // call i32 @__tgt_target_kernel(...,
9326 // outlined_device_function,
9327 // ptr %kernel_args)
9328 // }
9329 // void outlined_device_function(ptr a, ptr b, ptr n) {
9330 // n = *n_ptr;
9331 // do i = 1, 10
9332 // a(i) = b(i) + n
9333 // }
9334 //
9335 BasicBlock *TargetTaskBodyBB =
9336 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
9337 BasicBlock *TargetTaskAllocaBB =
9338 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
9339
9340 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
9341 TargetTaskAllocaBB->begin());
9342 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
9343
9344 auto OI = std::make_unique<OutlineInfo>();
9345 OI->EntryBB = TargetTaskAllocaBB;
9346 OI->OuterAllocBB = AllocaIP.getBlock();
9347
9348 // Add the thread ID argument.
9350 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9351 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
9352
9353 // Generate the task body which will subsequently be outlined.
9354 Builder.restoreIP(TargetTaskBodyIP);
9355 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
9356 return Err;
9357
9358  // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
9359 // it is given. These blocks are enumerated by
9360 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
9361 // to be outside the region. In other words, OI.ExitBlock is expected to be
9362 // the start of the region after the outlining. We used to set OI.ExitBlock
9363 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
9364 // except when the task body is a single basic block. In that case,
9365 // OI.ExitBlock is set to the single task body block and will get left out of
9366 // the outlining process. So, simply create a new empty block to which we
9367  // unconditionally branch from where TaskBodyCB left off.
9368 OI->ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9369 emitBlock(OI->ExitBB, Builder.GetInsertBlock()->getParent(),
9370 /*IsFinished=*/true);
9371
9372 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9373 bool NeedsTargetTask = HasNoWait && DeviceID;
9374 if (NeedsTargetTask) {
9375 for (auto *V :
9376 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9377 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9378 RTArgs.SizesArray}) {
9379      if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
9380        OffloadingArraysToPrivatize.push_back(V);
9381 OI->ExcludeArgsFromAggregate.push_back(V);
9382 }
9383 }
9384 }
9385 OI->PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9386 DeviceID, OffloadingArraysToPrivatize](
9387 Function &OutlinedFn) mutable {
9388 assert(OutlinedFn.hasOneUse() &&
9389 "there must be a single user for the outlined function");
9390
9391 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9392
9393 // The first argument of StaleCI is always the thread id.
9394 // The next few arguments are the pointers to offloading arrays
9395 // if any. (see OffloadingArraysToPrivatize)
9396 // Finally, all other local values that are live-in into the outlined region
9397 // end up in a structure whose pointer is passed as the last argument. This
9398 // piece of data is passed in the "shared" field of the task structure. So,
9399 // we know we have to pass shareds to the task if the number of arguments is
9400    // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
9401    // thread id. Further, for safety, we assert that the number of arguments of
9402    // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2.
9403 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9404 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9405 assert((!HasShareds ||
9406 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9407 "Wrong number of arguments for StaleCI when shareds are present");
9408 int SharedArgOperandNo =
9409 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9410
9411 StructType *TaskWithPrivatesTy =
9412 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9413 StructType *PrivatesTy = nullptr;
9414
9415 if (!OffloadingArraysToPrivatize.empty())
9416 PrivatesTy =
9417 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9418
9419    Function *ProxyFn = emitTargetTaskProxyFunction(
9420        *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9421 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9422
9423 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9424 << "\n");
9425
9426 Builder.SetInsertPoint(StaleCI);
9427
9428 // Gather the arguments for emitting the runtime call.
9429 uint32_t SrcLocStrSize;
9430 Constant *SrcLocStr =
9431        getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
9432    Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9433
9434 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9435 //
9436 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9437 // the DeviceID to the deferred task and also since
9438 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9439 Function *TaskAllocFn =
9440 !NeedsTargetTask
9441 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9442            : getOrCreateRuntimeFunctionPtr(
9443                  OMPRTL___kmpc_omp_target_task_alloc);
9444
9445 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9446 // call.
9447 Value *ThreadID = getOrCreateThreadID(Ident);
9448
9449 // Argument - `sizeof_kmp_task_t` (TaskSize)
9450    // TaskSize refers to the size in bytes of the kmp_task_t data structure
9451 // plus any other data to be passed to the target task, if any, which
9452 // is packed into a struct. kmp_task_t and the struct so created are
9453 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9454 Value *TaskSize = Builder.getInt64(
9455 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9456
9457 // Argument - `sizeof_shareds` (SharedsSize)
9458 // SharedsSize refers to the shareds array size in the kmp_task_t data
9459 // structure.
9460 Value *SharedsSize = Builder.getInt64(0);
9461 if (HasShareds) {
9462 auto *ArgStructAlloca =
9463 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9464 assert(ArgStructAlloca &&
9465 "Unable to find the alloca instruction corresponding to arguments "
9466 "for extracted function");
9467 std::optional<TypeSize> ArgAllocSize =
9468 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9469 assert(ArgAllocSize &&
9470 "Unable to determine size of arguments for extracted function");
9471 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9472 }
9473
9474 // Argument - `flags`
9475 // Task is tied iff (Flags & 1) == 1.
9476 // Task is untied iff (Flags & 1) == 0.
9477 // Task is final iff (Flags & 2) == 2.
9478 // Task is not final iff (Flags & 2) == 0.
9479 // A target task is not final and is untied.
9480 Value *Flags = Builder.getInt32(0);
9481
9482 // Emit the @__kmpc_omp_task_alloc runtime call
9483 // The runtime call returns a pointer to an area where the task captured
9484 // variables must be copied before the task is run (TaskData)
9485 CallInst *TaskData = nullptr;
9486
9487 SmallVector<llvm::Value *> TaskAllocArgs = {
9488 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9489 /*flags=*/Flags,
9490 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9491 /*task_func=*/ProxyFn};
9492
9493 if (NeedsTargetTask) {
9494 assert(DeviceID && "Expected non-empty device ID.");
9495 TaskAllocArgs.push_back(DeviceID);
9496 }
9497
9498 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9499
9500 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9501 if (HasShareds) {
9502 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9503      Value *TaskShareds = loadSharedDataFromTaskDescriptor(
9504          *this, Builder, TaskData, TaskWithPrivatesTy);
9505 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9506 SharedsSize);
9507 }
9508 if (!OffloadingArraysToPrivatize.empty()) {
9509 Value *Privates =
9510 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9511 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9512 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9513 [[maybe_unused]] Type *ArrayType =
9514 getOffloadingArrayType(PtrToPrivatize);
9515 assert(ArrayType && "ArrayType cannot be nullptr");
9516
9517 Type *ElementType = PrivatesTy->getElementType(i);
9518 assert(ElementType == ArrayType &&
9519 "ElementType should match ArrayType");
9520 (void)ArrayType;
9521
9522 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9523 Builder.CreateMemCpy(
9524 Dst, Alignment, PtrToPrivatize, Alignment,
9525 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9526 }
9527 }
9528
9529 Value *DepArray = nullptr;
9530 Value *NumDeps = nullptr;
9531 if (Dependencies.DepArray) {
9532 DepArray = Dependencies.DepArray;
9533 NumDeps = Dependencies.NumDeps;
9534 } else if (!Dependencies.Deps.empty()) {
9535 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
9536 NumDeps = Builder.getInt32(Dependencies.Deps.size());
9537 }
9538
9539 // ---------------------------------------------------------------
9540 // V5.2 13.8 target construct
9541 // If the nowait clause is present, execution of the target task
9542 // may be deferred. If the nowait clause is not present, the target task is
9543 // an included task.
9544 // ---------------------------------------------------------------
9545 // The above means that the lack of a nowait on the target construct
9546 // translates to '#pragma omp task if(0)'
9547 if (!NeedsTargetTask) {
9548 if (DepArray) {
9549 Function *TaskWaitFn =
9550 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9551        createRuntimeFunctionCall(
9552            TaskWaitFn,
9553 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9554 /*ndeps=*/NumDeps,
9555 /*dep_list=*/DepArray,
9556 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9557 /*noalias_dep_list=*/
9558             ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
9559      }
9560 // Included task.
9561 Function *TaskBeginFn =
9562 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9563 Function *TaskCompleteFn =
9564 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9565 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9566 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9567 CI->setDebugLoc(StaleCI->getDebugLoc());
9568 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9569 } else if (DepArray) {
9570 // HasNoWait - meaning the task may be deferred. Call
9571 // __kmpc_omp_task_with_deps if there are dependencies,
9572 // else call __kmpc_omp_task
9573 Function *TaskFn =
9574 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9576 TaskFn,
9577 {Ident, ThreadID, TaskData, NumDeps, DepArray,
9578 ConstantInt::get(Builder.getInt32Ty(), 0),
9579           ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
9580    } else {
9581 // Emit the @__kmpc_omp_task runtime call to spawn the task
9582 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9583 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9584 }
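// In summary, the runtime sequence emitted above is, as a sketch:
//   included task (no nowait):
//     [__kmpc_omp_wait_deps]              ; only with depend clauses
//     __kmpc_omp_task_begin_if0
//     .omp_target_task_proxy_func(tid, task)
//     __kmpc_omp_task_complete_if0
//   deferrable task (nowait) with dependencies:    __kmpc_omp_task_with_deps
//   deferrable task (nowait) without dependencies: __kmpc_omp_task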
9585
9586 StaleCI->eraseFromParent();
9587 for (Instruction *I : llvm::reverse(ToBeDeleted))
9588 I->eraseFromParent();
9589 };
9590 addOutlineInfo(std::move(OI));
9591
9592 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9593 << *(Builder.GetInsertBlock()) << "\n");
9594 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9595 << *(Builder.GetInsertBlock()->getParent()->getParent())
9596 << "\n");
9597 return Builder.saveIP();
9598}
9599
9600Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
9601    InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9602 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9603 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9604 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9605 if (Error Err =
9606 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9607 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9608 return Err;
9609 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9610 return Error::success();
9611}
9612
9613static void emitTargetCall(
9614 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9615    OpenMPIRBuilder::InsertPointTy AllocaIP, ArrayRef<BasicBlock *> DeallocBlocks,
9616    OpenMPIRBuilder::TargetDataInfo &Info,
9617    const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
9618    const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
9619    Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9620    SmallVectorImpl<Value *> &Args,
9621    OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
9622    OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
9623    const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait,
9624 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9625 // Generate a function call to the host fallback implementation of the target
9626 // region. This is called by the host when no offload entry was generated for
9627 // the target region and when the offloading call fails at runtime.
9628 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9629      -> OpenMPIRBuilder::InsertPointOrErrorTy {
9630    Builder.restoreIP(IP);
9631 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9632 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9633 FallbackArgs.push_back(
9634 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9635 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9636 return Builder.saveIP();
9637 };
9638
9639 bool HasDependencies = !Dependencies.empty();
9640 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9641
9642  OpenMPIRBuilder::TargetKernelArgs KArgs;
9643
9644 auto TaskBodyCB =
9645 [&](Value *DeviceID, Value *RTLoc,
9646 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9647 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9648 // produce any.
9649    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9650      // emitKernelLaunch makes the necessary runtime call to offload the
9651 // kernel. We then outline all that code into a separate function
9652 // ('kernel_launch_function' in the pseudo code above). This function is
9653 // then called by the target task proxy function (see
9654      // '@.omp_target_task_proxy_func' in the pseudo code above).
9655      // '@.omp_target_task_proxy_func' is generated by
9656 // emitTargetTaskProxyFunction.
9657 if (OutlinedFnID && DeviceID)
9658 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9659 EmitTargetCallFallbackCB, KArgs,
9660 DeviceID, RTLoc, TargetTaskAllocaIP);
9661
9662 // We only need to do the outlining if `DeviceID` is set to avoid calling
9663 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9664 // generating the `else` branch of an `if` clause.
9665 //
9666 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9667 // In this case, we execute the host implementation directly.
9668 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9669 }());
9670
9671 OMPBuilder.Builder.restoreIP(AfterIP);
9672 return Error::success();
9673 };
9674
9675 auto &&EmitTargetCallElse =
9676 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9677          OpenMPIRBuilder::InsertPointTy CodeGenIP,
9678          ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9679 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9680 // produce any.
9681    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9682      if (RequiresOuterTargetTask) {
9683 // Arguments that are intended to be directly forwarded to an
9684        // emitKernelLaunch call are passed as nullptr, since
9685        // OutlinedFnID=nullptr results in that call not being done.
9686        OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
9687 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9688 /*RTLoc=*/nullptr, AllocaIP,
9689 Dependencies, EmptyRTArgs, HasNoWait);
9690 }
9691 return EmitTargetCallFallbackCB(Builder.saveIP());
9692 }());
9693
9694 Builder.restoreIP(AfterIP);
9695 return Error::success();
9696 };
9697
9698 auto &&EmitTargetCallThen =
9699 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9700          OpenMPIRBuilder::InsertPointTy CodeGenIP,
9701          ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9702 Info.HasNoWait = HasNoWait;
9703 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9704
9705    OpenMPIRBuilder::TargetDataRTArgs RTArgs;
9706    if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9707 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9708 /*IsNonContiguous=*/true,
9709 /*ForEndCall=*/false))
9710 return Err;
9711
9712 SmallVector<Value *, 3> NumTeamsC;
9713 for (auto [DefaultVal, RuntimeVal] :
9714 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9715 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9716 : Builder.getInt32(DefaultVal));
9717
9718 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9719 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9720 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9721 if (Clause)
9722 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9723 /*isSigned=*/false);
9724 return Clause;
9725 };
9726 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9727 if (Clause)
9728 Result =
9729 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9730 Result, Clause)
9731 : Clause;
9732 };
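// For example, with thread_limit(8) and num_threads(4) both present, the
// emitted select computes min(8, 4) = 4; if neither clause is given, the
// value stays null and 0 ("no limit requested") is pushed below.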
9733
9734 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9735    // the NUM_THREADS clause is overridden by THREAD_LIMIT.
9736 SmallVector<Value *, 3> NumThreadsC;
9737 Value *MaxThreadsClause =
9738 RuntimeAttrs.TeamsThreadLimit.size() == 1
9739 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9740 : nullptr;
9741
9742 for (auto [TeamsVal, TargetVal] : zip_equal(
9743 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9744 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9745 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9746
9747 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9748 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9749
9750 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9751 }
9752
9753 unsigned NumTargetItems = Info.NumberOfPtrs;
9754 uint32_t SrcLocStrSize;
9755 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9756 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9757 llvm::omp::IdentFlag(0), 0);
9758
9759 Value *TripCount = RuntimeAttrs.LoopTripCount
9760 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9761 Builder.getInt64Ty(),
9762 /*isSigned=*/false)
9763 : Builder.getInt64(0);
9764
9765 // Request zero groupprivate bytes by default.
9766 if (!DynCGroupMem)
9767 DynCGroupMem = Builder.getInt32(0);
9768
9769    KArgs = OpenMPIRBuilder::TargetKernelArgs(
9770        NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9771 HasNoWait, DynCGroupMemFallback);
9772
9773 // Assume no error was returned because TaskBodyCB and
9774 // EmitTargetCallFallbackCB don't produce any.
9775    OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9776      // The presence of certain clauses on the target directive requires the
9777 // explicit generation of the target task.
9778 if (RequiresOuterTargetTask)
9779 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9780 RTLoc, AllocaIP, Dependencies,
9781 KArgs.RTArgs, Info.HasNoWait);
9782
9783 return OMPBuilder.emitKernelLaunch(
9784 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9785 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9786 }());
9787
9788 Builder.restoreIP(AfterIP);
9789 return Error::success();
9790 };
9791
9792 // If we don't have an ID for the target region, it means an offload entry
9793 // wasn't created. In this case we just run the host fallback directly and
9794 // ignore any potential 'if' clauses.
9795 if (!OutlinedFnID) {
9796 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP(), DeallocBlocks));
9797 return;
9798 }
9799
9800 // If there's no 'if' clause, only generate the kernel launch code path.
9801 if (!IfCond) {
9802 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP(), DeallocBlocks));
9803 return;
9804 }
9805
9806 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9807 EmitTargetCallElse, AllocaIP));
9808}
9809
9810OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
9811    const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9812 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
9813 TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo,
9814 const TargetKernelDefaultAttrs &DefaultAttrs,
9815 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9816 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9817    TargetBodyGenCallbackTy CBFunc,
9818    TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
9819    CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies,
9820 bool HasNowait, Value *DynCGroupMem,
9821 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9822
9823 if (!updateToLocation(Loc))
9824 return InsertPointTy();
9825
9826 Builder.restoreIP(CodeGenIP);
9827
9828 Function *OutlinedFn;
9829 Constant *OutlinedFnID = nullptr;
9830 // The target region is outlined into its own function. The LLVM IR for
9831 // the target region itself is generated using the callbacks CBFunc
9832 // and ArgAccessorFuncCB
9833  if (Error Err = emitTargetOutlinedFunction(
9834          *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9835 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9836 return Err;
9837
9838 // If we are not on the target device, then we need to generate code
9839 // to make a remote call (offload) to the previously outlined function
9840 // that represents the target region. Do that now.
9841 if (!Config.isTargetDevice())
9842 emitTargetCall(*this, Builder, AllocaIP, DeallocBlocks, Info, DefaultAttrs,
9843 RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs,
9844 GenMapInfoCB, CustomMapperCB, Dependencies, HasNowait,
9845 DynCGroupMem, DynCGroupMemFallback);
9846 return Builder.saveIP();
9847}
9848
9849std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9850 StringRef FirstSeparator,
9851 StringRef Separator) {
9852 SmallString<128> Buffer;
9853 llvm::raw_svector_ostream OS(Buffer);
9854 StringRef Sep = FirstSeparator;
9855 for (StringRef Part : Parts) {
9856 OS << Sep << Part;
9857 Sep = Separator;
9858 }
9859 return OS.str().str();
9860}
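// For example, getNameWithSeparators({"gomp_critical_user_foo", "var"}, ".", ".")
// yields ".gomp_critical_user_foo.var", which is how the critical region lock
// name is formed below.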
9861
9862std::string
9863OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
9864  return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9865 Config.separator());
9866}
9867
9868GlobalValue *OpenMPIRBuilder::getOrCreateInternalVariable(
9869    Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9870 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9871 if (Elem.second) {
9872 assert(Elem.second->getValueType() == Ty &&
9873 "OMP internal variable has different type than requested");
9874 } else {
9875 // TODO: investigate the appropriate linkage type used for the global
9876 // variable for possibly changing that to internal or private, or maybe
9877 // create different versions of the function for different OMP internal
9878 // variables.
9879 const DataLayout &DL = M.getDataLayout();
9880 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9881 // default global AS is 1.
9882 // See double-target-call-with-declare-target.f90 and
9883 // declare-target-vars-in-target-region.f90 libomptarget
9884 // tests.
9885 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9886 : M.getTargetTriple().isAMDGPU()
9887 ? 0
9888 : DL.getDefaultGlobalsAddressSpace();
9889 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9890                       ? GlobalValue::InternalLinkage
9891                       : GlobalValue::CommonLinkage;
9892    auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9893 Constant::getNullValue(Ty), Elem.first(),
9894 /*InsertBefore=*/nullptr,
9895 GlobalValue::NotThreadLocal, AddressSpaceVal);
9896 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9897 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9898 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9899 Elem.second = GV;
9900 }
9901
9902 return Elem.second;
9903}
9904
9905Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9906 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9907 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9908 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9909}
9910
9911Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
9912  LLVMContext &Ctx = Builder.getContext();
9913 Value *Null =
9914 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9915 Value *SizeGep =
9916 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9917 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9918 return SizePtrToInt;
9919}
9920
9921GlobalVariable *
9922OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
9923                                       std::string VarName) {
9924 llvm::Constant *MaptypesArrayInit =
9925 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9926 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9927 M, MaptypesArrayInit->getType(),
9928 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9929 VarName);
9930 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9931 return MaptypesArrayGlobal;
9932}
9933
9934void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
9935                                          InsertPointTy AllocaIP,
9936 unsigned NumOperands,
9937 struct MapperAllocas &MapperAllocas) {
9938 if (!updateToLocation(Loc))
9939 return;
9940
9941 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9942 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9943 Builder.restoreIP(AllocaIP);
9944 AllocaInst *ArgsBase = Builder.CreateAlloca(
9945 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9946 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9947 ".offload_ptrs");
9948 AllocaInst *ArgSizes = Builder.CreateAlloca(
9949 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9951 MapperAllocas.ArgsBase = ArgsBase;
9952 MapperAllocas.Args = Args;
9953 MapperAllocas.ArgSizes = ArgSizes;
9954}
9955
9956void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
9957                                     Function *MapperFunc, Value *SrcLocInfo,
9958                                     Value *MaptypesArg, Value *MapnamesArg,
9959                                     struct MapperAllocas &MapperAllocas,
9960                                     int64_t DeviceID, unsigned NumOperands) {
9961 if (!updateToLocation(Loc))
9962 return;
9963
9964 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9965 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9966 Value *ArgsBaseGEP =
9967 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9968 {Builder.getInt32(0), Builder.getInt32(0)});
9969 Value *ArgsGEP =
9970 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9971 {Builder.getInt32(0), Builder.getInt32(0)});
9972 Value *ArgSizesGEP =
9973 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9974 {Builder.getInt32(0), Builder.getInt32(0)});
9975 Value *NullPtr =
9976 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9977 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9978 Builder.getInt32(NumOperands),
9979 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9980 MaptypesArg, MapnamesArg, NullPtr});
9981}
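// Sketch of the emitted call, assuming NumOperands == 2 and a MapperFunc such
// as __tgt_target_data_begin_mapper:
//   call void @__tgt_target_data_begin_mapper(ptr %srcloc, i64 <DeviceID>, i32 2,
//       ptr %.offload_baseptrs, ptr %.offload_ptrs, ptr %.offload_sizes,
//       ptr %maptypes, ptr %mapnames, ptr null)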
9982
9983void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
9984                                                   TargetDataRTArgs &RTArgs,
9985 TargetDataInfo &Info,
9986 bool ForEndCall) {
9987 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9988 "expected region end call to runtime only when end call is separate");
9989 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9990 auto VoidPtrTy = UnqualPtrTy;
9991 auto VoidPtrPtrTy = UnqualPtrTy;
9992 auto Int64Ty = Type::getInt64Ty(M.getContext());
9993 auto Int64PtrTy = UnqualPtrTy;
9994
9995 if (!Info.NumberOfPtrs) {
9996 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9997 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9998 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9999 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
10000 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10001 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10002 return;
10003 }
10004
10005 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
10006 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
10007 Info.RTArgs.BasePointersArray,
10008 /*Idx0=*/0, /*Idx1=*/0);
10009 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
10010 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
10011 /*Idx0=*/0,
10012 /*Idx1=*/0);
10013 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
10014 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10015 /*Idx0=*/0, /*Idx1=*/0);
10016 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
10017 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
10018 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
10019 : Info.RTArgs.MapTypesArray,
10020 /*Idx0=*/0,
10021 /*Idx1=*/0);
10022
10023 // Only emit the mapper information arrays if debug information is
10024 // requested.
10025 if (!Info.EmitDebug)
10026 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10027 else
10028 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
10029 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
10030 /*Idx0=*/0,
10031 /*Idx1=*/0);
10032 // If there is no user-defined mapper, set the mapper array to nullptr to
10033 // avoid an unnecessary data privatization
10034 if (!Info.HasMapper)
10035 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10036 else
10037 RTArgs.MappersArray =
10038 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
10039}
10040
10041void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
10042                                                  InsertPointTy CodeGenIP,
10043 MapInfosTy &CombinedInfo,
10044 TargetDataInfo &Info) {
10045  MapInfosTy::StructNonContiguousInfo &NonContigInfo =
10046      CombinedInfo.NonContigInfo;
10047
10048 // Build an array of struct descriptor_dim and then assign it to
10049 // offload_args.
10050 //
10051 // struct descriptor_dim {
10052 // uint64_t offset;
10053 // uint64_t count;
10054 // uint64_t stride
10055 // };
10056 Type *Int64Ty = Builder.getInt64Ty();
10057  StructType *DimTy = StructType::create(
10058      M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
10059 "struct.descriptor_dim");
10060
10061 enum { OffsetFD = 0, CountFD, StrideFD };
10062  // We need two index variables here since the size of "Dims" is the same as
10063 // the size of Components, however, the size of offset, count, and stride is
10064 // equal to the size of base declaration that is non-contiguous.
10065 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
10066    // Skip emitting IR if dimension size is 1 since it cannot be
10067 // non-contiguous.
10068 if (NonContigInfo.Dims[I] == 1)
10069 continue;
10070 Builder.restoreIP(AllocaIP);
10071 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
10072 AllocaInst *DimsAddr =
10073 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
10074 Builder.restoreIP(CodeGenIP);
10075 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
10076 unsigned RevIdx = EE - II - 1;
10077 Value *DimsLVal = Builder.CreateInBoundsGEP(
10078 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
10079 // Offset
10080 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
10081 Builder.CreateAlignedStore(
10082 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
10083 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
10084 // Count
10085 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
10086 Builder.CreateAlignedStore(
10087 NonContigInfo.Counts[L][RevIdx], CountLVal,
10088 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10089 // Stride
10090 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
10091 Builder.CreateAlignedStore(
10092 NonContigInfo.Strides[L][RevIdx], StrideLVal,
10093 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10094 }
10095 // args[I] = &dims
10096 Builder.restoreIP(CodeGenIP);
10097 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
10098 DimsAddr, Builder.getPtrTy());
10099 Value *P = Builder.CreateConstInBoundsGEP2_32(
10100 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
10101 Info.RTArgs.PointersArray, 0, I);
10102 Builder.CreateAlignedStore(
10103 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
10104 ++L;
10105 }
10106}
10107
10108void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
10109 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
10110 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
10111 BasicBlock *ExitBB, bool IsInit) {
10112 StringRef Prefix = IsInit ? ".init" : ".del";
10113
10114 // Evaluate if this is an array section.
10116 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
10117 Value *IsArray =
10118 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
10119 Value *DeleteBit = Builder.CreateAnd(
10120 MapType,
10121 Builder.getInt64(
10122 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10123 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
10124 Value *DeleteCond;
10125 Value *Cond;
10126 if (IsInit) {
10127 // base != begin?
10128 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
10129 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
10130 DeleteCond = Builder.CreateIsNull(
10131 DeleteBit,
10132 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10133 } else {
10134 Cond = IsArray;
10135 DeleteCond = Builder.CreateIsNotNull(
10136 DeleteBit,
10137 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10138 }
10139 Cond = Builder.CreateAnd(Cond, DeleteCond);
10140 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
10141
10142 emitBlock(BodyBB, MapperFn);
10143 // Get the array size by multiplying element size and element number (i.e., \p
10144 // Size).
10145 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
10146 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
10147 // memory allocation/deletion purpose only.
10148 Value *MapTypeArg = Builder.CreateAnd(
10149 MapType,
10150 Builder.getInt64(
10151 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10152 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10153 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10154 MapTypeArg = Builder.CreateOr(
10155 MapTypeArg,
10156 Builder.getInt64(
10157 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10158 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
10159
10160 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10161 // data structure.
10162 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
10163 ArraySize, MapTypeArg, MapName};
10164  createRuntimeFunctionCall(
10165      getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10166 OffloadingArgs);
10167}
10168
10169Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
10170    function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
10171                                   llvm::Value *BeginArg)>
10172 GenMapInfoCB,
10173 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
10174 SmallVector<Type *> Params;
10175 Params.emplace_back(Builder.getPtrTy());
10176 Params.emplace_back(Builder.getPtrTy());
10177 Params.emplace_back(Builder.getPtrTy());
10178 Params.emplace_back(Builder.getInt64Ty());
10179 Params.emplace_back(Builder.getInt64Ty());
10180 Params.emplace_back(Builder.getPtrTy());
10181
10182 auto *FnTy =
10183 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
10184
10185 SmallString<64> TyStr;
10186 raw_svector_ostream Out(TyStr);
10188  Function *MapperFn = Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, &M);
10189 MapperFn->addFnAttr(Attribute::NoInline);
10190 MapperFn->addFnAttr(Attribute::NoUnwind);
10191 MapperFn->addParamAttr(0, Attribute::NoUndef);
10192 MapperFn->addParamAttr(1, Attribute::NoUndef);
10193 MapperFn->addParamAttr(2, Attribute::NoUndef);
10194 MapperFn->addParamAttr(3, Attribute::NoUndef);
10195 MapperFn->addParamAttr(4, Attribute::NoUndef);
10196 MapperFn->addParamAttr(5, Attribute::NoUndef);
10197
10198 // Start the mapper function code generation.
10199 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
10200 auto SavedIP = Builder.saveIP();
10201 Builder.SetInsertPoint(EntryBB);
10202
10203 Value *MapperHandle = MapperFn->getArg(0);
10204 Value *BaseIn = MapperFn->getArg(1);
10205 Value *BeginIn = MapperFn->getArg(2);
10206 Value *Size = MapperFn->getArg(3);
10207 Value *MapType = MapperFn->getArg(4);
10208 Value *MapName = MapperFn->getArg(5);
10209
10210 // Compute the starting and end addresses of array elements.
10211 // Prepare common arguments for array initiation and deletion.
10212 // Convert the size in bytes into the number of array elements.
10213 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
10214 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
10215 Value *PtrBegin = BeginIn;
10216 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
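// For instance, with ElemTy == i32 (store size 4) and an incoming Size of 40
// bytes, Size becomes 10 elements here and PtrEnd points one past the tenth
// element, giving the mapping loop below its bounds.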
10217
10218 // Emit array initiation if this is an array section and \p MapType indicates
10219 // that memory allocation is required.
10220 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
10221 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10222 MapType, MapName, ElementSize, HeadBB,
10223 /*IsInit=*/true);
10224
10225  // Emit a for loop to iterate through Size elements and map all of them.
10226
10227 // Emit the loop header block.
10228 emitBlock(HeadBB, MapperFn);
10229 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
10230 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
10231 // Evaluate whether the initial condition is satisfied.
10232 Value *IsEmpty =
10233 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
10234 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
10235
10236 // Emit the loop body block.
10237 emitBlock(BodyBB, MapperFn);
10238 BasicBlock *LastBB = BodyBB;
10239 PHINode *PtrPHI =
10240 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
10241 PtrPHI->addIncoming(PtrBegin, HeadBB);
10242
10243 // Get map clause information. Fill up the arrays with all mapped variables.
10244 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
10245 if (!Info)
10246 return Info.takeError();
10247
10248 // Call the runtime API __tgt_mapper_num_components to get the number of
10249 // pre-existing components.
10250 Value *OffloadingArgs[] = {MapperHandle};
10251 Value *PreviousSize = createRuntimeFunctionCall(
10252 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
10253 OffloadingArgs);
10254 Value *ShiftedPreviousSize =
10255 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
10256
10257 // Fill up the runtime mapper handle for all components.
10258 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
10259 Value *CurBaseArg = Info->BasePointers[I];
10260 Value *CurBeginArg = Info->Pointers[I];
10261 Value *CurSizeArg = Info->Sizes[I];
10262 Value *CurNameArg = Info->Names.size()
10263 ? Info->Names[I]
10264 : Constant::getNullValue(Builder.getPtrTy());
10265
10266 // Extract the MEMBER_OF field from the map type.
10267 Value *OriMapType = Builder.getInt64(
10268 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10269 Info->Types[I]));
10270 Value *MemberMapType =
10271 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
10272
10273 // Combine the map type inherited from user-defined mapper with that
10274 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
10275 // bits of the \a MapType, which is the input argument of the mapper
10276 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
10277 // bits of MemberMapType.
10278 // [OpenMP 5.0], 1.2.6. map-type decay.
10279 // | alloc | to | from | tofrom | release | delete
10280 // ----------------------------------------------------------
10281 // alloc | alloc | alloc | alloc | alloc | release | delete
10282 // to | alloc | to | alloc | to | release | delete
10283 // from | alloc | alloc | from | from | release | delete
10284 // tofrom | alloc | to | from | tofrom | release | delete
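// For example (reading the table above), if the mapper is invoked with only
// OMP_MAP_TO set while a member was declared 'tofrom', the member decays to
// 'to', i.e. the OMP_MAP_FROM bit is cleared from MemberMapType below.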
10285 Value *LeftToFrom = Builder.CreateAnd(
10286 MapType,
10287 Builder.getInt64(
10288 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10289 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10290 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10291 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
10292 BasicBlock *AllocElseBB =
10293 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
10294 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
10295 BasicBlock *ToElseBB =
10296 BasicBlock::Create(M.getContext(), "omp.type.to.else");
10297 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
10298 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
10299 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
10300 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
10301 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
10302 emitBlock(AllocBB, MapperFn);
10303 Value *AllocMapType = Builder.CreateAnd(
10304 MemberMapType,
10305 Builder.getInt64(
10306 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10307 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10308 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10309 Builder.CreateBr(EndBB);
10310 emitBlock(AllocElseBB, MapperFn);
10311 Value *IsTo = Builder.CreateICmpEQ(
10312 LeftToFrom,
10313 Builder.getInt64(
10314 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10315 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10316 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
10317 // In case of to, clear OMP_MAP_FROM.
10318 emitBlock(ToBB, MapperFn);
10319 Value *ToMapType = Builder.CreateAnd(
10320 MemberMapType,
10321 Builder.getInt64(
10322 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10323 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10324 Builder.CreateBr(EndBB);
10325 emitBlock(ToElseBB, MapperFn);
10326 Value *IsFrom = Builder.CreateICmpEQ(
10327 LeftToFrom,
10328 Builder.getInt64(
10329 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10330 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10331 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
10332 // In case of from, clear OMP_MAP_TO.
10333 emitBlock(FromBB, MapperFn);
10334 Value *FromMapType = Builder.CreateAnd(
10335 MemberMapType,
10336 Builder.getInt64(
10337 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10338 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10339 // In case of tofrom, do nothing.
10340 emitBlock(EndBB, MapperFn);
10341 LastBB = EndBB;
10342 PHINode *CurMapType =
10343 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
10344 CurMapType->addIncoming(AllocMapType, AllocBB);
10345 CurMapType->addIncoming(ToMapType, ToBB);
10346 CurMapType->addIncoming(FromMapType, FromBB);
10347 CurMapType->addIncoming(MemberMapType, ToElseBB);
10348
10349 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
10350 CurSizeArg, CurMapType, CurNameArg};
10351
10352 auto ChildMapperFn = CustomMapperCB(I);
10353 if (!ChildMapperFn)
10354 return ChildMapperFn.takeError();
10355 if (*ChildMapperFn) {
10356 // Call the corresponding mapper function.
10357 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
10358 ->setDoesNotThrow();
10359 } else {
10360 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10361 // data structure.
10362 createRuntimeFunctionCall(
10363 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10364 OffloadingArgs);
10365 }
10366 }
10367
10368 // Update the pointer to point to the next element that needs to be mapped,
10369 // and check whether we have mapped all elements.
10370 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
10371 "omp.arraymap.next");
10372 PtrPHI->addIncoming(PtrNext, LastBB);
10373 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
10374 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
10375 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
10376
10377 emitBlock(ExitBB, MapperFn);
10378 // Emit array deletion if this is an array section and \p MapType indicates
10379 // that deletion is required.
10380 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10381 MapType, MapName, ElementSize, DoneBB,
10382 /*IsInit=*/false);
10383
10384 // Emit the function exit block.
10385 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10386
10387 Builder.CreateRetVoid();
10388 Builder.restoreIP(SavedIP);
10389 return MapperFn;
10390}
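// Schematically, the generated mapper function looks roughly like this (block
// names as created above; the exact IR depends on the mapped type):
//   omp.arraymap.head: skip the loop when begin == end
//   omp.arraymap.body: decay/combine map types, then call either a child
//                      mapper or __tgt_push_mapper_component per component
//   omp.arraymap.exit / omp.done: optional array deletion, then return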
10391
10392Error OpenMPIRBuilder::emitOffloadingArrays(
10393 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10394 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10395 bool IsNonContiguous,
10396 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10397
10398 // Reset the array information.
10399 Info.clearArrayInfo();
10400 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10401
10402 if (Info.NumberOfPtrs == 0)
10403 return Error::success();
10404
10405 Builder.restoreIP(AllocaIP);
10406 // Detect if we have any capture size requiring runtime evaluation of the
10407 // size so that a constant array could be eventually used.
10408 ArrayType *PointerArrayType =
10409 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10410
10411 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10412 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10413
10414 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10415 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10416 AllocaInst *MappersArray = Builder.CreateAlloca(
10417 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10418 Info.RTArgs.MappersArray = MappersArray;
10419
10420 // If we don't have any VLA types or other types that require runtime
10421 // evaluation, we can use a constant array for the map sizes, otherwise we
10422 // need to fill up the arrays as we do for the pointers.
10423 Type *Int64Ty = Builder.getInt64Ty();
10424 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10425 ConstantInt::get(Int64Ty, 0));
10426 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10427 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10428 bool IsNonContigEntry =
10429 IsNonContiguous &&
10430 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10431 CombinedInfo.Types[I] &
10432 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10433 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10434 // descriptor_dim records), not the byte size.
10435 if (IsNonContigEntry) {
10436 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10437 "Index must be in-bounds for NON_CONTIG Dims array");
10438 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10439 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10440 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10441 continue;
10442 }
10443 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10444 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10445 ConstSizes[I] = CI;
10446 continue;
10447 }
10448 }
10449 RuntimeSizes.set(I);
10450 }
10451
10452 if (RuntimeSizes.all()) {
10453 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10454 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10455 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10456 restoreIPandDebugLoc(Builder, CodeGenIP);
10457 } else {
10458 auto *SizesArrayInit = ConstantArray::get(
10459 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10460 std::string Name = createPlatformSpecificName({"offload_sizes"});
10461 auto *SizesArrayGbl =
10462 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10463 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10464 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10465
10466 if (!RuntimeSizes.any()) {
10467 Info.RTArgs.SizesArray = SizesArrayGbl;
10468 } else {
10469 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10470 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10471 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10472 AllocaInst *Buffer = Builder.CreateAlloca(
10473 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10474 Buffer->setAlignment(OffloadSizeAlign);
10475 restoreIPandDebugLoc(Builder, CodeGenIP);
10476 Builder.CreateMemCpy(
10477 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10478 SizesArrayGbl, OffloadSizeAlign,
10479 Builder.getIntN(
10480 IndexSize,
10481 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10482
10483 Info.RTArgs.SizesArray = Buffer;
10484 }
10485 restoreIPandDebugLoc(Builder, CodeGenIP);
10486 }
10487
10488 // The map types are always constant so we don't need to generate code to
10489 // fill arrays. Instead, we create an array constant.
10490 SmallVector<uint64_t, 4> Mapping;
10491 for (auto mapFlag : CombinedInfo.Types)
10492 Mapping.push_back(
10493 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10494 mapFlag));
10495 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10496 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10497 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10498
10499 // The information types are only built if provided.
10500 if (!CombinedInfo.Names.empty()) {
10501 auto *MapNamesArrayGbl = createOffloadMapnames(
10502 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10503 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10504 Info.EmitDebug = true;
10505 } else {
10506 Info.RTArgs.MapNamesArray =
10507 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
10508 Info.EmitDebug = false;
10509 }
10510
10511 // If there's a present map type modifier, it must not be applied to the end
10512 // of a region, so generate a separate map type array in that case.
10513 if (Info.separateBeginEndCalls()) {
10514 bool EndMapTypesDiffer = false;
10515 for (uint64_t &Type : Mapping) {
10516 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10517 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10518 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10519 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10520 EndMapTypesDiffer = true;
10521 }
10522 }
10523 if (EndMapTypesDiffer) {
10524 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10525 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10526 }
10527 }
10528
10529 PointerType *PtrTy = Builder.getPtrTy();
10530 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10531 Value *BPVal = CombinedInfo.BasePointers[I];
10532 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10533 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10534 0, I);
10535 Builder.CreateAlignedStore(BPVal, BP,
10536 M.getDataLayout().getPrefTypeAlign(PtrTy));
10537
10538 if (Info.requiresDevicePointerInfo()) {
10539 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10540 CodeGenIP = Builder.saveIP();
10541 Builder.restoreIP(AllocaIP);
10542 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10543 Builder.restoreIP(CodeGenIP);
10544 if (DeviceAddrCB)
10545 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10546 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10547 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10548 if (DeviceAddrCB)
10549 DeviceAddrCB(I, BP);
10550 }
10551 }
10552
10553 Value *PVal = CombinedInfo.Pointers[I];
10554 Value *P = Builder.CreateConstInBoundsGEP2_32(
10555 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10556 I);
10557 // TODO: Check that the alignment is correct.
10558 Builder.CreateAlignedStore(PVal, P,
10559 M.getDataLayout().getPrefTypeAlign(PtrTy));
10560
10561 if (RuntimeSizes.test(I)) {
10562 Value *S = Builder.CreateConstInBoundsGEP2_32(
10563 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10564 /*Idx0=*/0,
10565 /*Idx1=*/I);
10566 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10567 Int64Ty,
10568 /*isSigned=*/true),
10569 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10570 }
10571 // Fill up the mapper array.
10572 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10573 Value *MFunc = ConstantPointerNull::get(PtrTy);
10574
10575 auto CustomMFunc = CustomMapperCB(I);
10576 if (!CustomMFunc)
10577 return CustomMFunc.takeError();
10578 if (*CustomMFunc)
10579 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10580
10581 Value *MAddr = Builder.CreateInBoundsGEP(
10582 PointerArrayType, MappersArray,
10583 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10584 Builder.CreateAlignedStore(
10585 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10586 }
10587
10588 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10589 Info.NumberOfPtrs == 0)
10590 return Error::success();
10591 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10592 return Error::success();
10593}
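// In summary, this routine materializes the .offload_baseptrs, .offload_ptrs,
// .offload_sizes and .offload_mappers stack arrays plus the constant
// offload_maptypes/offload_mapnames globals consumed by the target runtime
// calls; sizes that are compile-time constants are folded into a global.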
10594
10595void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
10596 BasicBlock *CurBB = Builder.GetInsertBlock();
10597
10598 if (!CurBB || CurBB->hasTerminator()) {
10599 // If there is no insert point or the previous block is already
10600 // terminated, don't touch it.
10601 } else {
10602 // Otherwise, create a fall-through branch.
10603 Builder.CreateBr(Target);
10604 }
10605
10606 Builder.ClearInsertionPoint();
10607}
10608
10609void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
10610 bool IsFinished) {
10611 BasicBlock *CurBB = Builder.GetInsertBlock();
10612
10613 // Fall out of the current block (if necessary).
10614 emitBranch(BB);
10615
10616 if (IsFinished && BB->use_empty()) {
10617 BB->eraseFromParent();
10618 return;
10619 }
10620
10621 // Place the block after the current block, if possible, or else at
10622 // the end of the function.
10623 if (CurBB && CurBB->getParent())
10624 CurFn->insert(std::next(CurBB->getIterator()), BB);
10625 else
10626 CurFn->insert(CurFn->end(), BB);
10627 Builder.SetInsertPoint(BB);
10628}
10629
10630Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
10631 BodyGenCallbackTy ElseGen,
10632 InsertPointTy AllocaIP,
10633 ArrayRef<BasicBlock *> DeallocBlocks) {
10634 // If the condition constant folds and can be elided, try to avoid emitting
10635 // the condition and the dead arm of the if/else.
10636 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10637 auto CondConstant = CI->getSExtValue();
10638 if (CondConstant)
10639 return ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10640
10641 return ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10642 }
10643
10644 Function *CurFn = Builder.GetInsertBlock()->getParent();
10645
10646 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10647 // emit the conditional branch.
10648 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10649 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10650 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10651 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10652 // Emit the 'then' code.
10653 emitBlock(ThenBlock, CurFn);
10654 if (Error Err = ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10655 return Err;
10656 emitBranch(ContBlock);
10657 // Emit the 'else' code if present.
10658 // There is no need to emit line number for unconditional branch.
10659 emitBlock(ElseBlock, CurFn);
10660 if (Error Err = ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10661 return Err;
10662 // There is no need to emit line number for unconditional branch.
10663 emitBranch(ContBlock);
10664 // Emit the continuation block for code after the if.
10665 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10666 return Error::success();
10667}
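// The non-constant path therefore produces the familiar diamond:
//   br i1 %cond, label %omp_if.then, label %omp_if.else
// with both arms branching to %omp_if.end, where code generation resumes.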
10668
10669bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10670 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10671 assert(!(AO == AtomicOrdering::NotAtomic ||
10672 AO == AtomicOrdering::Unordered) &&
10673 "Unexpected Atomic Ordering.");
10674
10675 bool Flush = false;
10676 AtomicOrdering FlushAO = AtomicOrdering::Monotonic;
10677
10678 switch (AK) {
10679 case Read:
10680 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
10681 AO == AtomicOrdering::SequentiallyConsistent) {
10682 FlushAO = AtomicOrdering::Acquire;
10683 Flush = true;
10684 }
10685 break;
10686 case Write:
10687 case Compare:
10688 case Update:
10689 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
10690 AO == AtomicOrdering::SequentiallyConsistent) {
10691 FlushAO = AtomicOrdering::Release;
10692 Flush = true;
10693 }
10694 break;
10695 case Capture:
10696 switch (AO) {
10697 case AtomicOrdering::Acquire:
10698 FlushAO = AtomicOrdering::Acquire;
10699 Flush = true;
10700 break;
10701 case AtomicOrdering::Release:
10702 FlushAO = AtomicOrdering::Release;
10703 Flush = true;
10704 break;
10705 case AtomicOrdering::AcquireRelease:
10706 case AtomicOrdering::SequentiallyConsistent:
10707 FlushAO = AtomicOrdering::AcquireRelease;
10708 Flush = true;
10709 break;
10710 default:
10711 // do nothing - leave silently.
10712 break;
10713 }
10714 }
10715
10716 if (Flush) {
10717 // Currently the flush runtime call does not take a memory ordering
10718 // argument, so we still resolve which ordering the flush would need but
10719 // only issue the plain flush call.
10720 // TODO: pass `FlushAO` after memory ordering support is added
10721 (void)FlushAO;
10722 emitFlush(Loc);
10723 }
10724
10725 // for AO == AtomicOrdering::Monotonic and all other case combinations
10726 // do nothing
10727 return Flush;
10728}
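// For example, an 'omp atomic read' with seq_cst ordering needs an acquire
// fence after the load, so the call above returns true and a flush has been
// emitted; a relaxed (monotonic) atomic operation needs no flush at all.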
10729
10730OpenMPIRBuilder::InsertPointTy
10731OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
10732 AtomicOpValue &X, AtomicOpValue &V,
10733 AtomicOrdering AO, InsertPointTy AllocaIP) {
10734 if (!updateToLocation(Loc))
10735 return Loc.IP;
10736
10737 assert(X.Var->getType()->isPointerTy() &&
10738 "OMP Atomic expects a pointer to target memory");
10739 Type *XElemTy = X.ElemTy;
10740 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10741 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10742 "OMP atomic read expected a scalar type");
10743
10744 Value *XRead = nullptr;
10745
10746 if (XElemTy->isIntegerTy()) {
10747 LoadInst *XLD =
10748 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10749 XLD->setAtomic(AO);
10750 XRead = cast<Value>(XLD);
10751 } else if (XElemTy->isStructTy()) {
10752 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10753 // target does not support `atomicrmw` of the size of the struct
10754 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10755 OldVal->setAtomic(AO);
10756 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10757 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10758 OpenMPIRBuilder::AtomicInfo atomicInfo(
10759 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10760 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10761 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10762 XRead = AtomicLoadRes.first;
10763 OldVal->eraseFromParent();
10764 } else {
10765 // We need to perform atomic op as integer
10766 IntegerType *IntCastTy =
10767 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10768 LoadInst *XLoad =
10769 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10770 XLoad->setAtomic(AO);
10771 if (XElemTy->isFloatingPointTy()) {
10772 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10773 } else {
10774 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10775 }
10776 }
10777 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10778 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10779 return Builder.saveIP();
10780}
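// For a simple integer 'v = x;' under 'omp atomic read' this boils down to an
// atomic load of x followed by a plain store into v, e.g. (schematically,
// assuming a 4-byte integer and relaxed ordering):
//   %0 = load atomic i32, ptr %x monotonic, align 4
//   store i32 %0, ptr %v, align 4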
10781
10782OpenMPIRBuilder::InsertPointTy
10783OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
10784 AtomicOpValue &X, Value *Expr,
10785 AtomicOrdering AO, InsertPointTy AllocaIP) {
10786 if (!updateToLocation(Loc))
10787 return Loc.IP;
10788
10789 assert(X.Var->getType()->isPointerTy() &&
10790 "OMP Atomic expects a pointer to target memory");
10791 Type *XElemTy = X.ElemTy;
10792 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10793 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10794 "OMP atomic write expected a scalar type");
10795
10796 if (XElemTy->isIntegerTy()) {
10797 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10798 XSt->setAtomic(AO);
10799 } else if (XElemTy->isStructTy()) {
10800 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10801 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10802 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10803 OpenMPIRBuilder::AtomicInfo atomicInfo(
10804 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10805 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10806 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10807 OldVal->eraseFromParent();
10808 } else {
10809 // We need to bitcast and perform atomic op as integers
10810 IntegerType *IntCastTy =
10811 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10812 Value *ExprCast =
10813 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10814 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10815 XSt->setAtomic(AO);
10816 }
10817
10818 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10819 return Builder.saveIP();
10820}
10821
10822OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
10823 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
10824 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10825 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10826 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10827 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10828 if (!updateToLocation(Loc))
10829 return Loc.IP;
10830
10831 LLVM_DEBUG({
10832 Type *XTy = X.Var->getType();
10833 assert(XTy->isPointerTy() &&
10834 "OMP Atomic expects a pointer to target memory");
10835 Type *XElemTy = X.ElemTy;
10836 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10837 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10838 "OMP atomic update expected a scalar or struct type");
10839 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10840 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10841 "OpenMP atomic does not support LT or GT operations");
10842 });
10843
10844 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10845 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10846 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10847 if (!AtomicResult)
10848 return AtomicResult.takeError();
10849 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10850 return Builder.saveIP();
10851}
10852
10853// FIXME: Duplicating AtomicExpand
10854Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10855 AtomicRMWInst::BinOp RMWOp) {
10856 switch (RMWOp) {
10857 case AtomicRMWInst::Add:
10858 return Builder.CreateAdd(Src1, Src2);
10859 case AtomicRMWInst::Sub:
10860 return Builder.CreateSub(Src1, Src2);
10861 case AtomicRMWInst::And:
10862 return Builder.CreateAnd(Src1, Src2);
10863 case AtomicRMWInst::Nand:
10864 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10865 case AtomicRMWInst::Or:
10866 return Builder.CreateOr(Src1, Src2);
10867 case AtomicRMWInst::Xor:
10868 return Builder.CreateXor(Src1, Src2);
10873 case AtomicRMWInst::Max:
10874 case AtomicRMWInst::Min:
10887 llvm_unreachable("Unsupported atomic update operation");
10888 }
10889 llvm_unreachable("Unsupported atomic update operation");
10890}
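// The atomicrmw instruction only yields the *old* value, so when the updated
// value must be captured the binary operation is redone non-atomically on
// that result, e.g. for a subtraction (schematically):
//   %old = atomicrmw sub ptr %x, i32 %expr monotonic
//   %new = sub i32 %old, %expr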
10891
10892Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10893 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10894 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10895 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10896 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10897 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2.
10898 bool emitRMWOp = false;
10899 switch (RMWOp) {
10900 case AtomicRMWInst::Add:
10901 case AtomicRMWInst::And:
10902 case AtomicRMWInst::Nand:
10903 case AtomicRMWInst::Or:
10904 case AtomicRMWInst::Xor:
10905 case AtomicRMWInst::Xchg:
10906 emitRMWOp = XElemTy;
10907 break;
10908 case AtomicRMWInst::Sub:
10909 emitRMWOp = (IsXBinopExpr && XElemTy);
10910 break;
10911 default:
10912 emitRMWOp = false;
10913 }
10914 emitRMWOp &= XElemTy->isIntegerTy();
10915
10916 std::pair<Value *, Value *> Res;
10917 if (emitRMWOp) {
10918 AtomicRMWInst *RMWInst =
10919 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10920 if (T.isAMDGPU()) {
10921 if (IsIgnoreDenormalMode)
10922 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10923 llvm::MDNode::get(Builder.getContext(), {}));
10924 if (!IsFineGrainedMemory)
10925 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10926 llvm::MDNode::get(Builder.getContext(), {}));
10927 if (!IsRemoteMemory)
10928 RMWInst->setMetadata("amdgpu.no.remote.memory",
10929 llvm::MDNode::get(Builder.getContext(), {}));
10930 }
10931 Res.first = RMWInst;
10932 // Not needed except in case of postfix captures. Generated anyway for
10933 // consistency with the else branch; any DCE pass will remove it.
10934 // AtomicRMWInst::Xchg does not have a corresponding instruction.
10935 if (RMWOp == AtomicRMWInst::Xchg)
10936 Res.second = Res.first;
10937 else
10938 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10939 } else if (XElemTy->isStructTy()) {
10940 LoadInst *OldVal =
10941 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10942 OldVal->setAtomic(AO);
10943 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10944 unsigned LoadSize = LoadDL.getTypeStoreSize(XElemTy);
10945
10946 OpenMPIRBuilder::AtomicInfo atomicInfo(
10947 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10948 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10949 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10950 BasicBlock *CurBB = Builder.GetInsertBlock();
10951 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10952 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10953 BasicBlock *ExitBB =
10954 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10955 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10956 X->getName() + ".atomic.cont");
10957 ContBB->getTerminator()->eraseFromParent();
10958 Builder.restoreIP(AllocaIP);
10959 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10960 NewAtomicAddr->setName(X->getName() + "x.new.val");
10961 Builder.SetInsertPoint(ContBB);
10962 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10963 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10964 Value *OldExprVal = PHI;
10965 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10966 if (!CBResult)
10967 return CBResult.takeError();
10968 Value *Upd = *CBResult;
10969 Builder.CreateStore(Upd, NewAtomicAddr);
10970 AtomicOrdering Failure =
10971 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10972 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10973 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10974 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10975 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10976 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10977 OldVal->eraseFromParent();
10978 Res.first = OldExprVal;
10979 Res.second = Upd;
10980
10981 if (UnreachableInst *ExitTI =
10982 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10983 CurBBTI->eraseFromParent();
10984 Builder.SetInsertPoint(ExitBB);
10985 } else {
10986 Builder.SetInsertPoint(ExitTI);
10987 }
10988 } else {
10989 IntegerType *IntCastTy =
10990 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10991 LoadInst *OldVal =
10992 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10993 OldVal->setAtomic(AO);
10994 // CurBB
10995 // | /---\
10996 // ContBB |
10997 // | \---/
10998 // ExitBB
10999 BasicBlock *CurBB = Builder.GetInsertBlock();
11000 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11001 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11002 BasicBlock *ExitBB =
11003 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
11004 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
11005 X->getName() + ".atomic.cont");
11006 ContBB->getTerminator()->eraseFromParent();
11007 Builder.restoreIP(AllocaIP);
11008 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
11009 NewAtomicAddr->setName(X->getName() + "x.new.val");
11010 Builder.SetInsertPoint(ContBB);
11011 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
11012 PHI->addIncoming(OldVal, CurBB);
11013 bool IsIntTy = XElemTy->isIntegerTy();
11014 Value *OldExprVal = PHI;
11015 if (!IsIntTy) {
11016 if (XElemTy->isFloatingPointTy()) {
11017 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
11018 X->getName() + ".atomic.fltCast");
11019 } else {
11020 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
11021 X->getName() + ".atomic.ptrCast");
11022 }
11023 }
11024
11025 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
11026 if (!CBResult)
11027 return CBResult.takeError();
11028 Value *Upd = *CBResult;
11029 Builder.CreateStore(Upd, NewAtomicAddr);
11030 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
11031 AtomicOrdering Failure =
11032 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
11033 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
11034 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
11035 Result->setVolatile(VolatileX);
11036 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11037 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11038 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
11039 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
11040
11041 Res.first = OldExprVal;
11042 Res.second = Upd;
11043
11044 // Set the insertion point in the exit block.
11045 if (UnreachableInst *ExitTI =
11046 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
11047 CurBBTI->eraseFromParent();
11048 Builder.SetInsertPoint(ExitBB);
11049 } else {
11050 Builder.SetInsertPoint(ExitTI);
11051 }
11052 }
11053
11054 return Res;
11055}
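// When the plain atomicrmw path is not applicable (e.g. floating-point or
// struct updates), the code above emits the classic compare-exchange loop:
// load the old value, apply UpdateOp, then retry with cmpxchg (or the
// libcall equivalent) until the exchange succeeds.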
11056
11057OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
11058 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
11059 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
11060 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
11061 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
11062 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
11063 if (!updateToLocation(Loc))
11064 return Loc.IP;
11065
11066 LLVM_DEBUG({
11067 Type *XTy = X.Var->getType();
11068 assert(XTy->isPointerTy() &&
11069 "OMP Atomic expects a pointer to target memory");
11070 Type *XElemTy = X.ElemTy;
11071 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
11072 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
11073 "OMP atomic capture expected a scalar or struct type");
11074 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
11075 "OpenMP atomic does not support LT or GT operations");
11076 });
11077
11078 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
11079 // 'x' is simply atomically rewritten with 'expr'.
11080 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
11081 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
11082 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
11083 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
11084 if (!AtomicResult)
11085 return AtomicResult.takeError();
11086 Value *CapturedVal =
11087 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
11088 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
11089
11090 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
11091 return Builder.saveIP();
11092}
11093
11094OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
11095 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
11096 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
11097 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11098 bool IsFailOnly) {
11099
11100 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
11101 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
11102 IsPostfixUpdate, IsFailOnly, Failure);
11103}
11104
11105OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
11106 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
11107 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
11108 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11109 bool IsFailOnly, AtomicOrdering Failure) {
11110
11111 if (!updateToLocation(Loc))
11112 return Loc.IP;
11113
11114 assert(X.Var->getType()->isPointerTy() &&
11115 "OMP atomic expects a pointer to target memory");
11116 // compare capture
11117 if (V.Var) {
11118 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
11119 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
11120 }
11121
11122 bool IsInteger = E->getType()->isIntegerTy();
11123
11124 if (Op == OMPAtomicCompareOp::EQ) {
11125 AtomicCmpXchgInst *Result = nullptr;
11126 if (!IsInteger) {
11127 IntegerType *IntCastTy =
11128 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
11129 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
11130 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
11131 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
11132 AO, Failure);
11133 } else {
11134 Result =
11135 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
11136 }
11137
11138 if (V.Var) {
11139 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11140 if (!IsInteger)
11141 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
11142 assert(OldValue->getType() == V.ElemTy &&
11143 "OldValue and V must be of same type");
11144 if (IsPostfixUpdate) {
11145 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
11146 } else {
11147 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11148 if (IsFailOnly) {
11149 // CurBB----
11150 // | |
11151 // v |
11152 // ContBB |
11153 // | |
11154 // v |
11155 // ExitBB <-
11156 //
11157 // where ContBB only contains the store of old value to 'v'.
11158 BasicBlock *CurBB = Builder.GetInsertBlock();
11159 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11160 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11161 BasicBlock *ExitBB = CurBB->splitBasicBlock(
11162 CurBBTI, X.Var->getName() + ".atomic.exit");
11163 BasicBlock *ContBB = CurBB->splitBasicBlock(
11164 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
11165 ContBB->getTerminator()->eraseFromParent();
11166 CurBB->getTerminator()->eraseFromParent();
11167
11168 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
11169
11170 Builder.SetInsertPoint(ContBB);
11171 Builder.CreateStore(OldValue, V.Var);
11172 Builder.CreateBr(ExitBB);
11173
11174 if (UnreachableInst *ExitTI =
11175 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
11176 CurBBTI->eraseFromParent();
11177 Builder.SetInsertPoint(ExitBB);
11178 } else {
11179 Builder.SetInsertPoint(ExitTI);
11180 }
11181 } else {
11182 Value *CapturedValue =
11183 Builder.CreateSelect(SuccessOrFail, E, OldValue);
11184 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11185 }
11186 }
11187 }
11188 // The comparison result has to be stored.
11189 if (R.Var) {
11190 assert(R.Var->getType()->isPointerTy() &&
11191 "r.var must be of pointer type");
11192 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
11193
11194 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11195 Value *ResultCast = R.IsSigned
11196 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
11197 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
11198 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
11199 }
11200 } else {
11201 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
11202 "Op should be either max or min at this point");
11203 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
11204
11205 // Reverse the comparison operator, as the OpenMP forms differ from the
11206 // LLVM forms. Let's take max as an example.
11207 // OpenMP form:
11208 // x = x > expr ? expr : x;
11209 // LLVM form:
11210 // *ptr = *ptr > val ? *ptr : val;
11211 // We need to transform to LLVM form.
11212 // x = x <= expr ? x : expr;
11213 AtomicRMWInst::BinOp NewOp;
11214 if (IsXBinopExpr) {
11215 if (IsInteger) {
11216 if (X.IsSigned)
11217 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
11218 : AtomicRMWInst::Max;
11219 else
11220 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
11221 : AtomicRMWInst::UMax;
11222 } else {
11223 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
11224 : AtomicRMWInst::FMax;
11225 }
11226 } else {
11227 if (IsInteger) {
11228 if (X.IsSigned)
11229 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
11230 : AtomicRMWInst::Min;
11231 else
11232 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
11233 : AtomicRMWInst::UMin;
11234 } else {
11235 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
11236 : AtomicRMWInst::FMin;
11237 }
11238 }
11239
11240 AtomicRMWInst *OldValue =
11241 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
11242 if (V.Var) {
11243 Value *CapturedValue = nullptr;
11244 if (IsPostfixUpdate) {
11245 CapturedValue = OldValue;
11246 } else {
11247 CmpInst::Predicate Pred;
11248 switch (NewOp) {
11249 case AtomicRMWInst::Max:
11250 Pred = CmpInst::ICMP_SGT;
11251 break;
11252 case AtomicRMWInst::UMax:
11253 Pred = CmpInst::ICMP_UGT;
11254 break;
11255 case AtomicRMWInst::FMax:
11256 Pred = CmpInst::FCMP_OGT;
11257 break;
11258 case AtomicRMWInst::Min:
11259 Pred = CmpInst::ICMP_SLT;
11260 break;
11261 case AtomicRMWInst::UMin:
11262 Pred = CmpInst::ICMP_ULT;
11263 break;
11264 case AtomicRMWInst::FMin:
11265 Pred = CmpInst::FCMP_OLT;
11266 break;
11267 default:
11268 llvm_unreachable("unexpected comparison op");
11269 }
11270 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
11271 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
11272 }
11273 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11274 }
11275 }
11276
11277 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
11278
11279 return Builder.saveIP();
11280}
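// Thus an 'omp atomic compare' with an equality test maps onto cmpxchg, while
// the min/max forms map onto a single atomicrmw (Min/Max/UMin/UMax/FMin/FMax)
// whose old value is post-processed with a compare-and-select only when a
// capture variable is present and the update is not a postfix capture.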
11281
11282OpenMPIRBuilder::InsertPointOrErrorTy
11283OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
11284 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
11285 Value *NumTeamsUpper, Value *ThreadLimit,
11286 Value *IfExpr) {
11287 if (!updateToLocation(Loc))
11288 return InsertPointTy();
11289
11290 uint32_t SrcLocStrSize;
11291 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
11292 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
11293 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
11294
11295 // Outer allocation basicblock is the entry block of the current function.
11296 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
11297 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
11298 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
11299 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11300 }
11301
11302 // The current basic block is split into four basic blocks. After outlining,
11303 // they will be mapped as follows:
11304 // ```
11305 // def current_fn() {
11306 // current_basic_block:
11307 // br label %teams.exit
11308 // teams.exit:
11309 // ; instructions after teams
11310 // }
11311 //
11312 // def outlined_fn() {
11313 // teams.alloca:
11314 // br label %teams.body
11315 // teams.body:
11316 // ; instructions within teams body
11317 // }
11318 // ```
11319 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
11320 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
11321 BasicBlock *AllocaBB =
11322 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
11323
11324 bool SubClausesPresent =
11325 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
11326 // Push num_teams
11327 if (!Config.isTargetDevice() && SubClausesPresent) {
11328 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
11329 "if lowerbound is non-null, then upperbound must also be non-null "
11330 "for bounds on num_teams");
11331
11332 if (NumTeamsUpper == nullptr)
11333 NumTeamsUpper = Builder.getInt32(0);
11334
11335 if (NumTeamsLower == nullptr)
11336 NumTeamsLower = NumTeamsUpper;
11337
11338 if (IfExpr) {
11339 assert(IfExpr->getType()->isIntegerTy() &&
11340 "argument to if clause must be an integer value");
11341
11342 // upper = ifexpr ? upper : 1
11343 if (IfExpr->getType() != Int1)
11344 IfExpr = Builder.CreateICmpNE(IfExpr,
11345 ConstantInt::get(IfExpr->getType(), 0));
11346 NumTeamsUpper = Builder.CreateSelect(
11347 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
11348
11349 // lower = ifexpr ? lower : 1
11350 NumTeamsLower = Builder.CreateSelect(
11351 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
11352 }
11353
11354 if (ThreadLimit == nullptr)
11355 ThreadLimit = Builder.getInt32(0);
11356
11357 // The __kmpc_push_num_teams_51 function expects i32 arguments, so truncate
11358 // or sign-extend the passed values to match the i32 parameters.
11359 Value *NumTeamsLowerInt32 =
11360 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
11361 Value *NumTeamsUpperInt32 =
11362 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
11363 Value *ThreadLimitInt32 =
11364 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
11365
11366 Value *ThreadNum = getOrCreateThreadID(Ident);
11367
11369 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
11370 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
11371 ThreadLimitInt32});
11372 }
11373 // Generate the body of teams.
11374 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11375 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11376 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11377 return Err;
11378
11379 auto OI = std::make_unique<OutlineInfo>();
11380 OI->EntryBB = AllocaBB;
11381 OI->ExitBB = ExitBB;
11382 OI->OuterAllocBB = &OuterAllocaBB;
11383
11384 // Insert fake values for global tid and bound tid.
11385 SmallVector<Instruction *, 8> ToBeDeleted;
11386 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11387 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11388 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11389 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11390 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11391
11392 auto HostPostOutlineCB = [this, Ident,
11393 ToBeDeleted](Function &OutlinedFn) mutable {
11394 // The stale call instruction will be replaced with a new call instruction
11395 // for runtime call with the outlined function.
11396
11397 assert(OutlinedFn.hasOneUse() &&
11398 "there must be a single user for the outlined function");
11399 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11400 ToBeDeleted.push_back(StaleCI);
11401
11402 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11403 "Outlined function must have two or three arguments only");
11404
11405 bool HasShared = OutlinedFn.arg_size() == 3;
11406
11407 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11408 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11409 if (HasShared)
11410 OutlinedFn.getArg(2)->setName("data");
11411
11412 // Call to the runtime function for teams in the current function.
11413 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11414 "outlined function.");
11415 Builder.SetInsertPoint(StaleCI);
11416 SmallVector<Value *> Args = {
11417 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11418 if (HasShared)
11419 Args.push_back(StaleCI->getArgOperand(2));
11420 createRuntimeFunctionCall(
11421 getOrCreateRuntimeFunctionPtr(
11422 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11423 Args);
11424
11425 for (Instruction *I : llvm::reverse(ToBeDeleted))
11426 I->eraseFromParent();
11427 };
11428
11429 if (!Config.isTargetDevice())
11430 OI->PostOutlineCB = HostPostOutlineCB;
11431
11432 addOutlineInfo(std::move(OI));
11433
11434 Builder.SetInsertPoint(ExitBB);
11435
11436 return Builder.saveIP();
11437}
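// On the host a 'teams' construct therefore lowers to an optional
// __kmpc_push_num_teams_51 call (when any of the clauses above is present)
// followed by a __kmpc_fork_teams call that invokes the outlined body; on a
// target device only the outlining information is recorded.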
11438
11439OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createDistribute(
11440 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
11441 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB) {
11442 if (!updateToLocation(Loc))
11443 return InsertPointTy();
11444
11445 BasicBlock *OuterAllocaBB = OuterAllocIP.getBlock();
11446
11447 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11448 BasicBlock *BodyBB =
11449 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11450 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11451 }
11452 BasicBlock *ExitBB =
11453 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11454 BasicBlock *BodyBB =
11455 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11456 BasicBlock *AllocaBB =
11457 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11458
11459 // Generate the body of distribute clause
11460 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11461 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11462 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11463 return Err;
11464
11465 // When using target we use different runtime functions which require a
11466 // callback.
11467 if (Config.isTargetDevice()) {
11468 auto OI = std::make_unique<OutlineInfo>();
11469 OI->OuterAllocBB = OuterAllocIP.getBlock();
11470 OI->EntryBB = AllocaBB;
11471 OI->ExitBB = ExitBB;
11472 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
11473 copy(OuterDeallocBlocks, std::back_inserter(OI->OuterDeallocBBs));
11474
11475 addOutlineInfo(std::move(OI));
11476 }
11477 Builder.SetInsertPoint(ExitBB);
11478
11479 return Builder.saveIP();
11480}
11481
11482GlobalVariable *OpenMPIRBuilder::createOffloadMapnames(
11483 SmallVectorImpl<llvm::Constant *> &Names, std::string VarName) {
11485 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11485 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
11487 Names.size()),
11488 Names);
11489 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11490 M, MapNamesArrayInit->getType(),
11491 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11492 VarName);
11493 return MapNamesArrayGlobal;
11494}
11495
11496// Create all simple and struct types exposed by the runtime and remember
11497// the llvm::PointerTypes of them for easy access later.
11498void OpenMPIRBuilder::initializeTypes(Module &M) {
11499 LLVMContext &Ctx = M.getContext();
11500 StructType *T;
11501 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
11502 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
11503#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
11504#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11505 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11506 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
11507#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11508 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11509 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
11510#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11511 T = StructType::getTypeByName(Ctx, StructName); \
11512 if (!T) \
11513 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11514 VarName = T; \
11515 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
11516#include "llvm/Frontend/OpenMP/OMPKinds.def"
11517}
11518
11519void OpenMPIRBuilder::OutlineInfo::collectBlocks(
11520 SmallPtrSetImpl<BasicBlock *> &BlockSet,
11521 SmallVectorImpl<BasicBlock *> &BlockVector) {
11522 SmallVector<BasicBlock *, 32> Worklist;
11523 BlockSet.insert(EntryBB);
11524 BlockSet.insert(ExitBB);
11525
11526 Worklist.push_back(EntryBB);
11527 while (!Worklist.empty()) {
11528 BasicBlock *BB = Worklist.pop_back_val();
11529 BlockVector.push_back(BB);
11530 for (BasicBlock *SuccBB : successors(BB))
11531 if (BlockSet.insert(SuccBB).second)
11532 Worklist.push_back(SuccBB);
11533 }
11534}
11535
11536std::unique_ptr<CodeExtractor>
11538 bool ArgsInZeroAddressSpace,
11539 Twine Suffix) {
11540 return std::make_unique<CodeExtractor>(
11541 Blocks, /* DominatorTree */ nullptr,
11542 /* AggregateArgs */ true,
11543 /* BlockFrequencyInfo */ nullptr,
11544 /* BranchProbabilityInfo */ nullptr,
11545 /* AssumptionCache */ nullptr,
11546 /* AllowVarArgs */ true,
11547 /* AllowAlloca */ true,
11548 /* AllocationBlock*/ OuterAllocBB,
11549 /* DeallocationBlocks */ ArrayRef<BasicBlock *>(),
11550 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11551}
11552
11553std::unique_ptr<CodeExtractor> DeviceSharedMemOutlineInfo::createCodeExtractor(
11554 ArrayRef<BasicBlock *> Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) {
11555 return std::make_unique<DeviceSharedMemCodeExtractor>(
11556 OMPBuilder, Blocks, /* DominatorTree */ nullptr,
11557 /* AggregateArgs */ true,
11558 /* BlockFrequencyInfo */ nullptr,
11559 /* BranchProbabilityInfo */ nullptr,
11560 /* AssumptionCache */ nullptr,
11561 /* AllowVarArgs */ true,
11562 /* AllowAlloca */ true,
11563 /* AllocationBlock*/ OuterAllocBB,
11564 /* DeallocationBlocks */ OuterDeallocBBs.empty()
11565 ? ArrayRef<BasicBlock *>()
11566 : OuterDeallocBBs,
11567 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11568}
11569
11570void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
11571 uint64_t Size, int32_t Flags,
11572 GlobalValue::LinkageTypes,
11573 StringRef Name) {
11574 if (!Config.isGPU()) {
11575 llvm::offloading::emitOffloadingEntry(
11576 M, object::OffloadKind::OFK_OpenMP, ID,
11577 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11578 return;
11579 }
11580 // TODO: Add support for global variables on the device after declare target
11581 // support.
11582 Function *Fn = dyn_cast<Function>(Addr);
11583 if (!Fn)
11584 return;
11585
11586 // Add a function attribute for the kernel.
11587 Fn->addFnAttr("kernel");
11588 if (T.isAMDGCN())
11589 Fn->addFnAttr("uniform-work-group-size");
11590 Fn->addFnAttr(Attribute::MustProgress);
11591}
11592
11593// We only generate metadata for functions that contain target regions.
11594void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
11595 EmitMetadataErrorReportFunctionTy &ErrorFn) {
11596
11597 // If there are no entries, we don't need to do anything.
11598 if (OffloadInfoManager.empty())
11599 return;
11600
11601 LLVMContext &C = M.getContext();
11602 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
11603 TargetRegionEntryInfo>,
11604 16>
11605 OrderedEntries(OffloadInfoManager.size());
11606
11607 // Auxiliary methods to create metadata values and strings.
11608 auto &&GetMDInt = [this](unsigned V) {
11609 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11610 };
11611
11612 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11613
11614 // Create the offloading info metadata node.
11615 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11616 auto &&TargetRegionMetadataEmitter =
11617 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11618 const TargetRegionEntryInfo &EntryInfo,
11619 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
11620 // Generate metadata for target regions. Each entry of this metadata
11621 // contains:
11622 // - Entry 0 -> Kind of this type of metadata (0).
11623 // - Entry 1 -> Device ID of the file where the entry was identified.
11624 // - Entry 2 -> File ID of the file where the entry was identified.
11625 // - Entry 3 -> Mangled name of the function where the entry was
11626 // identified.
11627 // - Entry 4 -> Line in the file where the entry was identified.
11628 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11629 // - Entry 6 -> Order the entry was created.
11630 // The first element of the metadata node is the kind.
11631 Metadata *Ops[] = {
11632 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11633 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11634 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11635 GetMDInt(E.getOrder())};
11636
11637 // Save this entry in the right position of the ordered entries array.
11638 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11639
11640 // Add metadata to the named metadata node.
11641 MD->addOperand(MDNode::get(C, Ops));
11642 };
11643
11644 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11645
11646 // Create a function that emits metadata for each device global variable entry.
11647 auto &&DeviceGlobalVarMetadataEmitter =
11648 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11649 StringRef MangledName,
11650 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
11651 // Generate metadata for global variables. Each entry of this metadata
11652 // contains:
11653 // - Entry 0 -> Kind of this type of metadata (1).
11654 // - Entry 1 -> Mangled name of the variable.
11655 // - Entry 2 -> Declare target kind.
11656 // - Entry 3 -> Order the entry was created.
11657 // The first element of the metadata node is the kind.
11658 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11659 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11660
11661 // Save this entry in the right position of the ordered entries array.
11662 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11663 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11664
11665 // Add metadata to the named metadata node.
11666 MD->addOperand(MDNode::get(C, Ops));
11667 };
11668
11669 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11670 DeviceGlobalVarMetadataEmitter);
11671
11672 for (const auto &E : OrderedEntries) {
11673 assert(E.first && "All ordered entries must exist!");
11674 if (const auto *CE =
11676 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(E.first)) {
11677 if (!CE->getID() || !CE->getAddress()) {
11678 // Do not blame the entry if the parent function is not emitted.
11679 TargetRegionEntryInfo EntryInfo = E.second;
11680 StringRef FnName = EntryInfo.ParentName;
11681 if (!M.getNamedValue(FnName))
11682 continue;
11683 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11684 continue;
11685 }
11686 createOffloadEntry(CE->getID(), CE->getAddress(),
11687 /*Size=*/0, CE->getFlags(),
11688 GlobalValue::WeakAnyLinkage);
11689 } else if (const auto *CE = dyn_cast<
11690 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
11691 E.first)) {
11692 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
11693 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
11694 CE->getFlags());
11695 switch (Flags) {
11696 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
11697 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
11698 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11699 continue;
11700 if (!CE->getAddress()) {
11701 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11702 continue;
11703 }
11704 // The variable has no definition - no need to add the entry.
11705 if (CE->getVarSize() == 0)
11706 continue;
11707 break;
11708 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
11709 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11710 (!Config.isTargetDevice() && CE->getAddress())) &&
11711 "Declaret target link address is set.");
11712 if (Config.isTargetDevice())
11713 continue;
11714 if (!CE->getAddress()) {
11715 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11716 continue;
11717 }
11718 break;
11719 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect:
11720 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable:
11721 if (!CE->getAddress()) {
11722 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11723 continue;
11724 }
11725 break;
11726 default:
11727 break;
11728 }
11729
11730 // Hidden or internal symbols on the device are not externally visible.
11731 // We should not attempt to register them by creating an offloading
11732 // entry. Indirect variables are handled separately on the device.
11733 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11734 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11735 (Flags !=
11736 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect &&
11737 Flags != OffloadEntriesInfoManager::
11738 OMPTargetGlobalVarEntryIndirectVTable))
11739 continue;
11740
11741 // Indirect globals need to use a special name that doesn't match the name
11742 // of the associated host global.
11743 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect ||
11744 Flags ==
11745 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable)
11746 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11747 Flags, CE->getLinkage(), CE->getVarName());
11748 else
11749 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11750 Flags, CE->getLinkage());
11751
11752 } else {
11753 llvm_unreachable("Unsupported entry kind.");
11754 }
11755 }
11756
11757 // Emit requires directive globals to a special entry so the runtime can
11758 // register them when the device image is loaded.
11759 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11760 // entries should be redesigned to better suit this use-case.
11761 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11765 ".requires", /*Size=*/0,
11767 Config.getRequiresFlags());
11768}
11769
11770void TargetRegionEntryInfo::getTargetRegionEntryFnName(
11771 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
11772 unsigned FileID, unsigned Line, unsigned Count) {
11773 raw_svector_ostream OS(Name);
11774 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11775 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11776 if (Count)
11777 OS << "_" << Count;
11778}
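// For instance, a target region in function 'foo' at line 42 produces a name
// of the form <KernelNamePrefix><device-id-hex>_<file-id-hex>_foo_l42, with a
// trailing _<count> only when several regions share the same source location.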
11779
11780void OpenMPIRBuilder::getTargetRegionEntryFnName(
11781 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11782 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11783 TargetRegionEntryInfo::getTargetRegionEntryFnName(
11784 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11785 EntryInfo.Line, NewCount);
11786}
11787
11788TargetRegionEntryInfo
11789OpenMPIRBuilder::getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack,
11790 vfs::FileSystem &VFS,
11791 StringRef ParentName) {
11792 sys::fs::UniqueID ID(0xdeadf17e, 0);
11793 auto FileIDInfo = CallBack();
11794 uint64_t FileID = 0;
11795 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11796 ID = Status->getUniqueID();
11797 FileID = Status->getUniqueID().getFile();
11798 } else {
11799 // If the inode ID could not be determined, create a hash value of
11800 // the current file name and use that as an ID.
11801 FileID = hash_value(std::get<0>(FileIDInfo));
11802 }
11803
11804 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11805 std::get<1>(FileIDInfo));
11806}
11807
11808unsigned OpenMPIRBuilder::getFlagMemberOffset() {
11809 unsigned Offset = 0;
11810 for (uint64_t Remain =
11811 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11812 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
11813 !(Remain & 1); Remain = Remain >> 1)
11814 Offset++;
11815 return Offset;
11816}
11817
11818omp::OpenMPOffloadMappingFlags
11819OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
11820 // Rotate by getFlagMemberOffset() bits.
11821 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11822 << getFlagMemberOffset());
11823}
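// Example: Position 0 maps to the flag value 1 << getFlagMemberOffset(),
// Position 1 to 2 << getFlagMemberOffset(), and so on, i.e. the stored
// MEMBER_OF field is always the 1-based position of the parent member.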
11824
11825void OpenMPIRBuilder::setCorrectMemberOfFlag(
11826 omp::OpenMPOffloadMappingFlags &Flags,
11827 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11828 // If the entry is PTR_AND_OBJ but has not been marked with the special
11829 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11830 // marked as MEMBER_OF.
11831 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11833 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11836 return;
11837
11838 // Entries with ATTACH are not members-of anything. They are handled
11839 // separately by the runtime after other maps have been handled.
11840 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11842 return;
11843
11844 // Reset the placeholder value to prepare the flag for the assignment of the
11845 // proper MEMBER_OF value.
11846 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11847 Flags |= MemberOfFlag;
11848}
11849
11850Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
11851 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11852 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11853 bool IsDeclaration, bool IsExternallyVisible,
11854 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11855 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11856 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11857 std::function<Constant *()> GlobalInitializer,
11858 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11859 // TODO: convert this to utilise the IRBuilder Config rather than
11860 // a passed down argument.
11861 if (OpenMPSIMD)
11862 return nullptr;
11863
11866 CaptureClause ==
11868 Config.hasRequiresUnifiedSharedMemory())) {
11869 SmallString<64> PtrName;
11870 {
11871 raw_svector_ostream OS(PtrName);
11872 OS << MangledName;
11873 if (!IsExternallyVisible)
11874 OS << format("_%x", EntryInfo.FileID);
11875 OS << "_decl_tgt_ref_ptr";
11876 }
11877
11878 Value *Ptr = M.getNamedValue(PtrName);
11879
11880 if (!Ptr) {
11881 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11882 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11883
11884 auto *GV = cast<GlobalVariable>(Ptr);
11885 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11886
11887 if (!Config.isTargetDevice()) {
11888 if (GlobalInitializer)
11889 GV->setInitializer(GlobalInitializer());
11890 else
11891 GV->setInitializer(GlobalValue);
11892 }
11893
11895 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11896 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11897 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11898 }
11899
11900 return cast<Constant>(Ptr);
11901 }
11902
11903 return nullptr;
11904}
11905
11909 bool IsDeclaration, bool IsExternallyVisible,
11910 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11911 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11912 std::vector<Triple> TargetTriple,
11913 std::function<Constant *()> GlobalInitializer,
11914 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11915 Constant *Addr) {
11917 (TargetTriple.empty() && !Config.isTargetDevice()))
11918 return;
11919
11921 StringRef VarName;
11922 int64_t VarSize;
11924
11926 CaptureClause ==
11928 !Config.hasRequiresUnifiedSharedMemory()) {
11930 VarName = MangledName;
11931 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11932
11933 if (!IsDeclaration)
11934 VarSize = divideCeil(
11935 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11936 else
11937 VarSize = 0;
11938 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11939
11940 // This is a workaround carried over from Clang which prevents undesired
11941 // optimisation of internal variables.
11942 if (Config.isTargetDevice() &&
11943 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11944 // Do not create a "ref-variable" if the original is not also available
11945 // on the host.
11946 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11947 return;
11948
11949 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11950
11951 if (!M.getNamedValue(RefName)) {
11952 Constant *AddrRef =
11953 getOrCreateInternalVariable(Addr->getType(), RefName);
11954 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11955 GvAddrRef->setConstant(true);
11956 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11957 GvAddrRef->setInitializer(Addr);
11958 GeneratedRefs.push_back(GvAddrRef);
11959 }
11960 }
11961 } else {
11964 else
11966
11967 if (Config.isTargetDevice()) {
11968 VarName = (Addr) ? Addr->getName() : "";
11969 Addr = nullptr;
11970 } else {
11972 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11973 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11974 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11975 VarName = (Addr) ? Addr->getName() : "";
11976 }
11977 VarSize = M.getDataLayout().getPointerSize();
11979 }
11980
11981 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11982 Flags, Linkage);
11983}
11984
11985/// Loads all the offload entry information from the host IR
11986/// metadata.
11988 // If we are in target mode, load the metadata from the host IR. This code has
11989 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11990
11991 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11992 if (!MD)
11993 return;
11994
11995 for (MDNode *MN : MD->operands()) {
11996 auto &&GetMDInt = [MN](unsigned Idx) {
11997 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11998 return cast<ConstantInt>(V->getValue())->getZExtValue();
11999 };
12000
12001 auto &&GetMDString = [MN](unsigned Idx) {
12002 auto *V = cast<MDString>(MN->getOperand(Idx));
12003 return V->getString();
12004 };
12005
12006 switch (GetMDInt(0)) {
12007 default:
12008 llvm_unreachable("Unexpected metadata!");
12009 break;
12010 case OffloadEntriesInfoManager::OffloadEntryInfo::
12011 OffloadingEntryInfoTargetRegion: {
12012 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
12013 /*DeviceID=*/GetMDInt(1),
12014 /*FileID=*/GetMDInt(2),
12015 /*Line=*/GetMDInt(4),
12016 /*Count=*/GetMDInt(5));
12017 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
12018 /*Order=*/GetMDInt(6));
12019 break;
12020 }
12021 case OffloadEntriesInfoManager::OffloadEntryInfo::
12022 OffloadingEntryInfoDeviceGlobalVar:
12023 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
12024 /*MangledName=*/GetMDString(1),
12026 /*Flags=*/GetMDInt(2)),
12027 /*Order=*/GetMDInt(3));
12028 break;
12029 }
12030 }
12031}
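// The host metadata decoded above has roughly the following shape (a sketch;
// the exact nodes are produced by createOffloadEntriesAndInfoMetadata, with
// kind 0 denoting a target region and kind 1 a device global variable):
//   !omp_offload.info = !{!0, !1}
//   ; kind, device-id, file-id, parent name, line, count, order
//   !0 = !{i32 0, i32 66050, i32 11738283, !"foo", i32 42, i32 0, i32 0}
//   ; kind, mangled name, flags, order
//   !1 = !{i32 1, !"some_global", i32 0, i32 1}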
12032
12034 StringRef HostFilePath) {
12035 if (HostFilePath.empty())
12036 return;
12037
12038 auto Buf = VFS.getBufferForFile(HostFilePath);
12039 if (std::error_code Err = Buf.getError()) {
12040 report_fatal_error(("error opening host file from host file path inside of "
12041 "OpenMPIRBuilder: " +
12042 Err.message())
12043 .c_str());
12044 }
12045
12046 LLVMContext Ctx;
12048 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
12049 if (std::error_code Err = M.getError()) {
12051 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
12052 .c_str());
12053 }
12054
12055 loadOffloadInfoMetadata(*M.get());
12056}
12057
12060 llvm::StringRef Name) {
12061 Builder.restoreIP(Loc.IP);
12062
12063 BasicBlock *CurBB = Builder.GetInsertBlock();
12064 assert(CurBB &&
12065 "expected a valid insertion block for creating an iterator loop");
12066 Function *F = CurBB->getParent();
12067
12068 InsertPointTy SplitIP = Builder.saveIP();
12069 if (SplitIP.getPoint() == CurBB->end())
12070 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
12071 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
12072
12073 BasicBlock *ContBB =
12074 splitBB(SplitIP, /*CreateBranch=*/false,
12075 Builder.getCurrentDebugLocation(), "omp.it.cont");
12076
12077 CanonicalLoopInfo *CLI =
12078 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
12079 /*PreInsertBefore=*/ContBB,
12080 /*PostInsertBefore=*/ContBB, Name);
12081
12082 // Enter loop from original block.
12083 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
12084
12085 // Remove the unconditional branch inserted by createLoopSkeleton in the body
12086 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
12087 T->eraseFromParent();
12088
12089 InsertPointTy BodyIP = CLI->getBodyIP();
12090 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
12091 return Err;
12092
12093  // The body must either fall through to the latch or branch directly to it.
12094 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
12095 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
12096 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
12098 "iterator bodygen must terminate the canonical body with an "
12099 "unconditional branch to the loop latch",
12101 }
12102 } else {
12103 // Ensure we end the loop body by jumping to the latch.
12104 Builder.SetInsertPoint(CLI->getBody());
12105 Builder.CreateBr(CLI->getLatch());
12106 }
12107
12108 // Link After -> ContBB
12109 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
12110 if (!CLI->getAfter()->hasTerminator())
12111 Builder.CreateBr(ContBB);
12112
12113 return InsertPointTy{ContBB, ContBB->begin()};
12114}
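// A minimal BodyGen callback for the helper above could look as follows (a
// sketch; emitElement stands for arbitrary user code emitted per iteration):
//   auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
//                      Value *IV) -> llvm::Error {
//     Builder.restoreIP(CodeGenIP);
//     emitElement(Builder, IV);
//     return llvm::Error::success();
//   };
// The callback may leave the body unterminated (a branch to the latch is then
// added above) or terminate it with an unconditional branch to the latch.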
12115
12116/// Mangle the parameter part of the vector function name according to
12117/// the parameters' OpenMP classification. The mangling scheme is defined
12118/// in section 4.5 of the AAVFABI (2021Q1).
12119static std::string mangleVectorParameters(
12121 SmallString<256> Buffer;
12122 llvm::raw_svector_ostream Out(Buffer);
12123 for (const auto &ParamAttr : ParamAttrs) {
12124 switch (ParamAttr.Kind) {
12126 Out << 'l';
12127 break;
12129 Out << 'R';
12130 break;
12132 Out << 'U';
12133 break;
12135 Out << 'L';
12136 break;
12138 Out << 'u';
12139 break;
12141 Out << 'v';
12142 break;
12143 }
12144 if (ParamAttr.HasVarStride)
12145 Out << "s" << ParamAttr.StrideOrArg;
12146 else if (ParamAttr.Kind ==
12148 ParamAttr.Kind ==
12150 ParamAttr.Kind ==
12152 ParamAttr.Kind ==
12154 // Don't print the step value if it is not present or if it is
12155 // equal to 1.
12156 if (ParamAttr.StrideOrArg < 0)
12157 Out << 'n' << -ParamAttr.StrideOrArg;
12158 else if (ParamAttr.StrideOrArg != 1)
12159 Out << ParamAttr.StrideOrArg;
12160 }
12161
12162 if (!!ParamAttr.Alignment)
12163 Out << 'a' << ParamAttr.Alignment;
12164 }
12165
12166 return std::string(Out.str());
12167}
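// Example: for a declare simd clause with a uniform parameter, a linear
// parameter with step 2 and a vector parameter aligned to 32 bytes, the rules
// above produce the parameter sequence
//   "ul2va32"
// (the step of a linear parameter is omitted when it is 1, and a negative
// step K is printed as "nK").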
12168
12170 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
12172 struct ISADataTy {
12173 char ISA;
12174 unsigned VecRegSize;
12175 };
12176 ISADataTy ISAData[] = {
12177 {'b', 128}, // SSE
12178 {'c', 256}, // AVX
12179 {'d', 256}, // AVX2
12180 {'e', 512}, // AVX512
12181 };
12183 switch (Branch) {
12185 Masked.push_back('N');
12186 Masked.push_back('M');
12187 break;
12189 Masked.push_back('N');
12190 break;
12192 Masked.push_back('M');
12193 break;
12194 }
12195 for (char Mask : Masked) {
12196 for (const ISADataTy &Data : ISAData) {
12198 llvm::raw_svector_ostream Out(Buffer);
12199 Out << "_ZGV" << Data.ISA << Mask;
12200 if (!VLENVal) {
12201 assert(NumElts && "Non-zero simdlen/cdtsize expected");
12202 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
12203 } else {
12204 Out << VLENVal;
12205 }
12206 Out << mangleVectorParameters(ParamAttrs);
12207 Out << '_' << Fn->getName();
12208 Fn->addFnAttr(Out.str());
12209 }
12210 }
12211}
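// Example: for a function "foo" with a single vector parameter ("v"), no
// explicit simdlen (VLENVal == 0) and NumElts == 32 (i.e. a 32-bit
// characteristic data type), the unmasked ('N') variants emitted above are
//   "_ZGVbN4v_foo"   // SSE:    128 / 32 = 4 lanes
//   "_ZGVcN8v_foo"   // AVX:    256 / 32 = 8 lanes
//   "_ZGVdN8v_foo"   // AVX2:   256 / 32 = 8 lanes
//   "_ZGVeN16v_foo"  // AVX512: 512 / 32 = 16 lanes
// with matching 'M' variants depending on the branch state.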
12212
12213// Function used to add the attribute. The parameter `VLEN` is templated to
12214// allow the use of `x` when targeting scalable functions for SVE.
12215template <typename T>
12216static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
12217 char ISA, StringRef ParSeq,
12218 StringRef MangledName, bool OutputBecomesInput,
12219 llvm::Function *Fn) {
12220 SmallString<256> Buffer;
12221 llvm::raw_svector_ostream Out(Buffer);
12222 Out << Prefix << ISA << LMask << VLEN;
12223 if (OutputBecomesInput)
12224 Out << 'v';
12225 Out << ParSeq << '_' << MangledName;
12226 Fn->addFnAttr(Out.str());
12227}
12228
12229// Helper function to generate the Advanced SIMD names depending on the value
12230// of the NDS when simdlen is not present.
12231static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
12232 StringRef Prefix, char ISA,
12233 StringRef ParSeq, StringRef MangledName,
12234 bool OutputBecomesInput,
12235 llvm::Function *Fn) {
12236 switch (NDS) {
12237 case 8:
12238 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12239 OutputBecomesInput, Fn);
12240 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
12241 OutputBecomesInput, Fn);
12242 break;
12243 case 16:
12244 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12245 OutputBecomesInput, Fn);
12246 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12247 OutputBecomesInput, Fn);
12248 break;
12249 case 32:
12250 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12251 OutputBecomesInput, Fn);
12252 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12253 OutputBecomesInput, Fn);
12254 break;
12255 case 64:
12256 case 128:
12257 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12258 OutputBecomesInput, Fn);
12259 break;
12260 default:
12261 llvm_unreachable("Scalar type is too wide.");
12262 }
12263}
12264
12265/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
12267 llvm::Function *Fn, unsigned UserVLEN,
12269 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
12270 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
12271
12272 // Sort out parameter sequence.
12273 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
12274 StringRef Prefix = "_ZGV";
12275 StringRef MangledName = Fn->getName();
12276
12277 // Generate simdlen from user input (if any).
12278 if (UserVLEN) {
12279 if (ISA == 's') {
12280 // SVE generates only a masked function.
12281 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12282 OutputBecomesInput, Fn);
12283 return;
12284 }
12285
12286 switch (Branch) {
12288 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12289 OutputBecomesInput, Fn);
12290 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12291 OutputBecomesInput, Fn);
12292 break;
12294 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12295 OutputBecomesInput, Fn);
12296 break;
12298 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12299 OutputBecomesInput, Fn);
12300 break;
12301 }
12302 return;
12303 }
12304
12305 if (ISA == 's') {
12306 // SVE, section 3.4.1, item 1.
12307 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
12308 OutputBecomesInput, Fn);
12309 return;
12310 }
12311
12312 switch (Branch) {
12314 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12315 MangledName, OutputBecomesInput, Fn);
12316 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12317 MangledName, OutputBecomesInput, Fn);
12318 break;
12320 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12321 MangledName, OutputBecomesInput, Fn);
12322 break;
12324 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12325 MangledName, OutputBecomesInput, Fn);
12326 break;
12327 }
12328}
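// Example (assuming OutputBecomesInput is false and one vector parameter):
// for ISA == 's' (SVE) without a user simdlen, a single scalable masked
// variant is emitted:
//   "_ZGVsMxv_foo"
// For ISA == 'n' (Advanced SIMD) with NarrowestDataSize == 32 and both masked
// and unmasked variants requested, the names are
//   "_ZGVnN2v_foo", "_ZGVnN4v_foo", "_ZGVnM2v_foo" and "_ZGVnM4v_foo".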
12329
12330//===----------------------------------------------------------------------===//
12331// OffloadEntriesInfoManager
12332//===----------------------------------------------------------------------===//
12333
12335 return OffloadEntriesTargetRegion.empty() &&
12336 OffloadEntriesDeviceGlobalVar.empty();
12337}
12338
12339unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
12340 const TargetRegionEntryInfo &EntryInfo) const {
12341 auto It = OffloadEntriesTargetRegionCount.find(
12342 getTargetRegionEntryCountKey(EntryInfo));
12343 if (It == OffloadEntriesTargetRegionCount.end())
12344 return 0;
12345 return It->second;
12346}
12347
12348void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
12349 const TargetRegionEntryInfo &EntryInfo) {
12350 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
12351 EntryInfo.Count + 1;
12352}
12353
12354/// Initialize target region entry.
12356 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
12357 OffloadEntriesTargetRegion[EntryInfo] =
12358 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
12360 ++OffloadingEntriesNum;
12361}
12362
12364 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
12366 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
12367
12368 // Update the EntryInfo with the next available count for this location.
12369 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12370
12371  // If we are emitting code for a target, the entry is already initialized;
12372  // it only has to be registered.
12373 if (OMPBuilder->Config.isTargetDevice()) {
12374 // This could happen if the device compilation is invoked standalone.
12375 if (!hasTargetRegionEntryInfo(EntryInfo)) {
12376 return;
12377 }
12378 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
12379 Entry.setAddress(Addr);
12380 Entry.setID(ID);
12381 Entry.setFlags(Flags);
12382 } else {
12384 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
12385 return;
12386 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
12387 "Target region entry already registered!");
12388 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
12389 OffloadEntriesTargetRegion[EntryInfo] = Entry;
12390 ++OffloadingEntriesNum;
12391 }
12392 incrementTargetRegionEntryInfoCount(EntryInfo);
12393}
12394
12396 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
12397
12398 // Update the EntryInfo with the next available count for this location.
12399 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12400
12401 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
12402 if (It == OffloadEntriesTargetRegion.end()) {
12403 return false;
12404 }
12405 // Fail if this entry is already registered.
12406 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
12407 return false;
12408 return true;
12409}
12410
12412 const OffloadTargetRegionEntryInfoActTy &Action) {
12413 // Scan all target region entries and perform the provided action.
12414 for (const auto &It : OffloadEntriesTargetRegion) {
12415 Action(It.first, It.second);
12416 }
12417}
12418
12420 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
12421 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12422 ++OffloadingEntriesNum;
12423}
12424
12426 StringRef VarName, Constant *Addr, int64_t VarSize,
12428 if (OMPBuilder->Config.isTargetDevice()) {
12429 // This could happen if the device compilation is invoked standalone.
12430 if (!hasDeviceGlobalVarEntryInfo(VarName))
12431 return;
12432 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12433 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12434 if (Entry.getVarSize() == 0) {
12435 Entry.setVarSize(VarSize);
12436 Entry.setLinkage(Linkage);
12437 }
12438 return;
12439 }
12440 Entry.setVarSize(VarSize);
12441 Entry.setLinkage(Linkage);
12442 Entry.setAddress(Addr);
12443 } else {
12444 if (hasDeviceGlobalVarEntryInfo(VarName)) {
12445 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12446 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12447 "Entry not initialized!");
12448 if (Entry.getVarSize() == 0) {
12449 Entry.setVarSize(VarSize);
12450 Entry.setLinkage(Linkage);
12451 }
12452 return;
12453 }
12455 Flags ==
12457 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12458 Addr, VarSize, Flags, Linkage,
12459 VarName.str());
12460 else
12461 OffloadEntriesDeviceGlobalVar.try_emplace(
12462 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12463 ++OffloadingEntriesNum;
12464 }
12465}
12466
12469  // Scan all device global variable entries and perform the provided action.
12470 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12471 Action(E.getKey(), E.getValue());
12472}
12473
12474//===----------------------------------------------------------------------===//
12475// CanonicalLoopInfo
12476//===----------------------------------------------------------------------===//
12477
12478void CanonicalLoopInfo::collectControlBlocks(
12480  // We only count those BBs as control blocks for which we do not need to
12481 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12482 // flow. For consistency, this also means we do not add the Body block, which
12483 // is just the entry to the body code.
12484 BBs.reserve(BBs.size() + 6);
12485 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12486}
12487
12489 assert(isValid() && "Requires a valid canonical loop");
12490 for (BasicBlock *Pred : predecessors(Header)) {
12491 if (Pred != Latch)
12492 return Pred;
12493 }
12494 llvm_unreachable("Missing preheader");
12495}
12496
12497void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12498 assert(isValid() && "Requires a valid canonical loop");
12499
12500 Instruction *CmpI = &getCond()->front();
12501 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12502 CmpI->setOperand(1, TripCount);
12503
12504#ifndef NDEBUG
12505 assertOK();
12506#endif
12507}
12508
12509void CanonicalLoopInfo::mapIndVar(
12510 llvm::function_ref<Value *(Instruction *)> Updater) {
12511 assert(isValid() && "Requires a valid canonical loop");
12512
12513 Instruction *OldIV = getIndVar();
12514
12515  // Record all current uses so that uses later introduced by the updater are
12516  // not replaced. Uses by the CanonicalLoopInfo itself (in the condition and
12517  // latch blocks) to keep track of the number of iterations are also excluded.
12518 SmallVector<Use *> ReplacableUses;
12519 for (Use &U : OldIV->uses()) {
12520 auto *User = dyn_cast<Instruction>(U.getUser());
12521 if (!User)
12522 continue;
12523 if (User->getParent() == getCond())
12524 continue;
12525 if (User->getParent() == getLatch())
12526 continue;
12527 ReplacableUses.push_back(&U);
12528 }
12529
12530 // Run the updater that may introduce new uses
12531 Value *NewIV = Updater(OldIV);
12532
12533 // Replace the old uses with the value returned by the updater.
12534 for (Use *U : ReplacableUses)
12535 U->set(NewIV);
12536
12537#ifndef NDEBUG
12538 assertOK();
12539#endif
12540}
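// A typical use of mapIndVar is to derive a user-visible induction value from
// the canonical 0..TripCount counter (a sketch; Start and Step are assumed to
// be previously computed Values of the induction variable's type):
//   CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     Value *Scaled = Builder.CreateMul(OldIV, Step);
//     return Builder.CreateAdd(Start, Scaled);
//   });
// Uses of OldIV in the condition and latch blocks are left untouched, as
// described above.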
12541
12543#ifndef NDEBUG
12544 // No constraints if this object currently does not describe a loop.
12545 if (!isValid())
12546 return;
12547
12548 BasicBlock *Preheader = getPreheader();
12549 BasicBlock *Body = getBody();
12550 BasicBlock *After = getAfter();
12551
12552 // Verify standard control-flow we use for OpenMP loops.
12553 assert(Preheader);
12554 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12555 "Preheader must terminate with unconditional branch");
12556 assert(Preheader->getSingleSuccessor() == Header &&
12557 "Preheader must jump to header");
12558
12559 assert(Header);
12560 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12561 "Header must terminate with unconditional branch");
12562 assert(Header->getSingleSuccessor() == Cond &&
12563 "Header must jump to exiting block");
12564
12565 assert(Cond);
12566 assert(Cond->getSinglePredecessor() == Header &&
12567 "Exiting block only reachable from header");
12568
12569 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12570 "Exiting block must terminate with conditional branch");
12571 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12572         "Exiting block's first successor must jump to the body");
12573 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12574 "Exiting block's second successor must exit the loop");
12575
12576 assert(Body);
12577 assert(Body->getSinglePredecessor() == Cond &&
12578 "Body only reachable from exiting block");
12579 assert(!isa<PHINode>(Body->front()));
12580
12581 assert(Latch);
12582 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12583 "Latch must terminate with unconditional branch");
12584 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12585  // TODO: To support simple redirecting of the end of the body code that has
12586  // multiple exits, introduce another auxiliary basic block like preheader and after.
12587 assert(Latch->getSinglePredecessor() != nullptr);
12588 assert(!isa<PHINode>(Latch->front()));
12589
12590 assert(Exit);
12591 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12592 "Exit block must terminate with unconditional branch");
12593 assert(Exit->getSingleSuccessor() == After &&
12594 "Exit block must jump to after block");
12595
12596 assert(After);
12597 assert(After->getSinglePredecessor() == Exit &&
12598 "After block only reachable from exit block");
12599 assert(After->empty() || !isa<PHINode>(After->front()));
12600
12601 Instruction *IndVar = getIndVar();
12602 assert(IndVar && "Canonical induction variable not found?");
12603 assert(isa<IntegerType>(IndVar->getType()) &&
12604 "Induction variable must be an integer");
12605 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12606 "Induction variable must be a PHI in the loop header");
12607 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12608 assert(
12609 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12610 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12611
12612 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12613 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12614 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12615 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12616 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12617 ->isOne());
12618
12619 Value *TripCount = getTripCount();
12620 assert(TripCount && "Loop trip count not found?");
12621 assert(IndVar->getType() == TripCount->getType() &&
12622 "Trip count and induction variable must have the same type");
12623
12624 auto *CmpI = cast<CmpInst>(&Cond->front());
12625 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12626         "Exit condition must be an unsigned less-than comparison");
12627 assert(CmpI->getOperand(0) == IndVar &&
12628 "Exit condition must compare the induction variable");
12629 assert(CmpI->getOperand(1) == TripCount &&
12630 "Exit condition must compare with the trip count");
12631#endif
12632}
12633
12635 Header = nullptr;
12636 Cond = nullptr;
12637 Latch = nullptr;
12638 Exit = nullptr;
12639}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static Function * createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn)
Create wrapper function used to gather the outlined function's argument structure from a shared buffe...
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static bool isGenericKernel(Function &Fn)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static std::optional< omp::OMPTgtExecModeFlags > getTargetKernelExecMode(Function &Kernel)
Given a function, if it represents the entry point of a target kernel, this returns the execution mod...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:109
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:150
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:865
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
InsertPoint - A saved insertion point.
Definition IRBuilder.h:298
BasicBlock * getBlock() const
Definition IRBuilder.h:313
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:311
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:314
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:318
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:330
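A minimal sketch of how a saved insertion point is typically used; the context, blocks and builder below are placeholders, not part of this file:
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(EntryBB);                    // emit into EntryBB
  IRBuilderBase::InsertPoint IP = Builder.saveIP();   // remember this position
  Builder.SetInsertPoint(HelperBB);                   // temporarily emit elsewhere
  // ... create instructions in HelperBB ...
  Builder.restoreIP(IP);                              // resume at the saved point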
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:996
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the innermost loop that BB lives in.
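A hedged sketch of querying LoopInfo through the new pass manager; the PassBuilder/FunctionAnalysisManager setup is the standard registration and F is a placeholder Function:
  PassBuilder PB;
  FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);                   // makes LoopAnalysis available
  LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);      // run (or reuse) the analysis
  if (Loop *L = LI.getLoopFor(&F.getEntryBlock()))
    for (BasicBlock *BB : L->getBlocks())             // blocks making up the loop
      (void)BB;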
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
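A small example of building and attaching a metadata tuple with these helpers; Ctx and I are placeholders for an LLVMContext and an Instruction:
  MDString *Tag = MDString::get(Ctx, "example.tag");
  MDNode *Node = MDNode::get(Ctx, {Tag});             // uniqued tuple with one operand
  unsigned KindID = Ctx.getMDKindID("example.annotation");
  I->setMetadata(KindID, Node);                       // attach it to an instruction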
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
size_type size() const
Definition MapVector.h:58
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:287
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:280
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target to.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
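A hedged sketch of fetching one of these declarations and calling it; OMPBuilder, M and the chosen runtime function are illustrative only, and the ident argument is left as a placeholder:
  FunctionCallee Flush =
      OMPBuilder.getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_flush);
  Value *Ident = /* ident_t* built via getOrCreateIdent(...) */ nullptr;
  OMPBuilder.Builder.CreateCall(Flush, {Ident});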
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
LLVM_ABI CallInst * createOMPAllocShared(const LocationDescription &Loc, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_alloc_shared.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
LLVM_ABI InsertPointOrErrorTy createScope(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait)
Generator for 'omp scope'.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on threads for Kernel.
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies={}, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
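A short sketch combining this helper with getOrCreateIdent to build the ident_t* most runtime calls expect; the location string is a made-up example:
  uint32_t SrcLocStrSize;
  Constant *LocStr =
      OMPBuilder.getOrCreateSrcLocStr(";example.c;foo;1;1;;", SrcLocStrSize);
  Constant *Ident = OMPBuilder.getOrCreateIdent(LocStr, SrcLocStrSize);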
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> TargetBodyGenCallbackTy
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, const DependenciesInfo &Dependencies={}, const AffinityData &Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={}, ArrayRef< BasicBlock * > DeallocBlocks={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
void addOutlineInfo(std::unique_ptr< OutlineInfo > &&OI)
Add a new region that will be outlined later.
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number.
LLVM_ABI void emitTaskDependency(IRBuilderBase &Builder, Value *Entry, const DependData &Dep)
Store one kmp_depend_info entry at the given Entry pointer.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI CallInst * createOMPAlignedAlloc(const LocationDescription &Loc, Value *Align, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_align_alloc.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPFreeShared(const LocationDescription &Loc, Value *Addr, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_free_shared.
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
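A hedged sketch of emitting an explicit barrier at the current insertion point; the directive kind is just an example and error handling is abbreviated:
  OpenMPIRBuilder::LocationDescription Loc(OMPBuilder.Builder);
  OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
  if (!AfterIP)
    report_fatal_error(AfterIP.takeError());          // real callers would propagate the Error
  OMPBuilder.Builder.restoreIP(*AfterIP);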
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
SmallVector< std::unique_ptr< OutlineInfo >, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
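A hedged sketch of the callback-driven flow this interface implies; TripCount and the loop name are placeholders and error handling is abbreviated:
  auto BodyGen = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                     Value *IndVar) -> Error {
    OMPBuilder.Builder.restoreIP(CodeGenIP);
    // ... emit one iteration of the loop body using IndVar ...
    return Error::success();
  };
  Expected<CanonicalLoopInfo *> CLI = OMPBuilder.createCanonicalLoop(
      OpenMPIRBuilder::LocationDescription(OMPBuilder.Builder), BodyGen,
      TripCount, "example.loop");
  if (!CLI)
    report_fatal_error(CLI.takeError());
  // The resulting CanonicalLoopInfo can then be handed to applyWorkshareLoop,
  // tileLoops, the unrollLoop* helpers, etc. listed in this section.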
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< InsertPointTy > DeallocIPs)> TargetGenArgAccessorsCallbackTy
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
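A small sketch of the AtomicOpValue-based atomic helpers listed here; XPtr, VPtr, Expr, Int32Ty and AllocaIP are placeholders, and it is assumed that AtomicOpValue packs the variable pointer, element type and signedness/volatility flags:
  OpenMPIRBuilder::LocationDescription Loc(OMPBuilder.Builder);
  OpenMPIRBuilder::AtomicOpValue X{XPtr, Int32Ty, /*IsSigned=*/true, /*IsVolatile=*/false};
  OpenMPIRBuilder::AtomicOpValue V{VPtr, Int32Ty, /*IsSigned=*/true, /*IsVolatile=*/false};
  OMPBuilder.createAtomicRead(Loc, X, V, AtomicOrdering::Monotonic, AllocaIP);     // V = X
  OMPBuilder.createAtomicWrite(Loc, X, Expr, AtomicOrdering::Monotonic, AllocaIP); // X = Expr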
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after the scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before the scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, the input phase is being lowered; otherwise the scan phase is being lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
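A tiny usage example of the pointer-set API described above; BB and Worklist are placeholders:
  SmallPtrSet<BasicBlock *, 8> Visited;
  if (Visited.insert(BB).second)        // true only the first time BB is inserted
    Worklist.push_back(BB);
  bool Seen = Visited.count(BB);        // 1 if BB is in the set, 0 otherwise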
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
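A minimal illustration of turning a plain load/store pair into atomic accesses with these setters; Builder and Ptr are placeholders:
  LoadInst *LI = Builder.CreateAlignedLoad(Builder.getInt32Ty(), Ptr, Align(4));
  LI->setAtomic(AtomicOrdering::Acquire);             // acquire load
  StoreInst *SI = Builder.CreateAlignedStore(LI, Ptr, Align(4));
  SI->setAtomic(AtomicOrdering::Release);             // release store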
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::string str() const
Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
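A brief example of both creation paths mentioned here, with Ctx standing in for an LLVMContext:
  // Literal (uniqued) struct type: { i32, ptr }
  StructType *Literal = StructType::get(
      Ctx, {Type::getInt32Ty(Ctx), PointerType::getUnqual(Type::getInt8Ty(Ctx))});
  // Identified struct that can be named first and given a body later.
  StructType *Named = StructType::create(Ctx, "kmp_example_t");
  Named->setBody({Type::getInt64Ty(Ctx)});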
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1051
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1111
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1125
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll(OptimizationRemarkEmitter *ORE=nullptr, const Loop *L=nullptr) const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:173
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to a SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr)
Definition Utility.cpp:105
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
@ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:557
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:376
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
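A hedged sketch of the splitBB helper as declared here; Builder and its current debug location are placeholders:
  // Split at the current insertion point; with CreateBranch=true the old block
  // ends in an unconditional branch to the new one.
  BasicBlock *NewBB = splitBB(Builder.saveIP(), /*CreateBranch=*/true,
                              Builder.getCurrentDebugLocation(), "after.split");
  Builder.SetInsertPoint(NewBB, NewBB->begin());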
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make an Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
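For instance:
  unsigned Chunks = llvm::divideCeil(10u, 3u); // 4, i.e. ceil(10 / 3)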
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
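A hedged sketch, assuming BB and SplitPt are valid and no analyses are being preserved:
  DominatorTree *DT = nullptr; // not updating the dominator tree in this sketch
  BasicBlock *Tail = llvm::SplitBlock(BB, SplitPt->getIterator(), DT,
                                      /*LI=*/nullptr, /*MSSAU=*/nullptr, "tail");
  // BB now ends with an unconditional branch to Tail, which holds SplitPt onward.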
Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:27
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
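For example:
  for (int I : llvm::seq(0, 4))
    errs() << I << " "; // prints 0 1 2 3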
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
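For example:
  int N = 0;
  bool Ok = llvm::to_integer(StringRef("0x2a"), N); // Ok is true, N == 42 (radix auto-detected)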
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
omp::RTLDependenceKindTy DepKind
A struct to pack static and dynamic dependency information for a task.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of an LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
virtual LLVM_ABI std::unique_ptr< CodeExtractor > createCodeExtractor(ArrayRef< BasicBlock * > Blocks, bool ArgsInZeroAddressSpace, Twine Suffix=Twine(""))
Create a CodeExtractor instance based on the information stored in this structure,...
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel attributes.
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
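A hedged sketch of composing a target-region entry name (the IDs below are placeholders):
  SmallString<128> EntryName;
  TargetRegionEntryInfo::getTargetRegionEntryFnName(
      EntryName, /*ParentName=*/"foo", /*DeviceID=*/1, /*FileID=*/2,
      /*Line=*/10, /*Count=*/0);
  // Produces a mangled name starting with the kernel name prefix described below.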
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static LLVM_ABI const Target * lookupTarget(const Triple &TheTriple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop body).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.