LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
20#include "llvm/ADT/StringRef.h"
31#include "llvm/IR/Attributes.h"
32#include "llvm/IR/BasicBlock.h"
33#include "llvm/IR/CFG.h"
34#include "llvm/IR/CallingConv.h"
35#include "llvm/IR/Constant.h"
36#include "llvm/IR/Constants.h"
37#include "llvm/IR/DIBuilder.h"
40#include "llvm/IR/Function.h"
42#include "llvm/IR/IRBuilder.h"
45#include "llvm/IR/LLVMContext.h"
46#include "llvm/IR/MDBuilder.h"
47#include "llvm/IR/Metadata.h"
49#include "llvm/IR/PassManager.h"
51#include "llvm/IR/Value.h"
54#include "llvm/Support/Error.h"
66
67#include <cstdint>
68#include <optional>
69
70#define DEBUG_TYPE "openmp-ir-builder"
71
72using namespace llvm;
73using namespace omp;
74
// Hidden boolean command-line option, off by default, registered under the
// name "openmp-ir-builder-optimistic-attributes".
75static cl::opt<bool>
76 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
77 cl::desc("Use optimistic attributes describing "
78 "'as-if' properties of runtime calls."),
79 cl::init(false));
80
82 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
83 cl::desc("Factor for the unroll threshold to account for code "
84 "simplifications still taking place"),
85 cl::init(1.5));
86
88 "openmp-ir-builder-use-default-max-threads", cl::Hidden,
89 cl::desc("Use a default max threads if none is provided."), cl::init(true));
90
91#ifndef NDEBUG
92/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
93/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
94/// an InsertPoint stores the instruction before something is inserted. For
95/// instance, if both point to the same instruction, two IRBuilders alternating
96/// creating instruction will cause the instructions to be interleaved.
// An insert point that is not set is never reported as ambiguous.
99 if (!IP1.isSet() || !IP2.isSet())
100 return false;
// Ambiguous exactly when both name the same block and the same position in it.
101 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
102}
103
// Validity check for a schedule type value SchedType. It returns true only
// when (a) SchedType with its monotonicity bits masked off is one of the
// enumerators listed in the switch below, and (b) the monotonicity bits are
// not all set at once.
105 // Valid ordered/unordered and base algorithm combinations.
106 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
107 case OMPScheduleType::UnorderedStaticChunked:
108 case OMPScheduleType::UnorderedStatic:
109 case OMPScheduleType::UnorderedDynamicChunked:
110 case OMPScheduleType::UnorderedGuidedChunked:
111 case OMPScheduleType::UnorderedRuntime:
112 case OMPScheduleType::UnorderedAuto:
113 case OMPScheduleType::UnorderedTrapezoidal:
114 case OMPScheduleType::UnorderedGreedy:
115 case OMPScheduleType::UnorderedBalanced:
116 case OMPScheduleType::UnorderedGuidedIterativeChunked:
117 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
118 case OMPScheduleType::UnorderedSteal:
119 case OMPScheduleType::UnorderedStaticBalancedChunked:
120 case OMPScheduleType::UnorderedGuidedSimd:
121 case OMPScheduleType::UnorderedRuntimeSimd:
122 case OMPScheduleType::OrderedStaticChunked:
123 case OMPScheduleType::OrderedStatic:
124 case OMPScheduleType::OrderedDynamicChunked:
125 case OMPScheduleType::OrderedGuidedChunked:
126 case OMPScheduleType::OrderedRuntime:
127 case OMPScheduleType::OrderedAuto:
128 case OMPScheduleType::OrderdTrapezoidal:
129 case OMPScheduleType::NomergeUnorderedStaticChunked:
130 case OMPScheduleType::NomergeUnorderedStatic:
131 case OMPScheduleType::NomergeUnorderedDynamicChunked:
132 case OMPScheduleType::NomergeUnorderedGuidedChunked:
133 case OMPScheduleType::NomergeUnorderedRuntime:
134 case OMPScheduleType::NomergeUnorderedAuto:
135 case OMPScheduleType::NomergeUnorderedTrapezoidal:
136 case OMPScheduleType::NomergeUnorderedGreedy:
137 case OMPScheduleType::NomergeUnorderedBalanced:
138 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
139 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
140 case OMPScheduleType::NomergeUnorderedSteal:
141 case OMPScheduleType::NomergeOrderedStaticChunked:
142 case OMPScheduleType::NomergeOrderedStatic:
143 case OMPScheduleType::NomergeOrderedDynamicChunked:
144 case OMPScheduleType::NomergeOrderedGuidedChunked:
145 case OMPScheduleType::NomergeOrderedRuntime:
146 case OMPScheduleType::NomergeOrderedAuto:
147 case OMPScheduleType::NomergeOrderedTrapezoidal:
148 case OMPScheduleType::OrderedDistributeChunked:
149 case OMPScheduleType::OrderedDistribute:
150 break;
151 default:
// Anything not listed above is rejected outright.
152 return false;
153 }
154
155 // Must not set both monotonicity modifiers at the same time.
156 OMPScheduleType MonotonicityFlags =
157 SchedType & OMPScheduleType::MonotonicityMask;
// The flags equal the full mask only when every monotonicity bit is set.
158 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
159 return false;
160
161 return true;
162}
163#endif
164
165/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
166/// debug location to the last instruction in the specified basic block if the
167/// insert point points to the end of the block.
170 Builder.restoreIP(IP);
// Re-read the block and position from the builder after the restore.
171 llvm::BasicBlock *BB = Builder.GetInsertBlock();
172 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
// Only a non-empty block has a last instruction (BB->back()) to take the
// debug location from; other insert positions are left untouched here.
173 if (!BB->empty() && I == BB->end())
174 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
175}
176
177static bool hasGridValue(const Triple &T) {
178 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
179}
180
// Selects a grid-value table (omp::GV) by target triple \p T.
// NOTE(review): the return statement of every branch is missing from this
// listing, so the concrete tables cannot be named here.
181static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
182 if (T.isAMDGPU()) {
// The AMDGPU choice depends on whether the kernel's "target-features"
// attribute string contains "+wavefrontsize64" at least once.
183 StringRef Features =
184 Kernel->getFnAttribute("target-features").getValueAsString();
185 if (Features.count("+wavefrontsize64"))
188 }
189 if (T.isNVPTX())
191 if (T.isSPIRV())
// Any other architecture is a programming error.
193 llvm_unreachable("No grid value available for this architecture!");
194}
195
196/// Determine which scheduling algorithm to use, determined from schedule clause
197/// arguments.
// The result carries only the base algorithm. HasChunks matters for the
// default/static kinds, HasSimdModifier for guided/runtime, and
// HasDistScheduleChunks for distribute.
198static OMPScheduleType
199getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
200 bool HasSimdModifier, bool HasDistScheduleChunks) {
201 // Currently, the default schedule is static.
202 switch (ClauseKind) {
203 case OMP_SCHEDULE_Default:
204 case OMP_SCHEDULE_Static:
205 return HasChunks ? OMPScheduleType::BaseStaticChunked
206 : OMPScheduleType::BaseStatic;
207 case OMP_SCHEDULE_Dynamic:
// Dynamic maps to the chunked variant regardless of HasChunks.
208 return OMPScheduleType::BaseDynamicChunked;
209 case OMP_SCHEDULE_Guided:
210 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
211 : OMPScheduleType::BaseGuidedChunked;
// NOTE(review): the statement for the Auto case is missing from this listing.
212 case OMP_SCHEDULE_Auto:
214 case OMP_SCHEDULE_Runtime:
215 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
216 : OMPScheduleType::BaseRuntime;
217 case OMP_SCHEDULE_Distribute:
218 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
219 : OMPScheduleType::BaseDistribute;
220 }
// Reached only if ClauseKind holds a value not covered by the switch.
221 llvm_unreachable("unhandled schedule clause argument");
222}
223
224/// Adds ordering modifier flags to schedule type.
// The input must have no modifier bits set (asserted). The result is the base
// type OR'ed with ModifierOrdered or ModifierUnordered, except for the two
// substitutions handled below.
225static OMPScheduleType
227 bool HasOrderedClause) {
228 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
229 OMPScheduleType::None &&
230 "Must not have ordering nor monotonicity flags already set");
231
232 OMPScheduleType OrderingModifier = HasOrderedClause
233 ? OMPScheduleType::ModifierOrdered
234 : OMPScheduleType::ModifierUnordered;
235 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
236
237 // Unsupported combinations
// Ordered + GuidedSimd is replaced by OrderedGuidedChunked, and
// Ordered + RuntimeSimd by OrderedRuntime.
238 if (OrderingScheduleType ==
239 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
240 return OMPScheduleType::OrderedGuidedChunked;
241 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
242 OMPScheduleType::ModifierOrdered))
243 return OMPScheduleType::OrderedRuntime;
244
245 return OrderingScheduleType;
246}
247
248/// Adds monotonicity modifier flags to schedule type.
// Preconditions (asserted): no monotonicity bit is set yet, and HasMonotonic
// and HasNonmonotonic are not both true. An explicit modifier is applied
// directly; otherwise a default is derived from the base schedule and the
// ordered clause.
249static OMPScheduleType
251 bool HasSimdModifier, bool HasMonotonic,
252 bool HasNonmonotonic, bool HasOrderedClause) {
253 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
254 OMPScheduleType::None &&
255 "Must not have monotonicity flags already set");
256 assert((!HasMonotonic || !HasNonmonotonic) &&
257 "Monotonic and Nonmonotonic are contradicting each other");
258
259 if (HasMonotonic) {
260 return ScheduleType | OMPScheduleType::ModifierMonotonic;
261 } else if (HasNonmonotonic) {
262 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
263 } else {
264 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
265 // If the static schedule kind is specified or if the ordered clause is
266 // specified, and if the nonmonotonic modifier is not specified, the
267 // effect is as if the monotonic modifier is specified. Otherwise, unless
268 // the monotonic modifier is specified, the effect is as if the
269 // nonmonotonic modifier is specified.
// Strip all modifier bits so the comparison sees only the base algorithm.
270 OMPScheduleType BaseScheduleType =
271 ScheduleType & ~OMPScheduleType::ModifierMask;
272 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
273 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
274 HasOrderedClause) {
275 // The monotonic is used by default in openmp runtime library, so no need
276 // to set it.
277 return ScheduleType;
278 } else {
279 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
280 }
281 }
282}
283
284/// Determine the schedule type using schedule and ordering clause arguments.
// Visible steps: the clause arguments are passed to a base-schedule
// computation, the ordering flags are added via getOpenMPOrderingScheduleType,
// and a further call receives the monotonicity-related arguments.
// NOTE(review): the statements declaring BaseSchedule and Result, and the line
// before the return, are missing from this listing.
285static OMPScheduleType
286computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
287 bool HasSimdModifier, bool HasMonotonicModifier,
288 bool HasNonmonotonicModifier, bool HasOrderedClause,
289 bool HasDistScheduleChunks) {
291 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
292 OMPScheduleType OrderedSchedule =
293 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
295 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
296 HasNonmonotonicModifier, HasOrderedClause);
297
299 return Result;
300}
301
302/// Given a function, if it represents the entry point of a target kernel, this
303/// returns the execution mode flags associated with that kernel.
// Returns std::nullopt when the entry block of Kernel contains no call to
// "__kmpc_target_init".
304static std::optional<omp::OMPTgtExecModeFlags>
306 CallInst *TargetInitCall = nullptr;
// Only the entry block is scanned; the first matching call wins.
307 for (Instruction &Inst : Kernel.getEntryBlock()) {
308 if (auto *Call = dyn_cast<CallInst>(&Inst)) {
// NOTE(review): getCalledFunction() is dereferenced unconditionally. If it
// can be null (e.g. for an indirect call), this would crash; confirm that
// such calls cannot appear in a kernel entry block.
309 if (Call->getCalledFunction()->getName() == "__kmpc_target_init") {
310 TargetInitCall = Call;
311 break;
312 }
313 }
314 }
315
316 if (!TargetInitCall)
317 return std::nullopt;
318
319 // Get the kernel mode information from the global variable associated to the
320 // first argument to the call to __kmpc_target_init. Refer to
321 // createTargetInit() to see how this is initialized.
322 Value *InitOperand = TargetInitCall->getArgOperand(0);
323 GlobalVariable *KernelEnv = nullptr;
// The argument is either the global itself or a constant expression whose
// first operand is the global.
324 if (auto *Cast = dyn_cast<ConstantExpr>(InitOperand))
325 KernelEnv = cast<GlobalVariable>(Cast->getOperand(0));
326 else
327 KernelEnv = cast<GlobalVariable>(InitOperand);
// The mode is operand 2 of the struct stored as operand 0 of the global's
// initializer; every cast<> asserts the expected constant kind.
328 auto *KernelEnvInit = cast<ConstantStruct>(KernelEnv->getInitializer());
329 auto *ConfigEnv = cast<ConstantStruct>(KernelEnvInit->getOperand(0));
330 auto *KernelMode = cast<ConstantInt>(ConfigEnv->getOperand(2));
331 return static_cast<OMPTgtExecModeFlags>(KernelMode->getZExtValue());
332}
333
// Returns true when no execution mode is available for Fn, or when the
// available mode has the OMP_TGT_EXEC_MODE_GENERIC bit set.
// NOTE(review): the initializer of ExecMode is missing from this listing.
334static bool isGenericKernel(Function &Fn) {
335 std::optional<omp::OMPTgtExecModeFlags> ExecMode =
337 return !ExecMode || (*ExecMode & OMP_TGT_EXEC_MODE_GENERIC);
338}
339
340/// Make \p Source branch to \p Target.
341///
342/// Handles two situations:
343/// * \p Source already has an unconditional branch.
344/// * \p Source is a degenerate block (no terminator because the BB is
345/// the current head of the IR construction).
347 if (Instruction *Term = Source->getTerminatorOrNull()) {
// An existing terminator must be an unconditional branch (cast<> asserts it).
// The old successor drops Source as a predecessor before the branch is
// retargeted.
348 auto *Br = cast<UncondBrInst>(Term);
349 BasicBlock *Succ = Br->getSuccessor();
350 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
351 Br->setSuccessor(Target);
352 return;
353 }
354
// No terminator yet: append a fresh branch to Source and give it location DL.
355 auto *NewBr = UncondBrInst::Create(Target, Source);
356 NewBr->setDebugLoc(DL);
357}
358
// Moves every instruction from the insert point IP to the end of its block
// into the front of New. With CreateBranch set, it then appends an
// unconditional branch to New, carrying debug location DL, to the old block.
360 bool CreateBranch, DebugLoc DL) {
361 assert(New->getFirstInsertionPt() == New->begin() &&
362 "Target BB must not have PHI nodes");
363
364 // Move instructions to new block.
365 BasicBlock *Old = IP.getBlock();
366 // If the `Old` block is empty then there are no instructions to move. But in
367 // the new debug scheme, it could have trailing debug records which will be
368 // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that for 2
369 // reasons:
370 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
371 // 2. Even if `New` is not empty, the rationale to move those records to `New`
372 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
373 // assumes that `Old` is optimized out and is going away. This is not the case
374 // here. The `Old` block is still being used e.g. a branch instruction is
375 // added to it later in this function.
376 // So we call `BasicBlock::splice` only when `Old` is not empty.
377 if (!Old->empty())
378 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
379
380 if (CreateBranch) {
381 auto *NewBr = UncondBrInst::Create(New, Old);
382 NewBr->setDebugLoc(DL);
383 }
384}
385
386void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
387 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
388 BasicBlock *Old = Builder.GetInsertBlock();
389
390 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
391 if (CreateBranch)
392 Builder.SetInsertPoint(Old->getTerminator());
393 else
394 Builder.SetInsertPoint(Old);
395
396 // SetInsertPoint also updates the Builder's debug location, but we want to
397 // keep the one the Builder was configured to use.
398 Builder.SetCurrentDebugLocation(DebugLoc);
399}
400
// Creates a block placed after the one holding IP, moves the tail of the old
// block into it via spliceBB, and returns the new block. An empty Name makes
// the new block reuse the old block's name.
// NOTE(review): the signature and the start of the statement creating New are
// missing from this listing.
402 DebugLoc DL, llvm::Twine Name) {
403 BasicBlock *Old = IP.getBlock();
405 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
406 Old->getParent(), Old->getNextNode());
407 spliceBB(IP, New, CreateBranch, DL);
// PHI nodes in the successors of New that named Old now name New instead.
408 New->replaceSuccessorsPhiUsesWith(Old, New);
409 return New;
410}
411
412BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
413 llvm::Twine Name) {
414 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
415 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
416 if (CreateBranch)
417 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
418 else
419 Builder.SetInsertPoint(Builder.GetInsertBlock());
420 // SetInsertPoint also updates the Builder's debug location, but we want to
421 // keep the one the Builder was configured to use.
422 Builder.SetCurrentDebugLocation(DebugLoc);
423 return New;
424}
425
426BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
427 llvm::Twine Name) {
428 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
429 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
430 if (CreateBranch)
431 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
432 else
433 Builder.SetInsertPoint(Builder.GetInsertBlock());
434 // SetInsertPoint also updates the Builder's debug location, but we want to
435 // keep the one the Builder was configured to use.
436 Builder.SetCurrentDebugLocation(DebugLoc);
437 return New;
438}
439
// Splits the builder's current block via splitBB, naming the new block after
// the current block's name with Suffix appended.
441 llvm::Twine Suffix) {
442 BasicBlock *Old = Builder.GetInsertBlock();
443 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
444}
445
446// This function creates a fake integer value and a fake use for the integer
447// value. It returns the fake value created. This is useful in modeling the
448// extra arguments to the outlined functions.
// Every instruction created here is appended to ToBeDeleted. The value is an
// alloca (AsPtr) or a load from that alloca, emitted at OuterAllocaIP; the
// fake use is emitted at InnerAllocaIP. Is64Bit selects i64 over i32.
450 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
452 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
453 const Twine &Name = "", bool AsPtr = true,
454 bool Is64Bit = false) {
455 Builder.restoreIP(OuterAllocaIP);
456 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
457 Instruction *FakeVal;
458 AllocaInst *FakeValAddr =
459 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
460 ToBeDeleted.push_back(FakeValAddr);
461
462 if (AsPtr) {
463 FakeVal = FakeValAddr;
464 } else {
465 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
466 ToBeDeleted.push_back(FakeVal);
467 }
468
469 // Generate a fake use of this value
470 Builder.restoreIP(InnerAllocaIP);
471 Instruction *UseFakeVal;
472 if (AsPtr) {
// Pointer form: the use is a load through the fake address.
473 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
474 } else {
// Value form: the use is an add of the constant 10 to the fake value.
475 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
476 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
477 }
478 ToBeDeleted.push_back(UseFakeVal);
479 return FakeVal;
480}
481
482//===----------------------------------------------------------------------===//
483// OpenMPIRBuilderConfig
484//===----------------------------------------------------------------------===//
485
486namespace {
488/// Values for bit flags for marking which requires clauses have been used.
489enum OpenMPOffloadingRequiresDirFlags {
490 /// flag undefined.
491 OMP_REQ_UNDEFINED = 0x000,
492 /// no requires directive present.
493 OMP_REQ_NONE = 0x001,
494 /// reverse_offload clause.
495 OMP_REQ_REVERSE_OFFLOAD = 0x002,
496 /// unified_address clause.
497 OMP_REQ_UNIFIED_ADDRESS = 0x004,
498 /// unified_shared_memory clause.
499 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
500 /// dynamic_allocators clause.
501 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
502 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
503};
504
505class OMPCodeExtractor : public CodeExtractor {
506public:
507 OMPCodeExtractor(OpenMPIRBuilder &OMPBuilder, ArrayRef<BasicBlock *> BBs,
508 DominatorTree *DT = nullptr, bool AggregateArgs = false,
509 BlockFrequencyInfo *BFI = nullptr,
510 BranchProbabilityInfo *BPI = nullptr,
511 AssumptionCache *AC = nullptr, bool AllowVarArgs = false,
512 bool AllowAlloca = false,
513 BasicBlock *AllocationBlock = nullptr,
514 ArrayRef<BasicBlock *> DeallocationBlocks = {},
515 std::string Suffix = "", bool ArgsInZeroAddressSpace = false)
516 : CodeExtractor(BBs, DT, AggregateArgs, BFI, BPI, AC, AllowVarArgs,
517 AllowAlloca, AllocationBlock, DeallocationBlocks, Suffix,
518 ArgsInZeroAddressSpace),
519 OMPBuilder(OMPBuilder) {}
520
521 virtual ~OMPCodeExtractor() = default;
522
523protected:
524 OpenMPIRBuilder &OMPBuilder;
525};
526
527class DeviceSharedMemCodeExtractor : public OMPCodeExtractor {
528public:
529 using OMPCodeExtractor::OMPCodeExtractor;
530 virtual ~DeviceSharedMemCodeExtractor() = default;
531
532protected:
533 virtual Instruction *
534 allocateVar(IRBuilder<>::InsertPoint AllocaIP, Type *VarType,
535 const Twine &Name = Twine(""),
536 AddrSpaceCastInst **CastedAlloc = nullptr) override {
537 return OMPBuilder.createOMPAllocShared(AllocaIP, VarType, Name);
538 }
539
540 virtual Instruction *deallocateVar(IRBuilder<>::InsertPoint DeallocIP,
541 Value *Var, Type *VarType) override {
542 return OMPBuilder.createOMPFreeShared(DeallocIP, Var, VarType);
543 }
544};
545
546/// Helper storing information about regions to outline using device shared
547/// memory for intermediate allocations.
548struct DeviceSharedMemOutlineInfo : public OpenMPIRBuilder::OutlineInfo {
549 OpenMPIRBuilder &OMPBuilder;
550
551 DeviceSharedMemOutlineInfo(OpenMPIRBuilder &OMPBuilder)
552 : OMPBuilder(OMPBuilder) {}
553 virtual ~DeviceSharedMemOutlineInfo() = default;
554
555 virtual std::unique_ptr<CodeExtractor>
556 createCodeExtractor(ArrayRef<BasicBlock *> Blocks,
557 bool ArgsInZeroAddressSpace,
558 Twine Suffix = Twine("")) override;
559};
560
561} // anonymous namespace
562
564 : RequiresFlags(OMP_REQ_UNDEFINED) {}
565
// Constructor tail: RequiresFlags starts as OMP_REQ_UNDEFINED and gains one
// bit for each "HasRequires..." argument that is true.
// NOTE(review): the start of the signature and part of the initializer list
// are missing from this listing.
568 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
569 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
572 RequiresFlags(OMP_REQ_UNDEFINED) {
573 if (HasRequiresReverseOffload)
574 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
575 if (HasRequiresUnifiedAddress)
576 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
577 if (HasRequiresUnifiedSharedMemory)
578 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
579 if (HasRequiresDynamicAllocators)
580 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
581}
582
// Non-zero exactly when the reverse_offload bit is set in RequiresFlags.
584 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
585}
586
// Non-zero exactly when the unified_address bit is set in RequiresFlags.
588 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
589}
590
// Non-zero exactly when the unified_shared_memory bit is set in RequiresFlags.
592 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
593}
594
// Non-zero exactly when the dynamic_allocators bit is set in RequiresFlags.
596 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
597}
598
// Yields the stored flags when hasRequiresFlags() is true, and OMP_REQ_NONE
// (widened to int64_t) otherwise.
600 return hasRequiresFlags() ? RequiresFlags
601 : static_cast<int64_t>(OMP_REQ_NONE);
602}
603
// Sets or clears the reverse_offload bit according to Value; other bits are
// left as they are.
605 if (Value)
606 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
607 else
608 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
609}
610
// Sets or clears the unified_address bit according to Value; other bits are
// left as they are.
612 if (Value)
613 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
614 else
615 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
616}
617
// Sets or clears the unified_shared_memory bit according to Value; other bits
// are left as they are.
619 if (Value)
620 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
621 else
622 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
623}
624
// Sets or clears the dynamic_allocators bit according to Value; other bits are
// left as they are.
626 if (Value)
627 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
628 else
629 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
630}
631
632//===----------------------------------------------------------------------===//
633// OpenMPIRBuilder
634//===----------------------------------------------------------------------===//
635
// Fills ArgsVector with the thirteen values listed at the end of this
// function, derived from KernelArgs.
// NOTE(review): the signature and the definition of Version are missing from
// this listing.
638 SmallVector<Value *> &ArgsVector) {
640 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
641 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
642 constexpr size_t MaxDim = 3;
// All-zero [3 x i32] used as the starting value for both dimension arrays.
643 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
644
// The 64-bit flags word is the OR of HasNoWait (unshifted),
// DynCGroupMemFallback shifted left by 2, and StrictBlocksAndThreads shifted
// left by 6.
645 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
646
647 Value *DynCGroupMemFallbackFlag =
648 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
649 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
650
651 Value *StrictFlag = Builder.getInt64(KernelArgs.StrictBlocksAndThreads);
652 StrictFlag = Builder.CreateShl(StrictFlag, 6);
653
654 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
655 Flags = Builder.CreateOr(Flags, StrictFlag);
656
// Element 0 of each list is read unconditionally below.
657 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
658
659 Value *NumTeams3D =
660 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
661 Value *NumThreads3D =
662 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
// Copy the remaining entries, at most MaxDim in total; dimensions that are not
// provided keep their zero value.
663 for (unsigned I :
664 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
665 NumTeams3D =
666 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
667 for (unsigned I :
668 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
669 NumThreads3D =
670 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
671
672 ArgsVector = {Version,
673 PointerNum,
674 KernelArgs.RTArgs.BasePointersArray,
675 KernelArgs.RTArgs.PointersArray,
676 KernelArgs.RTArgs.SizesArray,
677 KernelArgs.RTArgs.MapTypesArray,
678 KernelArgs.RTArgs.MapNamesArray,
679 KernelArgs.RTArgs.MappersArray,
680 KernelArgs.NumIterations,
681 Flags,
682 NumTeams3D,
683 NumThreads3D,
684 KernelArgs.DynCGroupMem};
685}
686
// Merges the attribute sets that OMPKinds.def associates with runtime function
// FnID into the attributes already present on Fn. When FnID has no entry, Fn
// is left unchanged.
// NOTE(review): the signature and the declaration of ArgAttrs are missing from
// this listing.
688 LLVMContext &Ctx = Fn.getContext();
689
690 // Get the function's current attributes.
691 auto Attrs = Fn.getAttributes();
692 auto FnAttrs = Attrs.getFnAttrs();
693 auto RetAttrs = Attrs.getRetAttrs();
695 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
696 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
697
698 // Add AS to FnAS while taking special care with integer extensions.
// A sign/zero-extension attribute is not copied verbatim. It is replaced by
// whatever TargetLibraryInfo reports for an i32 parameter (Param == true) or
// an i32 return value, and nothing is added if that query yields no attribute.
699 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
700 bool Param = true) -> void {
701 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
702 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
703 if (HasSignExt || HasZeroExt) {
704 assert(AS.getNumAttributes() == 1 &&
705 "Currently not handling extension attr combined with others.");
706 if (Param) {
707 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
708 FnAS = FnAS.addAttribute(Ctx, AK);
709 } else if (auto AK =
710 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
711 FnAS = FnAS.addAttribute(Ctx, AK);
712 } else {
713 FnAS = FnAS.addAttributes(Ctx, AS);
714 }
715 };
716
// First inclusion: define one local AttributeSet per OMP_ATTRS_SET entry.
717#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
718#include "llvm/Frontend/OpenMP/OMPKinds.def"
719
720 // Add attributes to the function declaration.
// Second inclusion: expand one switch case per OMP_RTL_ATTRS entry.
721 switch (FnID) {
722#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
723 case Enum: \
724 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
725 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
726 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
727 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
728 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
729 break;
730#include "llvm/Frontend/OpenMP/OMPKinds.def"
731 default:
732 // Attributes are optional.
733 break;
734 }
735}
736
// Looks up the runtime function identified by FnID in module M and declares it
// if it is absent. Returns the pair {function type, function}.
// NOTE(review): the signature and one line of the callback-metadata
// construction are missing from this listing.
739 FunctionType *FnTy = nullptr;
740 Function *Fn = nullptr;
741
742 // Try to find the declaration in the module first.
// Each OMP_RTL entry computes the expected function type and looks the
// function up by name.
743 switch (FnID) {
744#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
745 case Enum: \
746 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
747 IsVarArg); \
748 Fn = M.getFunction(Str); \
749 break;
750#include "llvm/Frontend/OpenMP/OMPKinds.def"
751 }
752
753 if (!Fn) {
754 // Create a new declaration if we need one.
755 switch (FnID) {
756#define OMP_RTL(Enum, Str, ...) \
757 case Enum: \
758 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
759 break;
760#include "llvm/Frontend/OpenMP/OMPKinds.def"
761 }
// A new declaration gets the configured runtime calling convention.
762 Fn->setCallingConv(Config.getRuntimeCC());
763 // Add information if the runtime function takes a callback function
764 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
765 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
766 LLVMContext &Ctx = Fn->getContext();
767 MDBuilder MDB(Ctx);
768 // Annotate the callback behavior of the runtime function:
769 // - The callback callee is argument number 2 (microtask).
770 // - The first two arguments of the callback callee are unknown (-1).
771 // - All variadic arguments to the runtime function are passed to the
772 // callback callee.
773 Fn->addMetadata(
774 LLVMContext::MD_callback,
776 2, {-1, -1}, /* VarArgsArePassed */ true)}));
777 }
778 }
779
780 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
781 << " with type " << *Fn->getFunctionType() << "\n");
// Attributes are added only to declarations created here; a function that
// already existed in the module is left unchanged.
782 addAttributes(FnID, *Fn);
783
784 } else {
785 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
786 << " with type " << *Fn->getFunctionType() << "\n");
787 }
788
789 assert(Fn && "Failed to create OpenMP runtime function");
790
791 return {FnTy, Fn};
792}
793
// Returns FiniBB, creating it on first use: a block named ".fini" is added to
// the function holding the builder's insert block, the builder is pointed at
// it, and FiniCB is run there. An error from FiniCB is returned to the caller.
// NOTE(review): the signature and one statement are missing from this listing.
796 if (!FiniBB) {
797 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
799 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
800 Builder.SetInsertPoint(FiniBB);
801 // FiniCB adds the branch to the exit stub.
802 if (Error Err = FiniCB(Builder.saveIP()))
803 return Err;
804 }
805 return FiniBB;
806}
807
// Makes OtherFiniBB the finalization block. If no FiniBB exists yet, FiniCB is
// run at the first non-PHI position of OtherFiniBB. Otherwise the existing
// FiniBB's instructions (excluding a trailing terminator) are moved to the
// front of OtherFiniBB and FiniBB is erased.
809 BasicBlock *OtherFiniBB) {
810 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
811 if (!FiniBB) {
812 FiniBB = OtherFiniBB;
813
814 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
815 if (Error Err = FiniCB(Builder.saveIP()))
816 return Err;
817
818 return Error::success();
819 }
820
821 // Move instructions from FiniBB to the start of OtherFiniBB.
// If the last instruction is a terminator, stop the moved range just before it.
822 auto EndIt = FiniBB->end();
823 if (FiniBB->size() >= 1)
824 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
825 EndIt = Prev;
826 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
827 EndIt);
828
// Redirect every use of the old block to OtherFiniBB before deleting it.
829 FiniBB->replaceAllUsesWith(OtherFiniBB);
830 FiniBB->eraseFromParent();
831 FiniBB = OtherFiniBB;
832 return Error::success();
833}
834
// Returns the callee of RTLFn as a Function. The assert fires if the callee is
// not a Function.
// NOTE(review): the signature and the definition of RTLFn are missing from
// this listing.
837 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
838 assert(Fn && "Failed to create OpenMP runtime function pointer");
839 return Fn;
840}
841
// Emits a call to Callee with Args at the builder's insert point and gives the
// call the configured runtime calling convention.
844 StringRef Name) {
845 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
846 Call->setCallingConv(Config.getRuntimeCC());
847 return Call;
848}
849
850void OpenMPIRBuilder::initialize() { initializeTypes(M); }
851
// Walks every block after the entry block and moves selected allocas to the
// first non-PHI position of the entry block.
// NOTE(review): the signature, the test that identifies an alloca, and the
// condition guarding the `continue` are missing from this listing.
854 BasicBlock &EntryBlock = Function->getEntryBlock();
855 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
856
857 // Loop over blocks looking for constant allocas, skipping the entry block
858 // as any allocas there are already in the desired location.
859 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
860 Block++) {
861 for (auto Inst = Block->getReverseIterator()->begin();
862 Inst != Block->getReverseIterator()->end();) {
// The iterator is advanced before the instruction is moved, so the move does
// not disturb the traversal.
864 Inst++;
866 continue;
867 AllocaInst->moveBeforePreserving(MoveLocInst);
868 } else {
869 Inst++;
870 }
871 }
872 }
873}
874
877
// Collects allocas in Block that ShouldHoistAlloca accepts, with the entry
// block's terminator as the insertion position for the move.
// NOTE(review): the signature, the lambda body, the declaration of
// AllocasToMove, the alloca test in the loop, and the move statement itself
// are missing from this listing.
878 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
879 // TODO: For now, we support simple static allocations, we might need to
880 // move non-static ones as well. However, this will need further analysis to
881 // move the length arguments as well.
883 };
884
// Candidates are gathered first and moved afterwards, so the block is not
// modified while it is being iterated.
885 for (llvm::Instruction &Inst : Block)
887 if (ShouldHoistAlloca(*AllocaInst))
888 AllocasToMove.push_back(AllocaInst);
889
890 auto InsertPoint =
891 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
892
893 for (llvm::Instruction *AllocaInst : AllocasToMove)
895}
896
// Builds a post-dominator tree for Func and selects every block that properly
// post-dominates the entry block.
// NOTE(review): the signature and the statement executed for each selected
// block are missing from this listing.
898 PostDominatorTree PostDomTree(*Func);
899 for (llvm::BasicBlock &BB : *Func)
900 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
902}
903
905 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
907 SmallVector<std::unique_ptr<OutlineInfo>, 16> DeferredOutlines;
908 for (std::unique_ptr<OutlineInfo> &OI : OutlineInfos) {
909 // Skip functions that have not finalized yet; may happen with nested
910 // function generation.
911 if (Fn && OI->getFunction() != Fn) {
912 DeferredOutlines.push_back(std::move(OI));
913 continue;
914 }
915
916 ParallelRegionBlockSet.clear();
917 Blocks.clear();
918 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
919
920 Function *OuterFn = OI->getFunction();
921 CodeExtractorAnalysisCache CEAC(*OuterFn);
922 // If we generate code for the target device, we need to allocate
923 // struct for aggregate params in the device default alloca address space.
924 // OpenMP runtime requires that the params of the extracted functions are
925 // passed as zero address space pointers. This flag ensures that
926 // CodeExtractor generates correct code for extracted functions
927 // which are used by OpenMP runtime.
928 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
929 std::unique_ptr<CodeExtractor> Extractor =
930 OI->createCodeExtractor(Blocks, ArgsInZeroAddressSpace, ".omp_par");
931
932 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
933 LLVM_DEBUG(dbgs() << "Entry " << OI->EntryBB->getName()
934 << " Exit: " << OI->ExitBB->getName() << "\n");
935 assert(Extractor->isEligible() &&
936 "Expected OpenMP outlining to be possible!");
937
938 for (auto *V : OI->ExcludeArgsFromAggregate)
939 Extractor->excludeArgFromAggregate(V);
940
941 Function *OutlinedFn =
942 Extractor->extractCodeRegion(CEAC, OI->Inputs, OI->Outputs);
943
944 // Forward target-cpu, target-features attributes to the outlined function.
945 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
946 if (TargetCpuAttr.isStringAttribute())
947 OutlinedFn->addFnAttr(TargetCpuAttr);
948
949 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
950 if (TargetFeaturesAttr.isStringAttribute())
951 OutlinedFn->addFnAttr(TargetFeaturesAttr);
952
953 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
954 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
955 assert(OutlinedFn->getReturnType()->isVoidTy() &&
956 "OpenMP outlined functions should not return a value!");
957
958 // For compatibility with the clang CG we move the outlined function after the
959 // one with the parallel region.
960 OutlinedFn->removeFromParent();
961 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
962
963 // Remove the artificial entry introduced by the extractor right away, we
964 // made our own entry block after all.
965 {
966 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
967 assert(ArtificialEntry.getUniqueSuccessor() == OI->EntryBB);
968 assert(OI->EntryBB->getUniquePredecessor() == &ArtificialEntry);
969 // Move instructions from the to-be-deleted ArtificialEntry to the entry
970 // basic block of the parallel region. CodeExtractor generates
971 // instructions to unwrap the aggregate argument and may sink
972 // allocas/bitcasts for values that are solely used in the outlined region
973 // and do not escape.
974 assert(!ArtificialEntry.empty() &&
975 "Expected instructions to add in the outlined region entry");
976 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
977 End = ArtificialEntry.rend();
978 It != End;) {
979 Instruction &I = *It;
980 It++;
981
982 if (I.isTerminator()) {
983 // Absorb any debug value that terminator may have
984 if (Instruction *TI = OI->EntryBB->getTerminatorOrNull())
985 TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
986 continue;
987 }
988
989 I.moveBeforePreserving(*OI->EntryBB,
990 OI->EntryBB->getFirstInsertionPt());
991 }
992
993 OI->EntryBB->moveBefore(&ArtificialEntry);
994 ArtificialEntry.eraseFromParent();
995 }
996 assert(&OutlinedFn->getEntryBlock() == OI->EntryBB);
997 assert(OutlinedFn && OutlinedFn->hasNUses(1));
998
999 // Run a user callback, e.g. to add attributes.
1000 if (OI->PostOutlineCB)
1001 OI->PostOutlineCB(*OutlinedFn);
1002
1003 if (OI->FixUpNonEntryAllocas)
1005 }
1006
1007 // Remove work items that have been completed.
1008 OutlineInfos = std::move(DeferredOutlines);
1009
1010 // The createTarget functions embeds user written code into
1011 // the target region which may inject allocas which need to
1012 // be moved to the entry block of our target or risk malformed
1013 // optimisations by later passes, this is only relevant for
1014 // the device pass which appears to be a little more delicate
1015 // when it comes to optimisations (however, we do not block on
1016 // that here, it's up to the inserter to the list to do so).
1017 // This notably has to occur after the OutlinedInfo candidates
1018 // have been extracted so we have an end product that will not
1019 // be implicitly adversely affected by any raises unless
1020 // intentionally appended to the list.
1021 // NOTE: This only does so for ConstantData, it could be extended
1022 // to ConstantExpr's with further effort, however, they should
1023 // largely be folded when they get here. Extending it to runtime
1024 // defined/read+writeable allocation sizes would be non-trivial
1025 // (need to factor in movement of any stores to variables the
1026 // allocation size depends on, as well as the usual loads,
1027 // otherwise it'll yield the wrong result after movement) and
1028 // likely be more suitable as an LLVM optimisation pass.
1031
1032 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
1033 [](EmitMetadataErrorKind Kind,
1034 const TargetRegionEntryInfo &EntryInfo) -> void {
1035 errs() << "Error of kind: " << Kind
1036 << " when emitting offload entries and metadata during "
1037 "OMPIRBuilder finalization \n";
1038 };
1039
1040 if (!OffloadInfoManager.empty())
1042
1043 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
1044 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
1045 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
1046 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
1047 }
1048
1049 IsFinalized = true;
1050}
1051
// Accessor: returns the current value of the IsFinalized member.
1052bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
1053
// NOTE(review): the embedded numbering jumps from 1053 to 1055, so the line
// that opens this definition is not visible in this listing.
// The visible body only asserts that OutlineInfos is empty, i.e. that no
// outlining work item is still pending when this code runs.
1055 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
1056}
1057
// NOTE(review): listing line 1058 (the signature) is missing; `Value` and
// `Name` are used below and are presumably its parameters.
// Creates a constant i32 global variable in module M, initialised from Value
// and named Name, with weak-ODR linkage and hidden visibility, and returns it.
1059 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
1060 auto *GV =
1061 new GlobalVariable(M, I32Ty,
1062 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
1063 ConstantInt::get(I32Ty, Value), Name);
1064 GV->setVisibility(GlobalValue::HiddenVisibility);
1065
1066 return GV;
1067}
1068
// NOTE(review): listing lines 1069 (signature), 1074 (presumably the
// declaration of UsedArray) and 1077 (presumably the left-hand side of the
// per-element statement) are missing from this listing.
// Emits a global named Name with appending linkage, placed in the
// "llvm.metadata" section, whose initializer is an array of pointers built
// from the entries of List. Nothing is emitted when List is empty.
1070 if (List.empty())
1071 return;
1072
1073 // Convert List to what ConstantArray needs.
1075 UsedArray.resize(List.size());
1076 for (unsigned I = 0, E = List.size(); I != E; ++I)
1078 cast<Constant>(&*List[I]), Builder.getPtrTy());
1079
// UsedArray was resized to List.size(), which is non-zero past the guard at
// the top, so this second emptiness check is purely defensive.
1080 if (UsedArray.empty())
1081 return;
1082 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
1083
1084 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
1085 ConstantArray::get(ATy, UsedArray), Name);
1086
1087 GV->setSection("llvm.metadata");
1088}
1089
// NOTE(review): listing lines 1090-1091 (start of the signature) are missing;
// `KernelName` is used below and is presumably a parameter.
// Creates a constant i8 global in module M named KernelName + "_exec_mode"
// that holds Mode, with weak-any linkage and protected visibility, and returns
// it.
1092 OMPTgtExecModeFlags Mode) {
1093 auto *Int8Ty = Builder.getInt8Ty();
1094 auto *GVMode = new GlobalVariable(
1095 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
1096 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
1097 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1098 return GVMode;
1099}
1100
// NOTE(review): listing lines 1101 (start of the signature, presumably
// declaring SrcLocStr), 1119 (middle of the address-space condition) and 1137
// (one GlobalVariable constructor argument) are missing from this listing.
// Returns a constant, cast to IdentPtr, that refers to a global of type
// OpenMPIRBuilder::Ident initialised with {0, LocFlags, Reserve2Flags,
// SrcLocStrSize, SrcLocStr}. Results are memoised in IdentMap.
1102 uint32_t SrcLocStrSize,
1103 IdentFlag LocFlags,
1104 unsigned Reserve2Flags) {
1105 // Enable "C-mode".
1106 LocFlags |= OMP_IDENT_FLAG_KMPC;
1107
// The cache key packs both flag words into a single 64-bit value.
// NOTE(review): LocFlags is shifted by 31, not 32, so bit 31 of Reserve2Flags
// would share a key bit with bit 0 of LocFlags — confirm Reserve2Flags never
// has its top bit set.
1108 Constant *&Ident =
1109 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1110 if (!Ident) {
1111 Constant *I32Null = ConstantInt::getNullValue(Int32);
1112 Constant *IdentData[] = {I32Null,
1113 ConstantInt::get(Int32, uint32_t(LocFlags)),
1114 ConstantInt::get(Int32, Reserve2Flags),
1115 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1116
// Element 4 of the struct is the source-location string pointer; it is
// address-space-cast to the struct's element type when the guard below holds.
1117 size_t SrcLocStrArgIdx = 4;
1118 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1120 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1121 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1122 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1123 Constant *Initializer =
1124 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1125
1126 // Look for existing encoding of the location + flags, not needed but
1127 // minimizes the difference to the existing solution while we transition.
// The loop does not stop at the first hit, so the last matching global in
// the module's list is the one that ends up in Ident.
1128 for (GlobalVariable &GV : M.globals())
1129 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1130 if (GV.getInitializer() == Initializer)
1131 Ident = &GV;
1132
// No reusable global was found: create a private, constant, 8-byte-aligned
// one with global unnamed_addr.
1133 if (!Ident) {
1134 auto *GV = new GlobalVariable(
1135 M, OpenMPIRBuilder::Ident,
1136 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1138 M.getDataLayout().getDefaultGlobalsAddressSpace());
1139 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1140 GV->setAlignment(Align(8));
1141 Ident = GV;
1142 }
1143 }
1144
1145 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1146}
1147
// NOTE(review): listing line 1148 (start of the signature, presumably
// declaring LocStr) is missing from this listing.
// Returns a constant referring to a global that holds LocStr, memoised in
// SrcLocStrMap, and writes LocStr.size() to SrcLocStrSize.
1149 uint32_t &SrcLocStrSize) {
1150 SrcLocStrSize = LocStr.size();
1151 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1152 if (!SrcLocStr) {
1153 Constant *Initializer =
1154 ConstantDataArray::getString(M.getContext(), LocStr);
1155
1156 // Look for existing encoding of the location, not needed but minimizes the
1157 // difference to the existing solution while we transition.
// On a hit the pointer-cast global is stored in the map entry and returned
// immediately.
1158 for (GlobalVariable &GV : M.globals())
1159 if (GV.isConstant() && GV.hasInitializer() &&
1160 GV.getInitializer() == Initializer)
1161 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1162
// Otherwise create a new unnamed global string in the default globals
// address space.
1163 SrcLocStr = Builder.CreateGlobalString(
1164 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1165 &M);
1166 }
1167 return SrcLocStr;
1168}
1169
// NOTE(review): listing line 1170 (start of the signature, presumably
// declaring FunctionName) is missing from this listing.
// Formats the location as ";FileName;FunctionName;Line;Column;;" and forwards
// the resulting string to the string-taking getOrCreateSrcLocStr overload.
1171 StringRef FileName,
1172 unsigned Line, unsigned Column,
1173 uint32_t &SrcLocStrSize) {
1174 SmallString<128> Buffer;
1175 Buffer.push_back(';');
1176 Buffer.append(FileName);
1177 Buffer.push_back(';');
1178 Buffer.append(FunctionName);
1179 Buffer.push_back(';');
1180 Buffer.append(std::to_string(Line));
1181 Buffer.push_back(';');
1182 Buffer.append(std::to_string(Column));
1183 Buffer.push_back(';');
1184 Buffer.push_back(';');
1185 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1186}
1187
// NOTE(review): listing line 1189 (the declarator following the return type)
// is missing from this listing.
// Returns the source-location string constant for the fixed placeholder
// ";unknown;unknown;0;0;;", writing its length to SrcLocStrSize.
1188Constant *
1190 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1191 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1192}
1193
// NOTE(review): listing line 1194 (start of the signature, presumably
// declaring DL) is missing from this listing.
// Builds the source-location string from the debug location DL. Falls back to
// the default "unknown" string when DL carries no DILocation.
1195 uint32_t &SrcLocStrSize,
1196 Function *F) {
1197 DILocation *DIL = DL.get();
1198 if (!DIL)
1199 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
// The module name stands in for an empty file name.
1200 StringRef FileName =
1201 !DIL->getFilename().empty() ? DIL->getFilename() : M.getName();
// NOTE(review): getScope()->getSubprogram() is dereferenced without a null
// check — confirm a subprogram is always attached to the scope here.
1202 StringRef Function = DIL->getScope()->getSubprogram()->getName();
// F's name is used only when the subprogram name is empty and F is non-null.
1203 if (Function.empty() && F)
1204 Function = F->getName();
1205 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1206 DIL->getColumn(), SrcLocStrSize);
1207}
1208
// NOTE(review): listing line 1209 (start of the signature, presumably
// declaring Loc) is missing from this listing.
// Convenience overload: forwards Loc's debug location together with the
// function that contains Loc's insertion block.
1210 uint32_t &SrcLocStrSize) {
1211 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1212 Loc.IP.getBlock()->getParent());
1213}
1214
// NOTE(review): listing lines 1215-1216 (the signature and the start of the
// statement) are missing; only the trailing arguments remain visible.
// The visible arguments name the __kmpc_global_thread_num runtime function,
// pass Ident to it, and label the result "omp_global_thread_num".
1217 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1218 "omp_global_thread_num");
1219}
1220
// NOTE(review): listing lines 1221-1222 (start of the signature, presumably
// declaring Loc and Kind) and 1261 (presumably the start of the statement
// defining Result) are missing from this listing.
// Emits a barrier runtime call at Loc and returns the insertion point after
// it. If updateToLocation(Loc) fails, Loc.IP is returned unchanged.
1223 bool ForceSimpleCall, bool CheckCancelFlag) {
1224 if (!updateToLocation(Loc))
1225 return Loc.IP;
1226
1227 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1228 // __kmpc_barrier(loc, thread_id);
1229
// Pick the ident flag that records which directive produced the barrier;
// directives not listed explicitly get the generic implicit-barrier flag.
1230 IdentFlag BarrierLocFlags;
1231 switch (Kind) {
1232 case OMPD_for:
1233 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1234 break;
1235 case OMPD_sections:
1236 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1237 break;
1238 case OMPD_single:
1239 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1240 break;
1241 case OMPD_barrier:
1242 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1243 break;
1244 default:
1245 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1246 break;
1247 }
1248
// The first argument is an ident carrying the barrier flags; the thread id is
// obtained through a second ident built without them.
1249 uint32_t SrcLocStrSize;
1250 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1251 Value *Args[] = {
1252 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1253 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1254
1255 // If we are in a cancellable parallel region, barriers are cancellation
1256 // points.
1257 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1258 bool UseCancelBarrier =
1259 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1260
1262 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1263 ? OMPRTL___kmpc_cancel_barrier
1264 : OMPRTL___kmpc_barrier),
1265 Args);
1266
// Only a cancel barrier gets its result checked, and only when requested.
1267 if (UseCancelBarrier && CheckCancelFlag)
1268 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1269 return Err;
1270
1271 return Builder.saveIP();
1272}
1273
// NOTE(review): listing lines 1274-1275 (start of the signature, presumably
// declaring Loc), 1293 (presumably the start of the statement defining
// IPOrErr) and 1316 (presumably the start of the statement defining Result)
// are missing from this listing.
// Emits a __kmpc_cancel runtime call for CanceledDirective, optionally guarded
// by IfCondition, followed by the shared cancellation check. Returns the
// insertion point at which code generation continues.
1276 Value *IfCondition,
1277 omp::Directive CanceledDirective) {
1278 if (!updateToLocation(Loc))
1279 return Loc.IP;
1280
1281 // LLVM utilities like blocks with terminators.
1282 auto *UI = Builder.CreateUnreachable();
1283
// Without an if-clause the cancel call is emitted directly before UI.
1284 Instruction *ThenTI = UI, *ElseTI = nullptr;
1285 if (IfCondition) {
1286 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1287
1288 // Even if the if condition evaluates to false, this should count as a
1289 // cancellation point
1290 Builder.SetInsertPoint(ElseTI);
1291 auto ElseIP = Builder.saveIP();
1292
1294 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1295 if (!IPOrErr)
1296 return IPOrErr;
1297 }
1298
1299 Builder.SetInsertPoint(ThenTI);
1300
// Translate the directive into the runtime's integer cancel kind using the
// OMP_CANCEL_KIND entries of OMPKinds.def.
1301 Value *CancelKind = nullptr;
1302 switch (CanceledDirective) {
1303#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1304 case DirectiveEnum: \
1305 CancelKind = Builder.getInt32(Value); \
1306 break;
1307#include "llvm/Frontend/OpenMP/OMPKinds.def"
1308 default:
1309 llvm_unreachable("Unknown cancel kind!");
1310 }
1311
1312 uint32_t SrcLocStrSize;
1313 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1314 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1315 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1317 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1318
1319 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1320 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1321 return Err;
1322
1323 // Update the insertion point and remove the terminator we introduced.
1324 Builder.SetInsertPoint(UI->getParent());
1325 UI->eraseFromParent();
1326
1327 return Builder.saveIP();
1328}
1329
// NOTE(review): listing lines 1330-1331 (start of the signature, presumably
// declaring Loc) and 1355 (presumably the start of the statement defining
// Result) are missing from this listing.
// Emits a __kmpc_cancellationpoint runtime call for CanceledDirective followed
// by the shared cancellation check. Returns the insertion point at which code
// generation continues.
1332 omp::Directive CanceledDirective) {
1333 if (!updateToLocation(Loc))
1334 return Loc.IP;
1335
1336 // LLVM utilities like blocks with terminators.
1337 auto *UI = Builder.CreateUnreachable();
1338 Builder.SetInsertPoint(UI);
1339
// Translate the directive into the runtime's integer cancel kind using the
// OMP_CANCEL_KIND entries of OMPKinds.def.
1340 Value *CancelKind = nullptr;
1341 switch (CanceledDirective) {
1342#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1343 case DirectiveEnum: \
1344 CancelKind = Builder.getInt32(Value); \
1345 break;
1346#include "llvm/Frontend/OpenMP/OMPKinds.def"
1347 default:
1348 llvm_unreachable("Unknown cancel kind!");
1349 }
1350
1351 uint32_t SrcLocStrSize;
1352 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1353 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1354 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1356 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1357
1358 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1359 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1360 return Err;
1361
1362 // Update the insertion point and remove the terminator we introduced.
1363 Builder.SetInsertPoint(UI->getParent());
1364 UI->eraseFromParent();
1365
1366 return Builder.saveIP();
1367}
1368
// NOTE(review): listing lines 1369 (start of the signature), 1379 (a line
// after the alloca, content unknown) and 1392 (presumably the start of the
// statement that issues the call and assigns Return) are missing from this
// listing.
// Packs KernelArgs into a stack-allocated OpenMPIRBuilder::KernelArgs struct
// and assembles the argument list for the __tgt_target_kernel runtime
// function. Returns the insertion point after the emitted code.
1370 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1371 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1372 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1373 if (!updateToLocation(Loc))
1374 return Loc.IP;
1375
// The argument struct is allocated at AllocaIP rather than at Loc.
1376 Builder.restoreIP(AllocaIP);
1377 auto *KernelArgsPtr =
1378 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1380
// Store argument I into struct field I, aligned to the preferred alignment of
// the argument's type.
1381 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1382 llvm::Value *Arg =
1383 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1384 Builder.CreateAlignedStore(
1385 KernelArgs[I], Arg,
1386 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1387 }
1388
1389 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1390 NumThreads, HostPtr, KernelArgsPtr};
1391
1393 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1394 OffloadingArgs);
1395
1396 return Builder.saveIP();
1397}
1398
// NOTE(review): listing line 1399 (start of the signature) is missing from
// this listing.
// Emits the target-kernel launch via emitTargetKernel, then a conditional
// branch on its result: a non-null result enters "omp_offload.failed", where
// EmitTargetCallFallbackCB generates the fallback code, and both paths rejoin
// in "omp_offload.cont". Returns the insertion point in the continuation
// block, or the error produced by the fallback callback.
1400 const LocationDescription &Loc, Value *OutlinedFnID,
1401 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1402 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1403
1404 if (!updateToLocation(Loc))
1405 return Loc.IP;
1406
1407 // On top of the arrays that were filled up, the target offloading call
1408 // takes as arguments the device id as well as the host pointer. The host
1409 // pointer is used by the runtime library to identify the current target
1410 // region, so it only has to be unique and not necessarily point to
1411 // anything. It could be the pointer to the outlined function that
1412 // implements the target region, but we aren't using that so that the
1413 // compiler doesn't need to keep that, and could therefore inline the host
1414 // function if proven worthwhile during optimization.
1415
1416 // From this point on, we need to have an ID of the target region defined.
1417 assert(OutlinedFnID && "Invalid outlined function ID!");
1418 (void)OutlinedFnID;
1419
1420 // Return value of the runtime offloading call.
1421 Value *Return = nullptr;
1422
1423 // Arguments for the target kernel.
1424 SmallVector<Value *> ArgsVector;
1425 getKernelArgsVector(Args, Builder, ArgsVector);
1426
1427 // The target region is an outlined function launched by the runtime
1428 // via calls to __tgt_target_kernel().
1429 //
1430 // Note that on the host and CPU targets, the runtime implementation of
1431 // these calls simply call the outlined function without forking threads.
1432 // The outlined functions themselves have runtime calls to
1433 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1434 // the compiler in emitTeamsCall() and emitParallelCall().
1435 //
1436 // In contrast, on the NVPTX target, the implementation of
1437 // __tgt_target_teams() launches a GPU kernel with the requested number
1438 // of teams and threads so no additional calls to the runtime are required.
1439 // Check the error code and execute the host version if required.
// Only the first entries of Args.NumTeams and Args.NumThreads are forwarded.
1440 Builder.restoreIP(emitTargetKernel(
1441 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1442 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1443
1444 BasicBlock *OffloadFailedBlock =
1445 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1446 BasicBlock *OffloadContBlock =
1447 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1448 Value *Failed = Builder.CreateIsNotNull(Return);
1449 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1450
1451 auto CurFn = Builder.GetInsertBlock()->getParent();
1452 emitBlock(OffloadFailedBlock, CurFn);
1453 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1454 if (!AfterIP)
1455 return AfterIP.takeError();
1456 Builder.restoreIP(*AfterIP);
1457 emitBranch(OffloadContBlock);
1458 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1459 return Builder.saveIP();
1460}
1461
// NOTE(review): listing lines 1462 (start of the signature) and 1477 (a line
// inside the else branch, content unknown) are missing from this listing.
// Emits a branch on CancelFlag: a null flag continues in a ".cont" block, and
// anything else jumps to a new ".cncl" block that branches to the finalization
// block of the innermost FinalizationStack entry. On success the builder is
// left at the start of the continuation block.
1463 Value *CancelFlag, omp::Directive CanceledDirective) {
1464 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1465 "Unexpected cancellation!");
1466
1467 // For a cancel barrier we create two new blocks.
1468 BasicBlock *BB = Builder.GetInsertBlock();
1469 BasicBlock *NonCancellationBlock;
1470 if (Builder.GetInsertPoint() == BB->end()) {
1471 // TODO: This branch will not be needed once we moved to the
1472 // OpenMPIRBuilder codegen completely.
1473 NonCancellationBlock = BasicBlock::Create(
1474 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1475 } else {
1476 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1478 Builder.SetInsertPoint(BB);
1479 }
1480 BasicBlock *CancellationBlock = BasicBlock::Create(
1481 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1482
1483 // Jump to them based on the return value.
1484 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1485 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1486 /* TODO weight */ nullptr, nullptr);
1487
1488 // From the cancellation block we finalize all variables and go to the
1489 // post finalization block that is known to the FiniCB callback.
1490 auto &FI = FinalizationStack.back();
1491 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1492 if (!FiniBBOrErr)
1493 return FiniBBOrErr.takeError();
1494 Builder.SetInsertPoint(CancellationBlock);
1495 Builder.CreateBr(*FiniBBOrErr);
1496
1497 // The continuation block is where code generation continues.
1498 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1499 return Error::success();
1500}
1501
1502/// Create wrapper function used to gather the outlined function's argument
1503/// structure from a shared buffer and to forward them to it when running in
1504/// Generic mode.
1505///
1506/// The outlined function is expected to receive 2 integer arguments followed by
1507/// an optional pointer argument to an argument structure holding the rest.
// NOTE(review): listing lines 1508 (start of the signature, presumably
// declaring OMPIRBuilder) and 1521 (presumably the start of the
// Function::Create call) are missing from this listing.
// The wrapper has type void(i16, i32) and is named after OutlinedFn with a
// ".wrapper" suffix; the created function is returned.
1509 Function &OutlinedFn) {
1510 size_t NumArgs = OutlinedFn.arg_size();
1511 assert((NumArgs == 2 || NumArgs == 3) &&
1512 "expected a 2-3 argument parallel outlined function");
1513 bool UseArgStruct = NumArgs == 3;
1514
// The guard restores the shared builder's insertion point when this function
// returns.
1515 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1516 IRBuilder<>::InsertPointGuard IPG(Builder);
1517 auto *FnTy = FunctionType::get(Builder.getVoidTy(),
1518 {Builder.getInt16Ty(), Builder.getInt32Ty()},
1519 /*isVarArg=*/false);
1520 auto *WrapperFn =
1522 OutlinedFn.getName() + ".wrapper", OMPIRBuilder->M);
1523
1524 WrapperFn->addParamAttr(0, Attribute::NoUndef);
1525 WrapperFn->addParamAttr(0, Attribute::ZExt);
1526 WrapperFn->addParamAttr(1, Attribute::NoUndef);
1527
1528 BasicBlock *EntryBB =
1529 BasicBlock::Create(OMPIRBuilder->M.getContext(), "entry", WrapperFn);
1530 Builder.SetInsertPoint(EntryBB);
1531
1532 // Allocation.
// Each stack slot is cast to an address-space-0 pointer before use.
1533 Value *AddrAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
1534 /*ArraySize=*/nullptr, "addr");
1535 AddrAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1536 AddrAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1537 AddrAlloca->getName() + ".ascast");
1538
1539 Value *ZeroAlloca = Builder.CreateAlloca(Builder.getInt32Ty(),
1540 /*ArraySize=*/nullptr, "zero");
1541 ZeroAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1542 ZeroAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1543 ZeroAlloca->getName() + ".ascast");
1544
1545 Value *ArgsAlloca = nullptr;
1546 if (UseArgStruct) {
1547 ArgsAlloca = Builder.CreateAlloca(Builder.getPtrTy(),
1548 /*ArraySize=*/nullptr, "global_args");
1549 ArgsAlloca = Builder.CreatePointerBitCastOrAddrSpaceCast(
1550 ArgsAlloca, Builder.getPtrTy(/*AddrSpace=*/0),
1551 ArgsAlloca->getName() + ".ascast");
1552 }
1553
1554 // Initialization.
// The wrapper's second parameter is stored to "addr" and a constant 0 to
// "zero"; the runtime call is passed the "global_args" slot.
1555 Builder.CreateStore(WrapperFn->getArg(1), AddrAlloca);
1556 Builder.CreateStore(Builder.getInt32(0), ZeroAlloca);
1557 if (UseArgStruct) {
1558 Builder.CreateCall(
1559 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(
1560 llvm::omp::RuntimeFunction::OMPRTL___kmpc_get_shared_variables),
1561 {ArgsAlloca});
1562 }
1563
1564 SmallVector<Value *, 3> Args{AddrAlloca, ZeroAlloca};
1565
1566 // Load structArg from global_args.
1567 if (UseArgStruct) {
1568 Value *StructArg = Builder.CreateLoad(Builder.getPtrTy(), ArgsAlloca);
1569 StructArg = Builder.CreateInBoundsGEP(Builder.getPtrTy(), StructArg,
1570 {Builder.getInt64(0)});
1571 StructArg = Builder.CreateLoad(Builder.getPtrTy(), StructArg, "structArg");
1572 Args.push_back(StructArg);
1573 }
1574
1575 // Call the outlined function holding the parallel body.
1576 Builder.CreateCall(&OutlinedFn, Args);
1577 Builder.CreateRetVoid();
1578
1579 return WrapperFn;
1580}
1581
1582// Callback used to create OpenMP runtime calls to support
1583// omp parallel clause for the device.
1584// We need to use this callback to replace call to the OutlinedFn in OuterFn
1585// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
// NOTE(review): listing line 1586 (the start of the signature) is missing
// from this listing.
// The captured variables of the original call are spilled into a pointer
// array allocated in OuterAllocaBB; that array is then handed to the runtime
// call together with OutlinedFn and, for generic kernels, a wrapper function.
1587 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1588 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1589 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1590 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1591 assert(OutlinedFn.arg_size() >= 2 &&
1592 "Expected at least tid and bounded tid as arguments");
1593 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1594
1595 // Add some known attributes.
1596 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1597 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1598 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1599 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1600 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1601 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1602
// The last entry of OutlinedFn's use list is taken to be the call to replace.
1603 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1604 assert(CI && "Expected call instruction to outlined function");
1605 CI->getParent()->setName("omp_parallel");
1606
1607 Builder.SetInsertPoint(CI);
1608 Type *PtrTy = OMPIRBuilder->VoidPtr;
1609
1610 // Add alloca for kernel args
1611 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1612 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1613 AllocaInst *ArgsAlloca =
1614 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1615 Value *Args = ArgsAlloca;
1616 // Add address space cast if array for storing arguments is not allocated
1617 // in address space 0
1618 if (ArgsAlloca->getAddressSpace())
1619 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1620 Builder.restoreIP(CurrentIP);
1621
1622 // Store captured vars which are used by kmpc_parallel_60
// The first two call operands (tid and bounded tid) are skipped.
1623 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1624 Value *V = *(CI->arg_begin() + 2 + Idx);
1625 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1626 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1627 Builder.CreateStore(V, StoreAddress);
1628 }
1629
// When no if-clause or num_threads value is supplied, the constants 1 and -1
// are passed respectively.
1630 Value *Cond =
1631 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1632 : Builder.getInt32(1);
1633 Value *NumThreadsArg =
1634 NumThreads ? Builder.CreateZExtOrTrunc(NumThreads, OMPIRBuilder->Int32)
1635 : Builder.getInt32(-1);
1636
1637 // If this is not a Generic kernel, we can skip generating the wrapper.
1638 Value *WrapperFn;
1639 if (isGenericKernel(*OuterFn))
1640 WrapperFn = createTargetParallelWrapper(OMPIRBuilder, OutlinedFn);
1641 else
1642 WrapperFn = Constant::getNullValue(PtrTy);
1643
1644 // Build kmpc_parallel_60 call
1645 Value *Parallel60CallArgs[] = {
1646 /* identifier*/ Ident,
1647 /* global thread num*/ ThreadID,
1648 /* if expression */ Cond,
1649 /* number of threads */ NumThreadsArg,
1650 /* Proc bind */ Builder.getInt32(-1),
1651 /* outlined function */ &OutlinedFn,
1652 /* wrapper function */ WrapperFn,
1653 /* arguments of the outlined function*/ Args,
1654 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1655 /* strict for number of threads */ Builder.getInt32(0)};
1656
1657 FunctionCallee RTLFn =
1658 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1659
1660 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1661
1662 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1663 << *Builder.GetInsertBlock()->getParent() << "\n");
1664
1665 // Initialize the local TID stack location with the argument value.
1666 Builder.SetInsertPoint(PrivTID);
1667 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1668 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1669 PrivTIDAddr);
1670
1671 // Remove redundant call to the outlined function.
1672 CI->eraseFromParent();
1673
1674 for (Instruction *I : ToBeDeleted) {
1675 I->eraseFromParent();
1676 }
1677}
1678
1679// Callback used to create OpenMP runtime calls to support
1680// omp parallel clause for the host.
1681// We need to use this callback to replace call to the OutlinedFn in OuterFn
1682// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
// NOTE(review): listing lines 1684 (the start of the parameter list,
// presumably declaring OMPIRBuilder and OutlinedFn) and 1707 (the start of
// the callback-metadata node construction) are missing from this listing.
1683static void
1685 Function *OuterFn, Value *Ident, Value *IfCondition,
1686 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1687 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1688 IRBuilder<> &Builder = OMPIRBuilder->Builder;
// The "_if" variant of the runtime entry point is used only when an
// if-clause condition is present.
1689 FunctionCallee RTLFn;
1690 if (IfCondition) {
1691 RTLFn =
1692 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1693 } else {
1694 RTLFn =
1695 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1696 }
// The callback metadata is attached only when the function has none yet.
1697 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1698 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1699 LLVMContext &Ctx = F->getContext();
1700 MDBuilder MDB(Ctx);
1701 // Annotate the callback behavior of the __kmpc_fork_call:
1702 // - The callback callee is argument number 2 (microtask).
1703 // - The first two arguments of the callback callee are unknown (-1).
1704 // - All variadic arguments to the __kmpc_fork_call are passed to the
1705 // callback callee.
1706 F->addMetadata(LLVMContext::MD_callback,
1708 2, {-1, -1},
1709 /* VarArgsArePassed */ true)}));
1710 }
1711 }
1712 // Add some known attributes.
1713 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1714 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1715 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1716
1717 assert(OutlinedFn.arg_size() >= 2 &&
1718 "Expected at least tid and bounded tid as arguments");
1719 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1720
// The last entry of OutlinedFn's use list is taken to be the call to replace.
1721 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1722 CI->getParent()->setName("omp_parallel");
1723 Builder.SetInsertPoint(CI);
1724
1725 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1726 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1727 &OutlinedFn};
1728
// Final argument order: the three fixed arguments, the optional condition,
// then every operand of the original call after tid and bound tid.
1729 SmallVector<Value *, 16> RealArgs;
1730 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1731 if (IfCondition) {
1732 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1733 RealArgs.push_back(Cond);
1734 }
1735 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1736
1737 // __kmpc_fork_call_if always expects a void ptr as the last argument
1738 // If there are no arguments, pass a null pointer.
1739 auto PtrTy = OMPIRBuilder->VoidPtr;
1740 if (IfCondition && NumCapturedVars == 0) {
1741 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1742 RealArgs.push_back(NullPtrValue);
1743 }
1744
1745 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1746
1747 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1748 << *Builder.GetInsertBlock()->getParent() << "\n");
1749
1750 // Initialize the local TID stack location with the argument value.
1751 Builder.SetInsertPoint(PrivTID);
1752 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1753 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1754 PrivTIDAddr);
1755
1756 // Remove redundant call to the outlined function.
1757 CI->eraseFromParent();
1758
1759 for (Instruction *I : ToBeDeleted) {
1760 I->eraseFromParent();
1761 }
1762}
1763
1765 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
1766 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB,
1767 PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition,
1768 Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable) {
1769 assert(!isConflictIP(Loc.IP, OuterAllocIP) && "IPs must not be ambiguous");
1770
1771 if (!updateToLocation(Loc))
1772 return Loc.IP;
1773
1774 uint32_t SrcLocStrSize;
1775 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1776 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1777 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1778 (ProcBind != OMP_PROC_BIND_default);
1779 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1780 // If we generate code for the target device, we need to allocate
1781 // struct for aggregate params in the device default alloca address space.
1782 // OpenMP runtime requires that the params of the extracted functions are
1783 // passed as zero address space pointers. This flag ensures that extracted
1784 // function arguments are declared in zero address space
1785 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1786
1787 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1788 // only if we compile for host side.
1789 if (NumThreads && !Config.isTargetDevice()) {
1790 Value *Args[] = {
1791 Ident, ThreadID,
1792 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1794 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1795 }
1796
1797 if (ProcBind != OMP_PROC_BIND_default) {
1798 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1799 Value *Args[] = {
1800 Ident, ThreadID,
1801 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1803 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1804 }
1805
1806 BasicBlock *InsertBB = Builder.GetInsertBlock();
1807 Function *OuterFn = InsertBB->getParent();
1808
1809 // Save the outer alloca block because the insertion iterator may get
1810 // invalidated and we still need this later.
1811 BasicBlock *OuterAllocaBlock = OuterAllocIP.getBlock();
1812
1813 // Vector to remember instructions we used only during the modeling but which
1814 // we want to delete at the end.
1816
1817 // Change the location to the outer alloca insertion point to create and
1818 // initialize the allocas we pass into the parallel region.
1819 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1820 Builder.restoreIP(NewOuter);
1821 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1822 AllocaInst *ZeroAddrAlloca =
1823 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1824 Instruction *TIDAddr = TIDAddrAlloca;
1825 Instruction *ZeroAddr = ZeroAddrAlloca;
1826 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1827 // Add additional casts to enforce pointers in zero address space
1828 TIDAddr = new AddrSpaceCastInst(
1829 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1830 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1831 ToBeDeleted.push_back(TIDAddr);
1832 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1833 PointerType ::get(M.getContext(), 0),
1834 "zero.addr.ascast");
1835 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1836 ToBeDeleted.push_back(ZeroAddr);
1837 }
1838
1839 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1840 // associated arguments in the outlined function, so we delete them later.
1841 ToBeDeleted.push_back(TIDAddrAlloca);
1842 ToBeDeleted.push_back(ZeroAddrAlloca);
1843
1844 // Create an artificial insertion point that will also ensure the blocks we
1845 // are about to split are not degenerated.
1846 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1847
1848 BasicBlock *EntryBB = UI->getParent();
1849 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1850 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1851 BasicBlock *PRegPreFiniBB =
1852 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1853 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1854
1855 auto FiniCBWrapper = [&](InsertPointTy IP) {
1856 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1857 // target to the region exit block.
1858 if (IP.getBlock()->end() == IP.getPoint()) {
1860 Builder.restoreIP(IP);
1861 Instruction *I = Builder.CreateBr(PRegExitBB);
1862 IP = InsertPointTy(I->getParent(), I->getIterator());
1863 }
1864 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1865 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1866 "Unexpected insertion point for finalization call!");
1867 return FiniCB(IP);
1868 };
1869
1870 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1871
1872 // Generate the privatization allocas in the block that will become the entry
1873 // of the outlined function.
1874 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1875 InsertPointTy InnerAllocaIP = Builder.saveIP();
1876
1877 AllocaInst *PrivTIDAddr =
1878 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1879 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1880
1881 // Add some fake uses for OpenMP provided arguments.
1882 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1883 Instruction *ZeroAddrUse =
1884 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1885 ToBeDeleted.push_back(ZeroAddrUse);
1886
1887 // EntryBB
1888 // |
1889 // V
1890 // PRegionEntryBB <- Privatization allocas are placed here.
1891 // |
1892 // V
1893 // PRegionBodyBB <- BodyGen is invoked here.
1894 // |
1895 // V
1896 // PRegPreFiniBB <- The block we will start finalization from.
1897 // |
1898 // V
1899 // PRegionExitBB <- A common exit to simplify block collection.
1900 //
1901
1902 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1903
1904 // Let the caller create the body.
1905 assert(BodyGenCB && "Expected body generation callback!");
1906 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1907 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP, PRegExitBB))
1908 return Err;
1909
1910 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1911
1912 // If OuterFn is a Generic kernel, we need to use device shared memory to
1913 // allocate argument structures. Otherwise, we use stack allocations as usual.
1914 bool UsesDeviceSharedMemory =
1915 Config.isTargetDevice() && isGenericKernel(*OuterFn);
1916 std::unique_ptr<OutlineInfo> OI =
1917 UsesDeviceSharedMemory
1918 ? std::make_unique<DeviceSharedMemOutlineInfo>(*this)
1919 : std::make_unique<OutlineInfo>();
1920
1921 if (Config.isTargetDevice()) {
1922 // Generate OpenMP target specific runtime call
1923 OI->PostOutlineCB = [=, ToBeDeletedVec =
1924 std::move(ToBeDeleted)](Function &OutlinedFn) {
1925 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1926 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1927 ThreadID, ToBeDeletedVec);
1928 };
1929 } else {
1930 // Generate OpenMP host runtime call
1931 OI->PostOutlineCB = [=, ToBeDeletedVec =
1932 std::move(ToBeDeleted)](Function &OutlinedFn) {
1933 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1934 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1935 };
1936 }
1937
1938 OI->FixUpNonEntryAllocas = true;
1939 OI->OuterAllocBB = OuterAllocaBlock;
1940 OI->EntryBB = PRegEntryBB;
1941 OI->ExitBB = PRegExitBB;
1942 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
1943 copy(OuterDeallocBlocks, OI->OuterDeallocBBs.end());
1944
1945 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1947 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
1948
1949 CodeExtractorAnalysisCache CEAC(*OuterFn);
1950 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1951 /* AggregateArgs */ false,
1952 /* BlockFrequencyInfo */ nullptr,
1953 /* BranchProbabilityInfo */ nullptr,
1954 /* AssumptionCache */ nullptr,
1955 /* AllowVarArgs */ true,
1956 /* AllowAlloca */ true,
1957 /* AllocationBlock */ OuterAllocaBlock,
1958 /* DeallocationBlocks */ {},
1959 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1960
1961 // Find inputs to, outputs from the code region.
1962 BasicBlock *CommonExit = nullptr;
1963 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1964 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1965
1966 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1967 /*CollectGlobalInputs=*/true);
1968
1969 Inputs.remove_if([&](Value *I) {
1971 return GV->getValueType() == OpenMPIRBuilder::Ident;
1972
1973 return false;
1974 });
1975
1976 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1977
1978 FunctionCallee TIDRTLFn =
1979 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1980
1981 auto PrivHelper = [&](Value &V) -> Error {
1982 if (&V == TIDAddr || &V == ZeroAddr) {
1983 OI->ExcludeArgsFromAggregate.push_back(&V);
1984 return Error::success();
1985 }
1986
1988 for (Use &U : V.uses())
1989 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1990 if (ParallelRegionBlockSet.count(UserI->getParent()))
1991 Uses.insert(&U);
1992
1993 // __kmpc_fork_call expects extra arguments as pointers. If the input
1994 // already has a pointer type, everything is fine. Otherwise, store the
1995 // value onto stack and load it back inside the to-be-outlined region. This
1996 // will ensure only the pointer will be passed to the function.
1997 // FIXME: if there are more than 15 trailing arguments, they must be
1998 // additionally packed in a struct.
1999 Value *Inner = &V;
2000 if (!V.getType()->isPointerTy()) {
2002 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
2003
2004 Builder.restoreIP(OuterAllocIP);
2005 Value *Ptr;
2006 if (UsesDeviceSharedMemory) {
2007 // Use device shared memory instead, if needed.
2008 Ptr = createOMPAllocShared(OuterAllocIP, V.getType(),
2009 V.getName() + ".reloaded");
2010 for (BasicBlock *DeallocBlock : OuterDeallocBlocks)
2012 InsertPointTy(DeallocBlock, DeallocBlock->getFirstInsertionPt()),
2013 Ptr, V.getType());
2014 } else {
2015 Ptr = Builder.CreateAlloca(V.getType(), nullptr,
2016 V.getName() + ".reloaded");
2017 }
2018
2019 // Store to stack at end of the block that currently branches to the entry
2020 // block of the to-be-outlined region.
2021 Builder.SetInsertPoint(InsertBB,
2022 InsertBB->getTerminator()->getIterator());
2023 Builder.CreateStore(&V, Ptr);
2024
2025 // Load back next to allocations in the to-be-outlined region.
2026 Builder.restoreIP(InnerAllocaIP);
2027 Inner = Builder.CreateLoad(V.getType(), Ptr);
2028 }
2029
2030 Value *ReplacementValue = nullptr;
2031 CallInst *CI = dyn_cast<CallInst>(&V);
2032 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
2033 ReplacementValue = PrivTID;
2034 } else {
2035 InsertPointOrErrorTy AfterIP =
2036 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
2037 if (!AfterIP)
2038 return AfterIP.takeError();
2039 Builder.restoreIP(*AfterIP);
2040 InnerAllocaIP = {
2041 InnerAllocaIP.getBlock(),
2042 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
2043
2044 assert(ReplacementValue &&
2045 "Expected copy/create callback to set replacement value!");
2046 if (ReplacementValue == &V)
2047 return Error::success();
2048 }
2049
2050 for (Use *UPtr : Uses)
2051 UPtr->set(ReplacementValue);
2052
2053 return Error::success();
2054 };
2055
2056 // Reset the inner alloca insertion as it will be used for loading the values
2057 // wrapped into pointers before passing them into the to-be-outlined region.
2058 // Configure it to insert immediately after the fake use of zero address so
2059 // that they are available in the generated body and so that the
2060 // OpenMP-related values (thread ID and zero address pointers) remain leading
2061 // in the argument list.
2062 InnerAllocaIP = IRBuilder<>::InsertPoint(
2063 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
2064
2065 // Reset the outer alloca insertion point to the entry of the relevant block
2066 // in case it was invalidated.
2067 OuterAllocIP = IRBuilder<>::InsertPoint(
2068 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
2069
2070 for (Value *Input : Inputs) {
2071 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
2072 if (Error Err = PrivHelper(*Input))
2073 return Err;
2074 }
2075 LLVM_DEBUG({
2076 for (Value *Output : Outputs)
2077 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
2078 });
2079 assert(Outputs.empty() &&
2080 "OpenMP outlining should not produce live-out values!");
2081
2082 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
2083 LLVM_DEBUG({
2084 for (auto *BB : Blocks)
2085 dbgs() << " PBR: " << BB->getName() << "\n";
2086 });
2087
2088 // Adjust the finalization stack, verify the adjustment, and call the
2089 // finalize function a last time to finalize values between the pre-fini
2090 // block and the exit block if we left the parallel "the normal way".
2091 auto FiniInfo = FinalizationStack.pop_back_val();
2092 (void)FiniInfo;
2093 assert(FiniInfo.DK == OMPD_parallel &&
2094 "Unexpected finalization stack state!");
2095
2096 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
2097
2098 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
2099 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
2100 if (!FiniBBOrErr)
2101 return FiniBBOrErr.takeError();
2102 {
2104 Builder.restoreIP(PreFiniIP);
2105 Builder.CreateBr(*FiniBBOrErr);
2106 // There's currently a branch to omp.par.exit. Delete it. We will get there
2107 // via the fini block
2108 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
2109 Term->eraseFromParent();
2110 }
2111
2112 // Register the outlined info.
2113 addOutlineInfo(std::move(OI));
2114
2115 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
2116 UI->eraseFromParent();
2117
2118 return AfterIP;
2119}
2120
2122 // Build call void __kmpc_flush(ident_t *loc)
2123 uint32_t SrcLocStrSize;
2124 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2125 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
2126
2128 Args);
2129}
2130
2132 if (!updateToLocation(Loc))
2133 return;
2134 emitFlush(Loc);
2135}
2136
2138 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
2139 // global_tid);
2140 uint32_t SrcLocStrSize;
2141 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2142 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2143 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
2144
2145 // Ignore return result until untied tasks are supported.
2147 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
2148}
2149
2155
2157 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
2158 uint32_t SrcLocStrSize;
2159 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2160 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2161 Constant *I32Null = ConstantInt::getNullValue(Int32);
2162 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
2163
2165 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
2166}
2167
2173
2175 const DependData &Dep) {
2176 // Store the pointer to the variable
2177 Value *Addr = Builder.CreateStructGEP(
2178 DependInfo, Entry,
2179 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2180 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, SizeTy);
2181 Builder.CreateStore(DepValPtr, Addr);
2182 // Store the size of the variable
2183 Value *Size = Builder.CreateStructGEP(
2184 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Len));
2185 Builder.CreateStore(
2186 ConstantInt::get(SizeTy,
2187 M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2188 Size);
2189 // Store the dependency kind
2190 Value *Flags = Builder.CreateStructGEP(
2191 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Flags));
2192 Builder.CreateStore(ConstantInt::get(Builder.getInt8Ty(),
2193 static_cast<unsigned int>(Dep.DepKind)),
2194 Flags);
2195}
2196
2197// Processes the dependencies in Dependencies and does the following
2198// - Allocates space on the stack of an array of DependInfo objects
2199// - Populates each DependInfo object with relevant information of
2200// the corresponding dependence.
2201// - All code is inserted in the entry block of the current function.
2203 OpenMPIRBuilder &OMPBuilder,
2205 // Early return if we have no dependencies to process
2206 if (Dependencies.empty())
2207 return nullptr;
2208
2209 // Given a vector of DependData objects, in this function we create an
2210 // array on the stack that holds kmp_depend_info objects corresponding
2211 // to each dependency. This is then passed to the OpenMP runtime.
2212 // For example, if there are 'n' dependencies then the following pseudo
2213 // code is generated. Assume the first dependence is on a variable 'a'
2214 //
2215 // \code{c}
2216 // DepArray = alloc(n x sizeof(kmp_depend_info));
2217 // idx = 0;
2218 // DepArray[idx].base_addr = ptrtoint(&a);
2219 // DepArray[idx].len = 8;
2220 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
2221 // ++idx;
2222 // DepArray[idx].base_addr = ...;
2223 // \endcode
2224
2225 IRBuilderBase &Builder = OMPBuilder.Builder;
2226 Type *DependInfo = OMPBuilder.DependInfo;
2227
2228 Value *DepArray = nullptr;
2229 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2230 Builder.SetInsertPoint(
2232
2233 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2234 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2235
2236 Builder.restoreIP(OldIP);
2237
2238 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2239 Value *Base =
2240 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2241 OMPBuilder.emitTaskDependency(Builder, Base, Dep);
2242 }
2243 return DepArray;
2244}
2245
2246/// Create the task duplication function passed to kmpc_taskloop.
2247Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2248 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2249 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2250 if (!DupCB)
2252 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2253
2254 // From OpenMP Runtime p_task_dup_t:
2255 // Routine optionally generated by the compiler for setting the lastprivate
2256 // flag and calling needed constructors for private/firstprivate objects (used
2257 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2258 // lastprivate flag.
2259 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2260
2261 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2262
2263 FunctionType *DupFuncTy = FunctionType::get(
2264 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2265 /*isVarArg=*/false);
2266
2267 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2268 "omp_taskloop_dup", M);
2269 Value *DestTaskArg = DupFunction->getArg(0);
2270 Value *SrcTaskArg = DupFunction->getArg(1);
2271 Value *LastprivateFlagArg = DupFunction->getArg(2);
2272 DestTaskArg->setName("dest_task");
2273 SrcTaskArg->setName("src_task");
2274 LastprivateFlagArg->setName("lastprivate_flag");
2275
2276 IRBuilderBase::InsertPointGuard Guard(Builder);
2277 Builder.SetInsertPoint(
2278 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2279
2280 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2281 Type *TaskWithPrivatesTy =
2282 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2283 Value *TaskPrivates = Builder.CreateGEP(
2284 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2285 Value *ContextPtr = Builder.CreateGEP(
2286 PrivatesTy, TaskPrivates,
2287 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2288 return ContextPtr;
2289 };
2290
2291 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2292 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2293
2294 DestTaskContextPtr->setName("destPtr");
2295 SrcTaskContextPtr->setName("srcPtr");
2296
2297 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2298 DupFunction->getEntryBlock().begin());
2299 InsertPointTy CodeGenIP = Builder.saveIP();
2300 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2301 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2302 if (!AfterIPOrError)
2303 return AfterIPOrError.takeError();
2304 Builder.restoreIP(*AfterIPOrError);
2305
2306 Builder.CreateRetVoid();
2307
2308 return DupFunction;
2309}
2310
2311OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2312 const LocationDescription &Loc, InsertPointTy AllocaIP,
2313 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2314 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2315 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2316 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2317 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2318 Value *TaskContextStructPtrVal) {
2319
2320 if (!updateToLocation(Loc))
2321 return InsertPointTy();
2322
2323 uint32_t SrcLocStrSize;
2324 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2325 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2326
2327 BasicBlock *TaskloopExitBB =
2328 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2329 BasicBlock *TaskloopBodyBB =
2330 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2331 BasicBlock *TaskloopAllocaBB =
2332 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2333
2334 InsertPointTy TaskloopAllocaIP =
2335 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2336 InsertPointTy TaskloopBodyIP =
2337 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2338
2339 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP, TaskloopExitBB))
2340 return Err;
2341
2342 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2343 if (!result) {
2344 return result.takeError();
2345 }
2346
2347 llvm::CanonicalLoopInfo *CLI = result.get();
2348 auto OI = std::make_unique<OutlineInfo>();
2349 OI->EntryBB = TaskloopAllocaBB;
2350 OI->OuterAllocBB = AllocaIP.getBlock();
2351 OI->ExitBB = TaskloopExitBB;
2352 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2353 copy(DeallocBlocks, OI->OuterDeallocBBs.end());
2354
2355 // Add the thread ID argument.
2356 SmallVector<Instruction *> ToBeDeleted;
2357 // dummy instruction to be used as a fake argument
2358 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2359 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2360 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2361 TaskloopAllocaIP, "lb", false, true);
2362 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2363 TaskloopAllocaIP, "ub", false, true);
2364 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2365 TaskloopAllocaIP, "step", false, true);
2366 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2367 // aggregate struct
2368 OI->Inputs.insert(FakeLB);
2369 OI->Inputs.insert(FakeUB);
2370 OI->Inputs.insert(FakeStep);
2371 if (TaskContextStructPtrVal)
2372 OI->Inputs.insert(TaskContextStructPtrVal);
2373 assert(((TaskContextStructPtrVal && DupCB) ||
2374 (!TaskContextStructPtrVal && !DupCB)) &&
2375 "Task context struct ptr and duplication callback must be both set "
2376 "or both null");
2377
2378 // It isn't safe to run the duplication bodygen callback inside the post
2379 // outlining callback so this has to be run now before we know the real task
2380 // shareds structure type.
2381 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2382 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2383 Type *FakeSharedsTy = StructType::get(
2384 Builder.getContext(),
2385 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2386 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2387 FakeSharedsTy,
2388 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2389 if (!TaskDupFnOrErr) {
2390 return TaskDupFnOrErr.takeError();
2391 }
2392 Value *TaskDupFn = *TaskDupFnOrErr;
2393
2394 OI->PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2395 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2396 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2397 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2398 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2399 // Replace the Stale CI by appropriate RTL function call.
2400 assert(OutlinedFn.hasOneUse() &&
2401 "there must be a single user for the outlined function");
2402 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2403
2404 /* Create the casting for the Bounds Values that can be used when outlining
2405 * to replace the uses of the fakes with real values */
2406 BasicBlock *CodeReplBB = StaleCI->getParent();
2407 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2408 Value *CastedLBVal =
2409 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2410 Value *CastedUBVal =
2411 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2412 Value *CastedStepVal =
2413 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2414
2415 Builder.SetInsertPoint(StaleCI);
2416
2417 // Gather the arguments for emitting the runtime call for
2418 // @__kmpc_omp_task_alloc
2419 Function *TaskAllocFn =
2420 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2421
2422 Value *ThreadID = getOrCreateThreadID(Ident);
2423
2424 if (!NoGroup) {
2425 // Emit runtime call for @__kmpc_taskgroup
2426 Function *TaskgroupFn =
2427 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2428 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2429 }
2430
2431 // `flags` Argument Configuration
2432 // Task is tied if (Flags & 1) == 1.
2433 // Task is untied if (Flags & 1) == 0.
2434 // Task is final if (Flags & 2) == 2.
2435 // Task is not final if (Flags & 2) == 0.
2436 // Task is mergeable if (Flags & 4) == 4.
2437 // Task is not mergeable if (Flags & 4) == 0.
2438 // Task is priority if (Flags & 32) == 32.
2439 // Task is not priority if (Flags & 32) == 0.
2440 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2441 if (Final)
2442 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2443 if (Mergeable)
2444 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2445 if (Priority)
2446 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2447
2448 Value *TaskSize = Builder.getInt64(
2449 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2450
2451 AllocaInst *ArgStructAlloca =
2453 assert(ArgStructAlloca &&
2454 "Unable to find the alloca instruction corresponding to arguments "
2455 "for extracted function");
2456 std::optional<TypeSize> ArgAllocSize =
2457 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2458 assert(ArgAllocSize &&
2459 "Unable to determine size of arguments for extracted function");
2460 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2461
2462 // Emit the @__kmpc_omp_task_alloc runtime call
2463 // The runtime call returns a pointer to an area where the task captured
2464 // variables must be copied before the task is run (TaskData)
2465 CallInst *TaskData = Builder.CreateCall(
2466 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2467 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2468 /*task_func=*/&OutlinedFn});
2469
2470 Value *Shareds = StaleCI->getArgOperand(1);
2471 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2472 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2473 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2474 SharedsSize);
2475 // Get the pointer to loop lb, ub, step from task ptr
2476 // and set up the lowerbound,upperbound and step values
2477 llvm::Value *Lb = Builder.CreateGEP(
2478 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2479
2480 llvm::Value *Ub = Builder.CreateGEP(
2481 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2482
2483 llvm::Value *Step = Builder.CreateGEP(
2484 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2485 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2486
2487 // set up the arguments for emitting kmpc_taskloop runtime call
2488 // setting values for ifval, nogroup, sched, grainsize, task_dup
2489 Value *IfCondVal =
2490 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2491 : Builder.getInt32(1);
2492 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2493 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2494 Value *NoGroupVal = Builder.getInt32(1);
2495 Value *SchedVal = Builder.getInt32(Sched);
2496 Value *GrainSizeVal =
2497 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2498 : Builder.getInt64(0);
2499 Value *TaskDup = TaskDupFn;
2500
2501 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2502 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2503
2504 // taskloop runtime call
2505 Function *TaskloopFn =
2506 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2507 Builder.CreateCall(TaskloopFn, Args);
2508
2509 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2510 // nogroup is not defined
2511 if (!NoGroup) {
2512 Function *EndTaskgroupFn =
2513 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2514 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2515 }
2516
2517 StaleCI->eraseFromParent();
2518
2519 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2520
2521 LoadInst *SharedsOutlined =
2522 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2523 OutlinedFn.getArg(1)->replaceUsesWithIf(
2524 SharedsOutlined,
2525 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2526
2527 Value *IV = CLI->getIndVar();
2528 Type *IVTy = IV->getType();
2529 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2530
2531 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2532 // UpperBound. These GEP's can be reused for loading the tasks respective
2533 // bounds.
2534 Value *TaskLB = nullptr;
2535 Value *TaskUB = nullptr;
2536 Value *TaskStep = nullptr;
2537 Value *LoadTaskLB = nullptr;
2538 Value *LoadTaskUB = nullptr;
2539 Value *LoadTaskStep = nullptr;
2540 for (Instruction &I : *TaskloopAllocaBB) {
2541 if (I.getOpcode() == Instruction::GetElementPtr) {
2542 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2543 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2544 switch (CI->getZExtValue()) {
2545 case 0:
2546 TaskLB = &I;
2547 break;
2548 case 1:
2549 TaskUB = &I;
2550 break;
2551 case 2:
2552 TaskStep = &I;
2553 break;
2554 }
2555 }
2556 } else if (I.getOpcode() == Instruction::Load) {
2557 LoadInst &Load = cast<LoadInst>(I);
2558 if (Load.getPointerOperand() == TaskLB) {
2559 assert(TaskLB != nullptr && "Expected value for TaskLB");
2560 LoadTaskLB = &I;
2561 } else if (Load.getPointerOperand() == TaskUB) {
2562 assert(TaskUB != nullptr && "Expected value for TaskUB");
2563 LoadTaskUB = &I;
2564 } else if (Load.getPointerOperand() == TaskStep) {
2565 assert(TaskStep != nullptr && "Expected value for TaskStep");
2566 LoadTaskStep = &I;
2567 }
2568 }
2569 }
2570
2571 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2572
2573 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2574 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2575 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
2576 Value *TripCountMinusOne = Builder.CreateSDiv(
2577 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
2578 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2579 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2580 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2581 // set the trip count in the CLI
2582 CLI->setTripCount(CastedTripCount);
2583
2584 Builder.SetInsertPoint(CLI->getBody(),
2585 CLI->getBody()->getFirstInsertionPt());
2586
2587 if (NumOfCollapseLoops > 1) {
2588 llvm::SmallVector<User *> UsersToReplace;
2589 // When using the collapse clause, the bounds of the loop have to be
2590 // adjusted to properly represent the iterator of the outer loop.
2591 Value *IVPlusTaskLB = Builder.CreateAdd(
2592 CLI->getIndVar(),
2593 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2594 // To ensure every Use is correctly captured, we first want to record
2595 // which users to replace the value in, and then replace the value.
2596 for (auto IVUse = CLI->getIndVar()->uses().begin();
2597 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2598 User *IVUser = IVUse->getUser();
2599 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2600 if (Op->getOpcode() == Instruction::URem ||
2601 Op->getOpcode() == Instruction::UDiv) {
2602 UsersToReplace.push_back(IVUser);
2603 }
2604 }
2605 }
2606 for (User *User : UsersToReplace) {
2607 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2608 }
2609 } else {
2610 // The canonical loop is generated with a fixed lower bound. We need to
2611 // update the index calculation code to use the task's lower bound. The
2612 // generated code looks like this:
2613 // %omp_loop.iv = phi ...
2614 // ...
2615 // %tmp = mul [type] %omp_loop.iv, step
2616 // %user_index = add [type] tmp, lb
2617 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2618 // of the normalised induction variable:
2619 // 1. This one: converting the normalised IV to the user IV
2620 // 2. The increment (add)
2621 // 3. The comparison against the trip count (icmp)
2622 // (1) is the only use that is a mul followed by an add so this cannot
2623 // match other IR.
2624 assert(CLI->getIndVar()->getNumUses() == 3 &&
2625 "Canonical loop should have exactly three uses of the ind var");
2626 for (User *IVUser : CLI->getIndVar()->users()) {
2627 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2628 if (Mul->getOpcode() == Instruction::Mul) {
2629 for (User *MulUser : Mul->users()) {
2630 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2631 if (Add->getOpcode() == Instruction::Add) {
2632 Add->setOperand(1, CastedTaskLB);
2633 }
2634 }
2635 }
2636 }
2637 }
2638 }
2639 }
2640
2641 FakeLB->replaceAllUsesWith(CastedLBVal);
2642 FakeUB->replaceAllUsesWith(CastedUBVal);
2643 FakeStep->replaceAllUsesWith(CastedStepVal);
2644 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2645 I->eraseFromParent();
2646 }
2647 };
2648
2649 addOutlineInfo(std::move(OI));
2650 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2651 return Builder.saveIP();
2652}
2653
2656 M.getContext(), M.getDataLayout().getPointerSizeInBits());
2657 return llvm::StructType::get(IntPtrTy, IntPtrTy,
2658 llvm::Type::getInt32Ty(M.getContext()));
2659}
2660
2662 const LocationDescription &Loc, InsertPointTy AllocaIP,
2663 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB,
2664 bool Tied, Value *Final, Value *IfCondition,
2665 const DependenciesInfo &Dependencies, const AffinityData &Affinities,
2666 bool Mergeable, Value *EventHandle, Value *Priority) {
2667
2668 if (!updateToLocation(Loc))
2669 return InsertPointTy();
2670
2671 uint32_t SrcLocStrSize;
2672 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2673 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2674 // The current basic block is split into four basic blocks. After outlining,
2675 // they will be mapped as follows:
2676 // ```
2677 // def current_fn() {
2678 // current_basic_block:
2679 // br label %task.exit
2680 // task.exit:
2681 // ; instructions after task
2682 // }
2683 // def outlined_fn() {
2684 // task.alloca:
2685 // br label %task.body
2686 // task.body:
2687 // ret void
2688 // }
2689 // ```
2690 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2691 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2692 BasicBlock *TaskAllocaBB =
2693 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2694
2695 InsertPointTy TaskAllocaIP =
2696 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2697 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2698 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP, TaskExitBB))
2699 return Err;
2700
2701 auto OI = std::make_unique<OutlineInfo>();
2702 OI->EntryBB = TaskAllocaBB;
2703 OI->OuterAllocBB = AllocaIP.getBlock();
2704 OI->ExitBB = TaskExitBB;
2705 OI->OuterDeallocBBs.reserve(DeallocBlocks.size());
2706 copy(DeallocBlocks, OI->OuterDeallocBBs.end());
2707
2708 // Add the thread ID argument.
2710 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2711 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2712
2713 OI->PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2714 Affinities, Mergeable, Priority, EventHandle,
2715 TaskAllocaBB,
2716 ToBeDeleted](Function &OutlinedFn) mutable {
2717 // Replace the Stale CI by appropriate RTL function call.
2718 assert(OutlinedFn.hasOneUse() &&
2719 "there must be a single user for the outlined function");
2720 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2721
2722 // HasShareds is true if any variables are captured in the outlined region,
2723 // false otherwise.
2724 bool HasShareds = StaleCI->arg_size() > 1;
2725 Builder.SetInsertPoint(StaleCI);
2726
2727 // Gather the arguments for emitting the runtime call for
2728 // @__kmpc_omp_task_alloc
2729 Function *TaskAllocFn =
2730 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2731
2732 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2733 // call.
2734 Value *ThreadID = getOrCreateThreadID(Ident);
2735
2736 // Argument - `flags`
2737 // Task is tied iff (Flags & 1) == 1.
2738 // Task is untied iff (Flags & 1) == 0.
2739 // Task is final iff (Flags & 2) == 2.
2740 // Task is not final iff (Flags & 2) == 0.
2741 // Task is mergeable or merged-if0 iff (Flags & 4) == 4.
2742 // Task is neither mergeable nor merged-if0 iff (Flags & 4) == 0.
2743 // Task is detachable iff (Flags & 64) == 64.
2744 // Task is not detachable iff (Flags & 64) == 0.
2745 // Task is priority iff (Flags & 32) == 32.
2746 // Task is not priority iff (Flags & 32) == 0.
2747 // TODO: Handle the other flags.
2748 Value *Flags = Builder.getInt32(Tied);
2749 auto *ConstIfCondition = dyn_cast_or_null<ConstantInt>(IfCondition);
2750 bool UseMergedIf0Path = ConstIfCondition && ConstIfCondition->isZero();
2751 if (Final) {
2752 Value *FinalFlag =
2753 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2754 Flags = Builder.CreateOr(FinalFlag, Flags);
2755 }
2756
2757 if (Mergeable || UseMergedIf0Path)
2758 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2759 if (EventHandle)
2760 Flags = Builder.CreateOr(Builder.getInt32(64), Flags);
2761 if (Priority)
2762 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2763
2764 // Argument - `sizeof_kmp_task_t` (TaskSize)
2765 // Tasksize refers to the size in bytes of kmp_task_t data structure
2766 // including private vars accessed in task.
2767 // TODO: add kmp_task_t_with_privates (privates)
2768 Value *TaskSize = Builder.getInt64(
2769 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2770
2771 // Argument - `sizeof_shareds` (SharedsSize)
2772 // SharedsSize refers to the shareds array size in the kmp_task_t data
2773 // structure.
2774 Value *SharedsSize = Builder.getInt64(0);
2775 if (HasShareds) {
2776 AllocaInst *ArgStructAlloca =
2778 assert(ArgStructAlloca &&
2779 "Unable to find the alloca instruction corresponding to arguments "
2780 "for extracted function");
2781 std::optional<TypeSize> ArgAllocSize =
2782 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2783 assert(ArgAllocSize &&
2784 "Unable to determine size of arguments for extracted function");
2785 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2786 }
2787 // Emit the @__kmpc_omp_task_alloc runtime call
2788 // The runtime call returns a pointer to an area where the task captured
2789 // variables must be copied before the task is run (TaskData)
2791 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2792 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2793 /*task_func=*/&OutlinedFn});
2794
2795 if (Affinities.Count && Affinities.Info) {
2797 OMPRTL___kmpc_omp_reg_task_with_affinity);
2798
2799 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2800 Affinities.Count, Affinities.Info});
2801 }
2802
2803 // Emit detach clause initialization.
2804 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2805 // task_descriptor);
2806 if (EventHandle) {
2808 OMPRTL___kmpc_task_allow_completion_event);
2809 llvm::Value *EventVal =
2810 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2811 llvm::Value *EventHandleAddr =
2812 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2813 Builder.getPtrTy(0));
2814 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2815 Builder.CreateStore(EventVal, EventHandleAddr);
2816 }
2817 // Copy the arguments for outlined function
2818 if (HasShareds) {
2819 Value *Shareds = StaleCI->getArgOperand(1);
2820 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2821 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2822 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2823 SharedsSize);
2824 }
2825
2826 if (Priority) {
2827 //
2828 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2829 // we populate the priority information into the "kmp_task_t" here
2830 //
2831 // The struct "kmp_task_t" definition is available in kmp.h
2832 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2833 // data2 is used for priority
2834 //
2835 Type *Int32Ty = Builder.getInt32Ty();
2836 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2837 // kmp_task_t* => { ptr }
2838 Type *TaskPtr = StructType::get(VoidPtr);
2839 Value *TaskGEP =
2840 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2841 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2842 Type *TaskStructType = StructType::get(
2843 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2844 Value *PriorityData = Builder.CreateInBoundsGEP(
2845 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2846 // kmp_cmplrdata_t => { ptr, ptr }
2847 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2848 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2849 PriorityData, {Zero, Zero});
2850 Builder.CreateStore(Priority, CmplrData);
2851 }
2852
2853 Value *DepArray = nullptr;
2854 Value *NumDeps = nullptr;
2855 if (Dependencies.DepArray) {
2856 DepArray = Dependencies.DepArray;
2857 NumDeps = Dependencies.NumDeps;
2858 } else if (!Dependencies.Deps.empty()) {
2859 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
2860 NumDeps = Builder.getInt32(Dependencies.Deps.size());
2861 }
2862
2863 // In the presence of the `if` clause, the following IR is generated:
2864 // ...
2865 // %data = call @__kmpc_omp_task_alloc(...)
2866 // br i1 %if_condition, label %then, label %else
2867 // then:
2868 // call @__kmpc_omp_task(...)
2869 // br label %exit
2870 // else:
2871 // ;; Wait for resolution of dependencies, if any, before
2872 // ;; beginning the task
2873 // call @__kmpc_omp_wait_deps(...)
2874 // call @__kmpc_omp_task_begin_if0(...)
2875 // call @outlined_fn(...)
2876 // call @__kmpc_omp_task_complete_if0(...)
2877 // br label %exit
2878 // exit:
2879 // ...
2880 if (IfCondition && !UseMergedIf0Path) {
2881 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2882 // terminator.
2883 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2884 Instruction *IfTerminator =
2885 Builder.GetInsertPoint()->getParent()->getTerminator();
2886 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2887 Builder.SetInsertPoint(IfTerminator);
2888 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2889 &ElseTI);
2890 Builder.SetInsertPoint(ElseTI);
2891
2892 if (DepArray) {
2893 Function *TaskWaitFn =
2894 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2896 TaskWaitFn,
2897 {Ident, ThreadID, NumDeps, DepArray,
2898 ConstantInt::get(Builder.getInt32Ty(), 0),
2900 }
2901 Function *TaskBeginFn =
2902 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2903 Function *TaskCompleteFn =
2904 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2905 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2906 CallInst *CI = nullptr;
2907 if (HasShareds)
2908 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2909 else
2910 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2911 CI->setDebugLoc(StaleCI->getDebugLoc());
2912 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2913 Builder.SetInsertPoint(ThenTI);
2914 }
2915
2916 if (DepArray) {
2917 Function *TaskFn =
2918 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2920 TaskFn,
2921 {Ident, ThreadID, TaskData, NumDeps, DepArray,
2922 ConstantInt::get(Builder.getInt32Ty(), 0),
2924
2925 } else {
2926 // Emit the @__kmpc_omp_task runtime call to spawn the task
2927 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2928 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2929 }
2930
2931 StaleCI->eraseFromParent();
2932
2933 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2934 if (HasShareds) {
2935 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2936 OutlinedFn.getArg(1)->replaceUsesWithIf(
2937 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2938 }
2939
2940 for (Instruction *I : llvm::reverse(ToBeDeleted))
2941 I->eraseFromParent();
2942 };
2943
2944 addOutlineInfo(std::move(OI));
2945 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2946
2947 return Builder.saveIP();
2948}
2949
2951 const LocationDescription &Loc, InsertPointTy AllocaIP,
2952 ArrayRef<BasicBlock *> DeallocBlocks, BodyGenCallbackTy BodyGenCB) {
2953 if (!updateToLocation(Loc))
2954 return InsertPointTy();
2955
2956 uint32_t SrcLocStrSize;
2957 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2958 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2959 Value *ThreadID = getOrCreateThreadID(Ident);
2960
2961 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2962 Function *TaskgroupFn =
2963 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2964 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2965
2966 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2967 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP(), DeallocBlocks))
2968 return Err;
2969
2970 Builder.SetInsertPoint(TaskgroupExitBB);
2971 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2972 Function *EndTaskgroupFn =
2973 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2974 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2975
2976 return Builder.saveIP();
2977}
2978
2980 const LocationDescription &Loc, InsertPointTy AllocaIP,
2982 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2983 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2984
2985 if (!updateToLocation(Loc))
2986 return Loc.IP;
2987
2988 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2989
2990 // Each section is emitted as a switch case
2991 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2992 // -> OMP.createSection() which generates the IR for each section
2993 // Iterate through all sections and emit a switch construct:
2994 // switch (IV) {
2995 // case 0:
2996 // <SectionStmt[0]>;
2997 // break;
2998 // ...
2999 // case <NumSection> - 1:
3000 // <SectionStmt[<NumSection> - 1]>;
3001 // break;
3002 // }
3003 // ...
3004 // section_loop.after:
3005 // <FiniCB>;
3006 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
3007 Builder.restoreIP(CodeGenIP);
3009 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
3010 Function *CurFn = Continue->getParent();
3011 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
3012
3013 unsigned CaseNumber = 0;
3014 for (auto SectionCB : SectionCBs) {
3016 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
3017 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
3018 Builder.SetInsertPoint(CaseBB);
3019 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
3020 if (Error Err =
3021 SectionCB(InsertPointTy(),
3022 {CaseEndBr->getParent(), CaseEndBr->getIterator()}, {}))
3023 return Err;
3024 CaseNumber++;
3025 }
3026 // remove the existing terminator from body BB since there can be no
3027 // terminators after switch/case
3028 return Error::success();
3029 };
3030 // Loop body ends here
3031 // LowerBound, UpperBound, and STride for createCanonicalLoop
3032 Type *I32Ty = Type::getInt32Ty(M.getContext());
3033 Value *LB = ConstantInt::get(I32Ty, 0);
3034 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
3035 Value *ST = ConstantInt::get(I32Ty, 1);
3037 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
3038 if (!LoopInfo)
3039 return LoopInfo.takeError();
3040
3041 InsertPointOrErrorTy WsloopIP =
3042 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
3043 WorksharingLoopType::ForStaticLoop, !IsNowait);
3044 if (!WsloopIP)
3045 return WsloopIP.takeError();
3046 InsertPointTy AfterIP = *WsloopIP;
3047
3048 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
3049 assert(LoopFini && "Bad structure of static workshare loop finalization");
3050
3051 // Apply the finalization callback in LoopAfterBB
3052 auto FiniInfo = FinalizationStack.pop_back_val();
3053 assert(FiniInfo.DK == OMPD_sections &&
3054 "Unexpected finalization stack state!");
3055 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
3056 return Err;
3057
3058 return AfterIP;
3059}
3060
3063 BodyGenCallbackTy BodyGenCB,
3064 FinalizeCallbackTy FiniCB) {
3065 if (!updateToLocation(Loc))
3066 return Loc.IP;
3067
3068 auto FiniCBWrapper = [&](InsertPointTy IP) {
3069 if (IP.getBlock()->end() != IP.getPoint())
3070 return FiniCB(IP);
3071 // This must be done otherwise any nested constructs using FinalizeOMPRegion
3072 // will fail because that function requires the Finalization Basic Block to
3073 // have a terminator, which is already removed by EmitOMPRegionBody.
3074 // IP is currently at cancelation block.
3075 // We need to backtrack to the condition block to fetch
3076 // the exit block and create a branch from cancelation
3077 // to exit block.
3079 Builder.restoreIP(IP);
3080 auto *CaseBB = Loc.IP.getBlock();
3081 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
3082 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
3083 Instruction *I = Builder.CreateBr(ExitBB);
3084 IP = InsertPointTy(I->getParent(), I->getIterator());
3085 return FiniCB(IP);
3086 };
3087
3088 Directive OMPD = Directive::OMPD_sections;
3089 // Since we are using Finalization Callback here, HasFinalize
3090 // and IsCancellable have to be true
3091 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
3092 /*Conditional*/ false, /*hasFinalize*/ true,
3093 /*IsCancellable*/ true);
3094}
3095
3101
/// Returns a value for the calling GPU thread's id, obtained through the
/// runtime entry identified by OMPRTL___kmpc_get_hardware_thread_id_in_block.
/// The entry is referenced with an empty argument list.
// NOTE(review): this listing jumps from line 3102 to 3105, so the statement
// that opens the call (and the `return`) is not visible here - confirm the
// exact call against the upstream file before editing.
3102Value *OpenMPIRBuilder::getGPUThreadID() {
3105 OMPRTL___kmpc_get_hardware_thread_id_in_block),
3106 {});
3107}
3108
/// Returns a value for the GPU warp size, obtained through the runtime entry
/// identified by OMPRTL___kmpc_get_warp_size (looked up in module M) with an
/// empty argument list.
// NOTE(review): listing line 3110 is missing, so the call that consumes the
// looked-up function (and the `return`) is not visible here - confirm against
// the upstream file before editing.
3109Value *OpenMPIRBuilder::getGPUWarpSize() {
3111 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
3112}
3113
3114Value *OpenMPIRBuilder::getNVPTXWarpID() {
3115 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3116 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
3117}
3118
3119Value *OpenMPIRBuilder::getNVPTXLaneID() {
3120 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
3121 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
3122 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
3123 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
3124 "nvptx_lane_id");
3125}
3126
3127Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
3128 Type *ToType) {
3129 Type *FromType = From->getType();
3130 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
3131 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
3132 assert(FromSize > 0 && "From size must be greater than zero");
3133 assert(ToSize > 0 && "To size must be greater than zero");
3134 if (FromType == ToType)
3135 return From;
3136 if (FromSize == ToSize)
3137 return Builder.CreateBitCast(From, ToType);
3138 if (ToType->isIntegerTy() && FromType->isIntegerTy())
3139 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
3140 InsertPointTy SaveIP = Builder.saveIP();
3141 Builder.restoreIP(AllocaIP);
3142 Value *CastItem = Builder.CreateAlloca(ToType);
3143 Builder.restoreIP(SaveIP);
3144
3145 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
3146 CastItem, Builder.getPtrTy(0));
3147 Builder.CreateStore(From, ValCastItem);
3148 return Builder.CreateLoad(ToType, CastItem);
3149}
3150
/// Emits a call to the 32- or 64-bit runtime shuffle routine for \p Element
/// and returns the resulting value.
///
/// \param AllocaIP    Insertion point forwarded to castValueToType().
/// \param Element     Value to shuffle.
/// \param ElementType Type whose store size selects the routine: up to 4
///                    bytes uses the int32 entry, otherwise the int64 entry.
///                    The assert requires a store size of at most 8 bytes.
/// \param Offset      Passed unchanged as the second argument of the call.
// NOTE(review): listing line 3163 is missing; it presumably declares
// `ShuffleFunc` from the runtime-function lookup whose arguments appear on
// lines 3164-3165 - confirm against the upstream file.
3151Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
3152 Value *Element,
3153 Type *ElementType,
3154 Value *Offset) {
3155 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
3156 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
3157
3158 // Cast all types to 32- or 64-bit values before calling shuffle routines.
3159 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
3160 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
     // The warp size is passed to the call as a signed 16-bit integer.
3161 Value *WarpSize =
3162 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
3164 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
3165 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
     // NOTE(review): WarpSize was already cast to i16 above, so this second
     // cast to the same type looks redundant - confirm before removing.
3166 Value *WarpSizeCast =
3167 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
3168 Value *ShuffleCall =
3169 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
     // NOTE(review): the result is converted to CastTy, the same type the
     // input was converted to, not back to ElementType - confirm that this
     // is intended.
3170 return castValueToType(AllocaIP, ShuffleCall, CastTy);
3171}
3172
/// Emits IR that shuffles one element of type \p ElemType from \p SrcAddr
/// into \p DstAddr, piece by piece.
///
/// The element's store size is consumed in integer pieces of 8, 4, 2 and then
/// 1 byte(s). For each piece width that still fits in the remaining size:
///  - if more than one piece of that width fits, a loop is emitted (blocks
///    ".shuffle.pre_cond", ".shuffle.then", ".shuffle.exit") that shuffles a
///    piece per iteration and advances both pointers;
///  - otherwise a single load / shuffle / store is emitted and both pointers
///    are advanced by one piece.
/// The remaining size is then reduced modulo the piece width.
///
/// \param AllocaIP Insertion point forwarded to createRuntimeShuffleFunction().
/// \param SrcAddr  Address the pieces are loaded from.
/// \param DstAddr  Address the shuffled pieces are stored to.
/// \param ElemType Type of the element; supplies the size and the alignment
///                 used for the loads/stores inside the emitted loop.
/// \param Offset   Forwarded to createRuntimeShuffleFunction().
/// \param ReductionArrayTy Not referenced in this body.
/// \param IsByRefElem      Not referenced in this body.
3173void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
3174 Value *DstAddr, Type *ElemType,
3175 Value *Offset, Type *ReductionArrayTy,
3176 bool IsByRefElem) {
3177 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
3178 // Create the loop over the big sized data.
3179 // ptr = (void*)Elem;
3180 // ptrEnd = (void*) Elem + 1;
3181 // Step = 8;
3182 // while (ptr + Step < ptrEnd)
3183 // shuffle((int64_t)*ptr);
3184 // Step = 4;
3185 // while (ptr + Step < ptrEnd)
3186 // shuffle((int32_t)*ptr);
3187 // ...
3188 Type *IndexTy = Builder.getIndexTy(
3189 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
     // ElemPtr / Ptr are the running destination / source cursors; they are
     // updated as pieces are consumed.
3190 Value *ElemPtr = DstAddr;
3191 Value *Ptr = SrcAddr;
     // Piece widths in bytes: 8, 4, 2, 1. The loop ends once IntSize / 2
     // reaches 0.
3192 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
     // Skip widths larger than what is left to copy.
3193 if (Size < IntSize)
3194 continue;
3195 Type *IntType = Builder.getIntNTy(IntSize * 8);
3196 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3197 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
     // Address one whole element past SrcAddr; used below as the end bound of
     // the emitted loop.
3198 Value *SrcAddrGEP =
3199 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
3200 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3201 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
3202
3203 Function *CurFunc = Builder.GetInsertBlock()->getParent();
     // More than one piece of this width remains: emit a loop.
3204 if ((Size / IntSize) > 1) {
3205 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
3206 SrcAddrGEP, Builder.getPtrTy());
3207 BasicBlock *PreCondBB =
3208 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
3209 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
3210 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
3211 BasicBlock *CurrentBB = Builder.GetInsertBlock();
3212 emitBlock(PreCondBB, CurFunc);
     // PHIs carry the source and destination cursors around the loop; the
     // first incoming value comes from the block that preceded the loop.
3213 PHINode *PhiSrc =
3214 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
3215 PhiSrc->addIncoming(Ptr, CurrentBB);
3216 PHINode *PhiDest =
3217 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
3218 PhiDest->addIncoming(ElemPtr, CurrentBB);
3219 Ptr = PhiSrc;
3220 ElemPtr = PhiDest;
     // Continue while the distance from the cursor to the end, measured in
     // i8 units, is greater than IntSize - 1 (signed compare).
3221 Value *PtrDiff = Builder.CreatePtrDiff(
3222 Builder.getInt8Ty(), PtrEnd,
3223 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
3224 Builder.CreateCondBr(
3225 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
3226 ExitBB);
3227 emitBlock(ThenBB, CurFunc);
     // Loop body: load one piece, shuffle it, store it to the destination.
     // Both accesses use the preferred alignment of ElemType.
3228 Value *Res = createRuntimeShuffleFunction(
3229 AllocaIP,
3230 Builder.CreateAlignedLoad(
3231 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3232 IntType, Offset);
3233 Builder.CreateAlignedStore(Res, ElemPtr,
3234 M.getDataLayout().getPrefTypeAlign(ElemType));
     // Advance both cursors by one piece and feed them back into the PHIs.
3235 Value *LocalPtr =
3236 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3237 Value *LocalElemPtr =
3238 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3239 PhiSrc->addIncoming(LocalPtr, ThenBB);
3240 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3241 emitBranch(PreCondBB);
3242 emitBlock(ExitBB, CurFunc);
3243 } else {
     // Exactly one piece of this width: shuffle it without a loop.
3244 Value *Res = createRuntimeShuffleFunction(
3245 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
     // Narrow the shuffled value when ElemType is an integer type with fewer
     // bits than the value that came back.
3246 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3247 Res->getType()->getScalarSizeInBits())
3248 Res = Builder.CreateTrunc(Res, ElemType);
3249 Builder.CreateStore(Res, ElemPtr);
3250 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3251 ElemPtr =
3252 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3253 }
     // Whatever does not divide evenly is left for the narrower widths.
3254 Size = Size % IntSize;
3255 }
3256}
3257
/// Emits IR that copies every element of a reduce list from \p SrcBase to
/// \p DestBase, one ReductionInfo at a time.
///
/// For each element the source element pointer is loaded from its slot in the
/// source list, a destination address is chosen according to \p Action, and
/// the value is either shuffled in via shuffleAndStore() or copied according
/// to the element's EvaluationKind (scalar load/store, real/imaginary parts,
/// or memcpy). When a new destination element was allocated, its address is
/// written back into the destination list slot.
///
/// \param AllocaIP         Insertion point used for every alloca created here.
/// \param Action           Selects how the destination element is obtained.
/// \param ReductionArrayTy Type used to index the slots of both lists.
/// \param ReductionInfos   One entry per element to copy.
/// \param SrcBase          Base address of the source list.
/// \param DestBase         Base address of the destination list.
/// \param IsByRef          Per-element by-reference flags; an empty array is
///                         treated as "false" for every element.
/// \param CopyOptions      Only RemoteLaneOffset is read here; it is forwarded
///                         to shuffleAndStore().
/// \returns Error::success(), or the error produced by DataPtrPtrGen or
///          generateReductionDescriptor().
3258Error OpenMPIRBuilder::emitReductionListCopy(
3259 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3260 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3261 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3262 Type *IndexTy = Builder.getIndexTy(
3263 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3264 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3265
3266 // Iterates, element-by-element, through the source Reduce list and
3267 // make a copy.
3268 for (auto En : enumerate(ReductionInfos)) {
3269 const ReductionInfo &RI = En.value();
3270 Value *SrcElementAddr = nullptr;
3271 AllocaInst *DestAlloca = nullptr;
3272 Value *DestElementAddr = nullptr;
3273 Value *DestElementPtrAddr = nullptr;
3274 // Should we shuffle in an element from a remote lane?
3275 bool ShuffleInElement = false;
3276 // Set to true to update the pointer in the dest Reduce list to a
3277 // newly created element.
3278 bool UpdateDestListPtr = false;
3279
3280 // Step 1.1: Get the address for the src element in the Reduce list.
3281 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3282 ReductionArrayTy, SrcBase,
3283 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3284 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3285
3286 // Step 1.2: Create a temporary to store the element in the destination
3287 // Reduce list.
3288 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3289 ReductionArrayTy, DestBase,
3290 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3291 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
     // NOTE(review): listing lines 3293 and 3312 are missing below; they
     // presumably hold the two `case` labels of this switch - confirm against
     // the upstream file.
     // First visible arm: allocate a fresh destination element at AllocaIP
     // and request both a shuffle and a list-pointer update.
3292 switch (Action) {
3294 InsertPointTy CurIP = Builder.saveIP();
3295 Builder.restoreIP(AllocaIP);
3296
3297 Type *DestAllocaType =
3298 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3299 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3300 ".omp.reduction.element");
3301 DestAlloca->setAlignment(
3302 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3303 DestElementAddr = DestAlloca;
3304 DestElementAddr =
3305 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3306 DestElementAddr->getName() + ".ascast");
3307 Builder.restoreIP(CurIP);
3308 ShuffleInElement = true;
3309 UpdateDestListPtr = true;
3310 break;
3311 }
     // Second visible arm: reuse the element the destination list already
     // points to.
3313 DestElementAddr =
3314 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3315 break;
3316 }
3317 }
3318
3319 // Now that all active lanes have read the element in the
3320 // Reduce list, shuffle over the value from the remote lane.
3321 if (ShuffleInElement) {
3322 Type *ShuffleType = RI.ElementType;
3323 Value *ShuffleSrcAddr = SrcElementAddr;
3324 Value *ShuffleDestAddr = DestElementAddr;
3325 AllocaInst *LocalStorage = nullptr;
3326
3327 if (IsByRefElem) {
3328 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3329 assert(RI.ByRefAllocatedType &&
3330 "Expected by-ref allocated type to be set");
3331 // For by-ref reductions, we need to copy from the remote lane the
3332 // actual value of the partial reduction computed by that remote lane;
3333 // rather than, for example, a pointer to that data or, even worse, a
3334 // pointer to the descriptor of the by-ref reduction element.
3335 ShuffleType = RI.ByRefElementType;
3336
3337 if (RI.DataPtrPtrGen) {
3338 // Descriptor-based by-ref: extract data pointer from descriptor.
3339 InsertPointOrErrorTy GenResult = RI.DataPtrPtrGen(
3340 Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3341
3342 if (!GenResult)
3343 return GenResult.takeError();
3344
3345 ShuffleSrcAddr =
3346 Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3347
     // The shuffled data lands in a separate temporary; the descriptor in
     // DestAlloca is filled in after the shuffle.
3348 {
3349 InsertPointTy OldIP = Builder.saveIP();
3350 Builder.restoreIP(AllocaIP);
3351
3352 LocalStorage = Builder.CreateAlloca(ShuffleType);
3353 Builder.restoreIP(OldIP);
3354 ShuffleDestAddr = LocalStorage;
3355 }
3356 } else {
3357 // Non-descriptor by-ref: the pointer already references data
3358 // directly. Shuffle into the destination alloca.
3359 ShuffleDestAddr = DestElementAddr;
3360 }
3361 }
3362
3363 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3364 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3365
3366 if (IsByRefElem && RI.DataPtrPtrGen) {
3367 // Copy descriptor from source and update base_ptr to shuffled data
3368 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3369 DestAlloca, Builder.getPtrTy(), ".ascast");
3370
3371 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3372 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3373 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3374
3375 if (!GenResult)
3376 return GenResult.takeError();
3377 }
3378 } else {
     // No shuffle requested: plain copy, shaped by the element's
     // evaluation kind.
3379 switch (RI.EvaluationKind) {
3380 case EvalKind::Scalar: {
3381 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3382 // Store the source element value to the dest element address.
3383 Builder.CreateStore(Elem, DestElementAddr);
3384 break;
3385 }
3386 case EvalKind::Complex: {
     // Copy struct field 0 (".real") and field 1 (".imag") separately.
3387 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3388 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3389 Value *SrcReal = Builder.CreateLoad(
3390 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3391 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3392 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3393 Value *SrcImg = Builder.CreateLoad(
3394 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3395
3396 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3397 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3398 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3399 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3400 Builder.CreateStore(SrcReal, DestRealPtr);
3401 Builder.CreateStore(SrcImg, DestImgPtr);
3402 break;
3403 }
3404 case EvalKind::Aggregate: {
     // Copy the element's full store size with a memcpy.
3405 Value *SizeVal = Builder.getInt64(
3406 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3407 Builder.CreateMemCpy(
3408 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3409 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3410 SizeVal, false);
3411 break;
3412 }
3413 };
3414 }
3415
3416 // Step 3.1: Modify reference in dest Reduce list as needed.
3417 // Modifying the reference in Reduce list to point to the newly
3418 // created element. The element is live in the current function
3419 // scope and that of functions it invokes (i.e., reduce_function).
3420 // RemoteReduceData[i] = (void*)&RemoteElem
3421 if (UpdateDestListPtr) {
3422 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3423 DestElementAddr, Builder.getPtrTy(),
3424 DestElementAddr->getName() + ".ascast");
3425 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3426 }
3427 }
3428
3429 return Error::success();
3430}
3431
3432Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3433 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3434 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3435 InsertPointTy SavedIP = Builder.saveIP();
3436 LLVMContext &Ctx = M.getContext();
3437 FunctionType *FuncTy = FunctionType::get(
3438 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3439 /* IsVarArg */ false);
3440 Function *WcFunc =
3442 "_omp_reduction_inter_warp_copy_func", &M);
3443 WcFunc->setCallingConv(Config.getRuntimeCC());
3444 WcFunc->setAttributes(FuncAttrs);
3445 WcFunc->addParamAttr(0, Attribute::NoUndef);
3446 WcFunc->addParamAttr(1, Attribute::NoUndef);
3447 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3448 Builder.SetInsertPoint(EntryBB);
3449
3450 // ReduceList: thread local Reduce list.
3451 // At the stage of the computation when this function is called, partially
3452 // aggregated values reside in the first lane of every active warp.
3453 Argument *ReduceListArg = WcFunc->getArg(0);
3454 // NumWarps: number of warps active in the parallel region. This could
3455 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3456 Argument *NumWarpsArg = WcFunc->getArg(1);
3457
3458 // This array is used as a medium to transfer, one reduce element at a time,
3459 // the data from the first lane of every warp to lanes in the first warp
3460 // in order to perform the final step of a reduction in a parallel region
3461 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3462 // for reduced latency, as well as to have a distinct copy for concurrently
3463 // executing target regions. The array is declared with common linkage so
3464 // as to be shared across compilation units.
3465 StringRef TransferMediumName =
3466 "__openmp_nvptx_data_transfer_temporary_storage";
3467 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3468 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3469 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3470 if (!TransferMedium) {
3471 TransferMedium = new GlobalVariable(
3472 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3473 UndefValue::get(ArrayTy), TransferMediumName,
3474 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3475 /*AddressSpace=*/3);
3476 }
3477
3478 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3479 Value *GPUThreadID = getGPUThreadID();
3480 // nvptx_lane_id = nvptx_id % warpsize
3481 Value *LaneID = getNVPTXLaneID();
3482 // nvptx_warp_id = nvptx_id / warpsize
3483 Value *WarpID = getNVPTXWarpID();
3484
3485 InsertPointTy AllocaIP =
3486 InsertPointTy(Builder.GetInsertBlock(),
3487 Builder.GetInsertBlock()->getFirstInsertionPt());
3488 Type *Arg0Type = ReduceListArg->getType();
3489 Type *Arg1Type = NumWarpsArg->getType();
3490 Builder.restoreIP(AllocaIP);
3491 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3492 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3493 AllocaInst *NumWarpsAlloca =
3494 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3495 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3496 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3497 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3498 NumWarpsAlloca, Builder.getPtrTy(0),
3499 NumWarpsAlloca->getName() + ".ascast");
3500 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3501 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3502 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3503 InsertPointTy CodeGenIP =
3504 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3505 Builder.restoreIP(CodeGenIP);
3506
3507 Value *ReduceList =
3508 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3509
3510 for (auto En : enumerate(ReductionInfos)) {
3511 //
3512 // Warp master copies reduce element to transfer medium in __shared__
3513 // memory.
3514 //
3515 const ReductionInfo &RI = En.value();
3516 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3517 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3518 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3519 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3520 Type *CType = Builder.getIntNTy(TySize * 8);
3521
3522 unsigned NumIters = RealTySize / TySize;
3523 if (NumIters == 0)
3524 continue;
3525 Value *Cnt = nullptr;
3526 Value *CntAddr = nullptr;
3527 BasicBlock *PrecondBB = nullptr;
3528 BasicBlock *ExitBB = nullptr;
3529 if (NumIters > 1) {
3530 CodeGenIP = Builder.saveIP();
3531 Builder.restoreIP(AllocaIP);
3532 CntAddr =
3533 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3534
3535 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3536 CntAddr->getName() + ".ascast");
3537 Builder.restoreIP(CodeGenIP);
3538 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3539 CntAddr,
3540 /*Volatile=*/false);
3541 PrecondBB = BasicBlock::Create(Ctx, "precond");
3542 ExitBB = BasicBlock::Create(Ctx, "exit");
3543 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3544 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3545 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3546 /*Volatile=*/false);
3547 Value *Cmp = Builder.CreateICmpULT(
3548 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3549 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3550 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3551 }
3552
3553 // kmpc_barrier.
3554 InsertPointOrErrorTy BarrierIP1 =
3555 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3556 omp::Directive::OMPD_unknown,
3557 /* ForceSimpleCall */ false,
3558 /* CheckCancelFlag */ true);
3559 if (!BarrierIP1)
3560 return BarrierIP1.takeError();
3561 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3562 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3563 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3564
3565 // if (lane_id == 0)
3566 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3567 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3568 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3569
3570 // Reduce element = LocalReduceList[i]
3571 auto *RedListArrayTy =
3572 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3573 Type *IndexTy = Builder.getIndexTy(
3574 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3575 Value *ElemPtrPtr =
3576 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3577 {ConstantInt::get(IndexTy, 0),
3578 ConstantInt::get(IndexTy, En.index())});
3579 // elemptr = ((CopyType*)(elemptrptr)) + I
3580 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3581
3582 if (IsByRefElem && RI.DataPtrPtrGen) {
3583 InsertPointOrErrorTy GenRes =
3584 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3585
3586 if (!GenRes)
3587 return GenRes.takeError();
3588
3589 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3590 }
3591
3592 if (NumIters > 1)
3593 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3594
3595 // Get pointer to location in transfer medium.
3596 // MediumPtr = &medium[warp_id]
3597 Value *MediumPtr = Builder.CreateInBoundsGEP(
3598 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3599 // elem = *elemptr
3600 //*MediumPtr = elem
3601 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3602 // Store the source element value to the dest element address.
3603 Builder.CreateStore(Elem, MediumPtr,
3604 /*IsVolatile*/ true);
3605 Builder.CreateBr(MergeBB);
3606
3607 // else
3608 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3609 Builder.CreateBr(MergeBB);
3610
3611 // endif
3612 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3613 InsertPointOrErrorTy BarrierIP2 =
3614 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3615 omp::Directive::OMPD_unknown,
3616 /* ForceSimpleCall */ false,
3617 /* CheckCancelFlag */ true);
3618 if (!BarrierIP2)
3619 return BarrierIP2.takeError();
3620
3621 // Warp 0 copies reduce element from transfer medium
3622 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3623 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3624 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3625
3626 Value *NumWarpsVal =
3627 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3628 // Up to 32 threads in warp 0 are active.
3629 Value *IsActiveThread =
3630 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3631 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3632
3633 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3634
3635 // SecMediumPtr = &medium[tid]
3636 // SrcMediumVal = *SrcMediumPtr
3637 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3638 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3639 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3640 Value *TargetElemPtrPtr =
3641 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3642 {ConstantInt::get(IndexTy, 0),
3643 ConstantInt::get(IndexTy, En.index())});
3644 Value *TargetElemPtrVal =
3645 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3646 Value *TargetElemPtr = TargetElemPtrVal;
3647
3648 if (IsByRefElem && RI.DataPtrPtrGen) {
3649 InsertPointOrErrorTy GenRes =
3650 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3651
3652 if (!GenRes)
3653 return GenRes.takeError();
3654
3655 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3656 }
3657
3658 if (NumIters > 1)
3659 TargetElemPtr =
3660 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3661
3662 // *TargetElemPtr = SrcMediumVal;
3663 Value *SrcMediumValue =
3664 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3665 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3666 Builder.CreateBr(W0MergeBB);
3667
3668 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3669 Builder.CreateBr(W0MergeBB);
3670
3671 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3672
3673 if (NumIters > 1) {
3674 Cnt = Builder.CreateNSWAdd(
3675 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3676 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3677
3678 auto *CurFn = Builder.GetInsertBlock()->getParent();
3679 emitBranch(PrecondBB);
3680 emitBlock(ExitBB, CurFn);
3681 }
3682 RealTySize %= TySize;
3683 }
3684 }
3685
3686 Builder.CreateRetVoid();
3687 Builder.restoreIP(SavedIP);
3688
3689 return WcFunc;
3690}
3691
3692Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3693 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3694 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3695 LLVMContext &Ctx = M.getContext();
3696 FunctionType *FuncTy =
3697 FunctionType::get(Builder.getVoidTy(),
3698 {Builder.getPtrTy(), Builder.getInt16Ty(),
3699 Builder.getInt16Ty(), Builder.getInt16Ty()},
3700 /* IsVarArg */ false);
3701 Function *SarFunc =
3703 "_omp_reduction_shuffle_and_reduce_func", &M);
3704 SarFunc->setCallingConv(Config.getRuntimeCC());
3705 SarFunc->setAttributes(FuncAttrs);
3706 SarFunc->addParamAttr(0, Attribute::NoUndef);
3707 SarFunc->addParamAttr(1, Attribute::NoUndef);
3708 SarFunc->addParamAttr(2, Attribute::NoUndef);
3709 SarFunc->addParamAttr(3, Attribute::NoUndef);
3710 SarFunc->addParamAttr(1, Attribute::SExt);
3711 SarFunc->addParamAttr(2, Attribute::SExt);
3712 SarFunc->addParamAttr(3, Attribute::SExt);
3713 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3714 Builder.SetInsertPoint(EntryBB);
3715
3716 // Thread local Reduce list used to host the values of data to be reduced.
3717 Argument *ReduceListArg = SarFunc->getArg(0);
3718 // Current lane id; could be logical.
3719 Argument *LaneIDArg = SarFunc->getArg(1);
3720 // Offset of the remote source lane relative to the current lane.
3721 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3722 // Algorithm version. This is expected to be known at compile time.
3723 Argument *AlgoVerArg = SarFunc->getArg(3);
3724
3725 Type *ReduceListArgType = ReduceListArg->getType();
3726 Type *LaneIDArgType = LaneIDArg->getType();
3727 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3728 Value *ReduceListAlloca = Builder.CreateAlloca(
3729 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3730 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3731 LaneIDArg->getName() + ".addr");
3732 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3733 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3734 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3735 AlgoVerArg->getName() + ".addr");
3736 ArrayType *RedListArrayTy =
3737 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3738
3739 // Create a local thread-private variable to host the Reduce list
3740 // from a remote lane.
3741 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3742 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3743
3744 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3745 ReduceListAlloca, ReduceListArgType,
3746 ReduceListAlloca->getName() + ".ascast");
3747 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3748 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3749 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3750 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3751 RemoteLaneOffsetAlloca->getName() + ".ascast");
3752 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3753 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3754 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3755 RemoteReductionListAlloca, Builder.getPtrTy(),
3756 RemoteReductionListAlloca->getName() + ".ascast");
3757
3758 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3759 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3760 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3761 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3762
3763 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3764 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3765 Value *RemoteLaneOffset =
3766 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3767 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3768
3769 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3770
3771 // This loop iterates through the list of reduce elements and copies,
3772 // element by element, from a remote lane in the warp to RemoteReduceList,
3773 // hosted on the thread's stack.
3774 Error EmitRedLsCpRes = emitReductionListCopy(
3775 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3776 ReduceList, RemoteListAddrCast, IsByRef,
3777 {RemoteLaneOffset, nullptr, nullptr});
3778
3779 if (EmitRedLsCpRes)
3780 return EmitRedLsCpRes;
3781
3782 // The actions to be performed on the Remote Reduce list is dependent
3783 // on the algorithm version.
3784 //
3785 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3786 // LaneId % 2 == 0 && Offset > 0):
3787 // do the reduction value aggregation
3788 //
3789 // The thread local variable Reduce list is mutated in place to host the
3790 // reduced data, which is the aggregated value produced from local and
3791 // remote lanes.
3792 //
3793 // Note that AlgoVer is expected to be a constant integer known at compile
3794 // time.
3795 // When AlgoVer==0, the first conjunction evaluates to true, making
3796 // the entire predicate true during compile time.
3797 // When AlgoVer==1, the second conjunction has only the second part to be
3798 // evaluated during runtime. Other conjunctions evaluates to false
3799 // during compile time.
3800 // When AlgoVer==2, the third conjunction has only the second part to be
3801 // evaluated during runtime. Other conjunctions evaluates to false
3802 // during compile time.
3803 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3804 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3805 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3806 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3807 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3808 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3809 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3810 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3811 Value *RemoteOffsetComp =
3812 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3813 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3814 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3815 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3816
3817 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3818 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3819 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3820
3821 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3822 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3823 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3824 ReduceList, Builder.getPtrTy());
3825 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3826 RemoteListAddrCast, Builder.getPtrTy());
3827 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3828 ->addFnAttr(Attribute::NoUnwind);
3829 Builder.CreateBr(MergeBB);
3830
3831 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3832 Builder.CreateBr(MergeBB);
3833
3834 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3835
3836 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3837 // Reduce list.
3838 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3839 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3840 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3841
3842 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3843 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3844 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3845 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3846
3847 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3848
3849 EmitRedLsCpRes = emitReductionListCopy(
3850 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3851 RemoteListAddrCast, ReduceList, IsByRef);
3852
3853 if (EmitRedLsCpRes)
3854 return EmitRedLsCpRes;
3855
3856 Builder.CreateBr(CpyMergeBB);
3857
3858 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3859 Builder.CreateBr(CpyMergeBB);
3860
3861 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3862
3863 Builder.CreateRetVoid();
3864
3865 return SarFunc;
3866}
3867
3869OpenMPIRBuilder::generateReductionDescriptor(
3870 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3871 Type *DescriptorType,
3872 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3873 DataPtrPtrGen) {
3874
3875 // Copy the source descriptor to preserve all metadata (rank, extents,
3876 // strides, etc.)
3877 Value *DescriptorSize =
3878 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3879 Builder.CreateMemCpy(
3880 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3881 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3882 DescriptorSize);
3883
3884 // Update the base pointer field to point to the local shuffled data
3885 Value *DataPtrField;
3886 InsertPointOrErrorTy GenResult =
3887 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3888
3889 if (!GenResult)
3890 return GenResult.takeError();
3891
3892 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3893 DataPtr, Builder.getPtrTy(), ".ascast"),
3894 DataPtrField);
3895
3896 return Builder.saveIP();
3897}
3898
3899Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3900 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3901 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3902 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3903 LLVMContext &Ctx = M.getContext();
3904 FunctionType *FuncTy = FunctionType::get(
3905 Builder.getVoidTy(),
3906 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3907 /* IsVarArg */ false);
3908 Function *LtGCFunc =
3910 "_omp_reduction_list_to_global_copy_func", &M);
3911 LtGCFunc->setAttributes(FuncAttrs);
3912 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3913 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3914 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3915
3916 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3917 Builder.SetInsertPoint(EntryBlock);
3918
3919 // Buffer: global reduction buffer.
3920 Argument *BufferArg = LtGCFunc->getArg(0);
3921 // Idx: index of the buffer.
3922 Argument *IdxArg = LtGCFunc->getArg(1);
3923 // ReduceList: thread local Reduce list.
3924 Argument *ReduceListArg = LtGCFunc->getArg(2);
3925
3926 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3927 BufferArg->getName() + ".addr");
3928 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3929 IdxArg->getName() + ".addr");
3930 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3931 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3932 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3933 BufferArgAlloca, Builder.getPtrTy(),
3934 BufferArgAlloca->getName() + ".ascast");
3935 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3936 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3937 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3938 ReduceListArgAlloca, Builder.getPtrTy(),
3939 ReduceListArgAlloca->getName() + ".ascast");
3940
3941 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3942 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3943 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3944
3945 Value *LocalReduceList =
3946 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3947 Value *BufferArgVal =
3948 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3949 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3950 Type *IndexTy = Builder.getIndexTy(
3951 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3952 for (auto En : enumerate(ReductionInfos)) {
3953 const ReductionInfo &RI = En.value();
3954 auto *RedListArrayTy =
3955 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3956 // Reduce element = LocalReduceList[i]
3957 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3958 RedListArrayTy, LocalReduceList,
3959 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3960 // elemptr = ((CopyType*)(elemptrptr)) + I
3961 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3962
3963 // Global = Buffer.VD[Idx];
3964 Value *BufferVD =
3965 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3966 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3967 ReductionsBufferTy, BufferVD, 0, En.index());
3968
3969 switch (RI.EvaluationKind) {
3970 case EvalKind::Scalar: {
3971 Value *TargetElement;
3972
3973 if (IsByRef.empty() || !IsByRef[En.index()]) {
3974 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3975 } else {
3976 if (RI.DataPtrPtrGen) {
3977 InsertPointOrErrorTy GenResult =
3978 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3979
3980 if (!GenResult)
3981 return GenResult.takeError();
3982
3983 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3984 }
3985 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3986 }
3987
3988 Builder.CreateStore(TargetElement, GlobVal);
3989 break;
3990 }
3991 case EvalKind::Complex: {
3992 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3993 RI.ElementType, ElemPtr, 0, 0, ".realp");
3994 Value *SrcReal = Builder.CreateLoad(
3995 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3996 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3997 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3998 Value *SrcImg = Builder.CreateLoad(
3999 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4000
4001 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4002 RI.ElementType, GlobVal, 0, 0, ".realp");
4003 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4004 RI.ElementType, GlobVal, 0, 1, ".imagp");
4005 Builder.CreateStore(SrcReal, DestRealPtr);
4006 Builder.CreateStore(SrcImg, DestImgPtr);
4007 break;
4008 }
4009 case EvalKind::Aggregate: {
4010 Value *SizeVal =
4011 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4012 Builder.CreateMemCpy(
4013 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
4014 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
4015 break;
4016 }
4017 }
4018 }
4019
4020 Builder.CreateRetVoid();
4021 Builder.restoreIP(OldIP);
4022 return LtGCFunc;
4023}
4024
4025Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
4026 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4027 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4028 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4029 LLVMContext &Ctx = M.getContext();
4030 FunctionType *FuncTy = FunctionType::get(
4031 Builder.getVoidTy(),
4032 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4033 /* IsVarArg */ false);
4034 Function *LtGRFunc =
4036 "_omp_reduction_list_to_global_reduce_func", &M);
4037 LtGRFunc->setAttributes(FuncAttrs);
4038 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
4039 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
4040 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
4041
4042 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
4043 Builder.SetInsertPoint(EntryBlock);
4044
4045 // Buffer: global reduction buffer.
4046 Argument *BufferArg = LtGRFunc->getArg(0);
4047 // Idx: index of the buffer.
4048 Argument *IdxArg = LtGRFunc->getArg(1);
4049 // ReduceList: thread local Reduce list.
4050 Argument *ReduceListArg = LtGRFunc->getArg(2);
4051
4052 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4053 BufferArg->getName() + ".addr");
4054 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4055 IdxArg->getName() + ".addr");
4056 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4057 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4058 auto *RedListArrayTy =
4059 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4060
4061 // 1. Build a list of reduction variables.
4062 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4063 Value *LocalReduceList =
4064 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4065
4066 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4067
4068 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4069 BufferArgAlloca, Builder.getPtrTy(),
4070 BufferArgAlloca->getName() + ".ascast");
4071 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4072 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4073 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4074 ReduceListArgAlloca, Builder.getPtrTy(),
4075 ReduceListArgAlloca->getName() + ".ascast");
4076 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4077 LocalReduceList, Builder.getPtrTy(),
4078 LocalReduceList->getName() + ".ascast");
4079
4080 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4081 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4082 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4083
4084 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4085 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4086 Type *IndexTy = Builder.getIndexTy(
4087 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4088 for (auto En : enumerate(ReductionInfos)) {
4089 const ReductionInfo &RI = En.value();
4090
4091 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4092 RedListArrayTy, LocalReduceListAddrCast,
4093 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4094 Value *BufferVD =
4095 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4096 // Global = Buffer.VD[Idx];
4097 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4098 ReductionsBufferTy, BufferVD, 0, En.index());
4099
4100 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4101 InsertPointTy OldIP = Builder.saveIP();
4102 Builder.restoreIP(AllocaIP);
4103
4104 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4105 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4106 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4107
4108 Builder.restoreIP(OldIP);
4109
4110 // Get source descriptor from the reduce list argument
4111 Value *ReduceList =
4112 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4113 Value *SrcElementPtrPtr =
4114 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
4115 {ConstantInt::get(IndexTy, 0),
4116 ConstantInt::get(IndexTy, En.index())});
4117 Value *SrcDescriptorAddr =
4118 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4119
4120 // Copy descriptor from source and update base_ptr to global buffer data
4121 InsertPointOrErrorTy GenResult =
4122 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4123 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4124
4125 if (!GenResult)
4126 return GenResult.takeError();
4127
4128 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4129 } else {
4130 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4131 }
4132 }
4133
4134 // Call reduce_function(GlobalReduceList, ReduceList)
4135 Value *ReduceList =
4136 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4137 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
4138 ->addFnAttr(Attribute::NoUnwind);
4139 Builder.CreateRetVoid();
4140 Builder.restoreIP(OldIP);
4141 return LtGRFunc;
4142}
4143
4144Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
4145 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
4146 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4147 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4148 LLVMContext &Ctx = M.getContext();
4149 FunctionType *FuncTy = FunctionType::get(
4150 Builder.getVoidTy(),
4151 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4152 /* IsVarArg */ false);
4153 Function *GtLCFunc =
4155 "_omp_reduction_global_to_list_copy_func", &M);
4156 GtLCFunc->setAttributes(FuncAttrs);
4157 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
4158 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
4159 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
4160
4161 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
4162 Builder.SetInsertPoint(EntryBlock);
4163
4164 // Buffer: global reduction buffer.
4165 Argument *BufferArg = GtLCFunc->getArg(0);
4166 // Idx: index of the buffer.
4167 Argument *IdxArg = GtLCFunc->getArg(1);
4168 // ReduceList: thread local Reduce list.
4169 Argument *ReduceListArg = GtLCFunc->getArg(2);
4170
4171 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4172 BufferArg->getName() + ".addr");
4173 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4174 IdxArg->getName() + ".addr");
4175 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4176 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4177 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4178 BufferArgAlloca, Builder.getPtrTy(),
4179 BufferArgAlloca->getName() + ".ascast");
4180 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4181 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4182 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4183 ReduceListArgAlloca, Builder.getPtrTy(),
4184 ReduceListArgAlloca->getName() + ".ascast");
4185 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4186 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4187 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4188
4189 Value *LocalReduceList =
4190 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4191 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4192 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4193 Type *IndexTy = Builder.getIndexTy(
4194 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4195 for (auto En : enumerate(ReductionInfos)) {
4196 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4197 auto *RedListArrayTy =
4198 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4199 // Reduce element = LocalReduceList[i]
4200 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
4201 RedListArrayTy, LocalReduceList,
4202 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4203 // elemptr = ((CopyType*)(elemptrptr)) + I
4204 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
4205 // Global = Buffer.VD[Idx];
4206 Value *BufferVD =
4207 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4208 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4209 ReductionsBufferTy, BufferVD, 0, En.index());
4210
4211 switch (RI.EvaluationKind) {
4212 case EvalKind::Scalar: {
4213 Type *ElemType = RI.ElementType;
4214
4215 if (!IsByRef.empty() && IsByRef[En.index()]) {
4216 ElemType = RI.ByRefElementType;
4217 if (RI.DataPtrPtrGen) {
4218 InsertPointOrErrorTy GenResult =
4219 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
4220
4221 if (!GenResult)
4222 return GenResult.takeError();
4223
4224 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
4225 }
4226 }
4227
4228 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
4229 Builder.CreateStore(TargetElement, ElemPtr);
4230 break;
4231 }
4232 case EvalKind::Complex: {
4233 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
4234 RI.ElementType, GlobValPtr, 0, 0, ".realp");
4235 Value *SrcReal = Builder.CreateLoad(
4236 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
4237 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
4238 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
4239 Value *SrcImg = Builder.CreateLoad(
4240 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4241
4242 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4243 RI.ElementType, ElemPtr, 0, 0, ".realp");
4244 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4245 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4246 Builder.CreateStore(SrcReal, DestRealPtr);
4247 Builder.CreateStore(SrcImg, DestImgPtr);
4248 break;
4249 }
4250 case EvalKind::Aggregate: {
4251 Value *SizeVal =
4252 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4253 Builder.CreateMemCpy(
4254 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4255 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4256 SizeVal, false);
4257 break;
4258 }
4259 }
4260 }
4261
4262 Builder.CreateRetVoid();
4263 Builder.restoreIP(OldIP);
4264 return GtLCFunc;
4265}
4266
4267Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4268 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4269 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4270 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4271 LLVMContext &Ctx = M.getContext();
4272 auto *FuncTy = FunctionType::get(
4273 Builder.getVoidTy(),
4274 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4275 /* IsVarArg */ false);
4276 Function *GtLRFunc =
4278 "_omp_reduction_global_to_list_reduce_func", &M);
4279 GtLRFunc->setAttributes(FuncAttrs);
4280 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4281 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4282 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4283
4284 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4285 Builder.SetInsertPoint(EntryBlock);
4286
4287 // Buffer: global reduction buffer.
4288 Argument *BufferArg = GtLRFunc->getArg(0);
4289 // Idx: index of the buffer.
4290 Argument *IdxArg = GtLRFunc->getArg(1);
4291 // ReduceList: thread local Reduce list.
4292 Argument *ReduceListArg = GtLRFunc->getArg(2);
4293
4294 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4295 BufferArg->getName() + ".addr");
4296 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4297 IdxArg->getName() + ".addr");
4298 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4299 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4300 ArrayType *RedListArrayTy =
4301 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4302
4303 // 1. Build a list of reduction variables.
4304 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4305 Value *LocalReduceList =
4306 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4307
4308 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4309
4310 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4311 BufferArgAlloca, Builder.getPtrTy(),
4312 BufferArgAlloca->getName() + ".ascast");
4313 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4314 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4315 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4316 ReduceListArgAlloca, Builder.getPtrTy(),
4317 ReduceListArgAlloca->getName() + ".ascast");
4318 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4319 LocalReduceList, Builder.getPtrTy(),
4320 LocalReduceList->getName() + ".ascast");
4321
4322 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4323 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4324 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4325
4326 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4327 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4328 Type *IndexTy = Builder.getIndexTy(
4329 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4330 for (auto En : enumerate(ReductionInfos)) {
4331 const ReductionInfo &RI = En.value();
4332
4333 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4334 RedListArrayTy, ReductionList,
4335 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4336 // Global = Buffer.VD[Idx];
4337 Value *BufferVD =
4338 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4339 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4340 ReductionsBufferTy, BufferVD, 0, En.index());
4341
4342 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4343 InsertPointTy OldIP = Builder.saveIP();
4344 Builder.restoreIP(AllocaIP);
4345
4346 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4347 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4348 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4349
4350 Builder.restoreIP(OldIP);
4351
4352 // Get source descriptor from the reduce list
4353 Value *ReduceListVal =
4354 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4355 Value *SrcElementPtrPtr =
4356 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4357 {ConstantInt::get(IndexTy, 0),
4358 ConstantInt::get(IndexTy, En.index())});
4359 Value *SrcDescriptorAddr =
4360 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4361
4362 // Copy descriptor from source and update base_ptr to global buffer data
4363 InsertPointOrErrorTy GenResult =
4364 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4365 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4366 if (!GenResult)
4367 return GenResult.takeError();
4368
4369 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4370 } else {
4371 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4372 }
4373 }
4374
4375 // Call reduce_function(ReduceList, GlobalReduceList)
4376 Value *ReduceList =
4377 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4378 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4379 ->addFnAttr(Attribute::NoUnwind);
4380 Builder.CreateRetVoid();
4381 Builder.restoreIP(OldIP);
4382 return GtLRFunc;
4383}
4384
4385std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4386 std::string Suffix =
4387 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4388 return (Name + Suffix).str();
4389}
4390
4391Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4392 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4394 AttributeList FuncAttrs) {
4395 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4396 {Builder.getPtrTy(), Builder.getPtrTy()},
4397 /* IsVarArg */ false);
4398 std::string Name = getReductionFuncName(ReducerName);
4399 Function *ReductionFunc =
4401 ReductionFunc->setCallingConv(Config.getRuntimeCC());
4402 ReductionFunc->setAttributes(FuncAttrs);
4403 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4404 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4405 BasicBlock *EntryBB =
4406 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4407 Builder.SetInsertPoint(EntryBB);
4408
4409 // Need to alloca memory here and deal with the pointers before getting
4410 // LHS/RHS pointers out
4411 Value *LHSArrayPtr = nullptr;
4412 Value *RHSArrayPtr = nullptr;
4413 Argument *Arg0 = ReductionFunc->getArg(0);
4414 Argument *Arg1 = ReductionFunc->getArg(1);
4415 Type *Arg0Type = Arg0->getType();
4416 Type *Arg1Type = Arg1->getType();
4417
4418 Value *LHSAlloca =
4419 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4420 Value *RHSAlloca =
4421 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4422 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4423 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4424 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4425 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4426 Builder.CreateStore(Arg0, LHSAddrCast);
4427 Builder.CreateStore(Arg1, RHSAddrCast);
4428 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4429 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4430
4431 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4432 Type *IndexTy = Builder.getIndexTy(
4433 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4434 SmallVector<Value *> LHSPtrs, RHSPtrs;
4435 for (auto En : enumerate(ReductionInfos)) {
4436 const ReductionInfo &RI = En.value();
4437 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4438 RedArrayTy, RHSArrayPtr,
4439 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4440 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4441 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4442 RHSI8Ptr, RI.PrivateVariable->getType(),
4443 RHSI8Ptr->getName() + ".ascast");
4444
4445 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4446 RedArrayTy, LHSArrayPtr,
4447 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4448 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4449 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4450 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4451
4453 LHSPtrs.emplace_back(LHSPtr);
4454 RHSPtrs.emplace_back(RHSPtr);
4455 } else {
4456 Value *LHS = LHSPtr;
4457 Value *RHS = RHSPtr;
4458
4459 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4460 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4461 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4462 }
4463
4464 Value *Reduced;
4465 InsertPointOrErrorTy AfterIP =
4466 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4467 if (!AfterIP)
4468 return AfterIP.takeError();
4469 if (!Builder.GetInsertBlock())
4470 return ReductionFunc;
4471
4472 Builder.restoreIP(*AfterIP);
4473
4474 if (!IsByRef.empty() && !IsByRef[En.index()])
4475 Builder.CreateStore(Reduced, LHSPtr);
4476 }
4477 }
4478
4480 for (auto En : enumerate(ReductionInfos)) {
4481 unsigned Index = En.index();
4482 const ReductionInfo &RI = En.value();
4483 Value *LHSFixupPtr, *RHSFixupPtr;
4484 Builder.restoreIP(RI.ReductionGenClang(
4485 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4486
4487 // Fix the CallBack code genereated to use the correct Values for the LHS
4488 // and RHS
4489 LHSFixupPtr->replaceUsesWithIf(
4490 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4491 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4492 ReductionFunc;
4493 });
4494 RHSFixupPtr->replaceUsesWithIf(
4495 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4496 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4497 ReductionFunc;
4498 });
4499 }
4500
4501 Builder.CreateRetVoid();
4502 // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
4503 // to the entry block (this is dones for higher opt levels by later passes in
4504 // the pipeline). This has caused issues because non-entry `alloca`s force the
4505 // function to use dynamic stack allocations and we might run out of scratch
4506 // memory.
4507 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4508
4509 return ReductionFunc;
4510}
4511
4512static void
4514 bool IsGPU) {
4515 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4516 (void)RI;
4517 assert(RI.Variable && "expected non-null variable");
4518 assert(RI.PrivateVariable && "expected non-null private variable");
4519 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4520 "expected non-null reduction generator callback");
4521 if (!IsGPU) {
4522 assert(
4523 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4524 "expected variables and their private equivalents to have the same "
4525 "type");
4526 }
4527 assert(RI.Variable->getType()->isPointerTy() &&
4528 "expected variables to be pointers");
4529 }
4530}
4531
4533 const LocationDescription &Loc, InsertPointTy AllocaIP,
4534 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4535 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4536 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4537 unsigned ReductionBufNum, Value *SrcLocInfo) {
4538 if (!updateToLocation(Loc))
4539 return InsertPointTy();
4540 Builder.restoreIP(CodeGenIP);
4541 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4542 LLVMContext &Ctx = M.getContext();
4543
4544 // Source location for the ident struct
4545 if (!SrcLocInfo) {
4546 uint32_t SrcLocStrSize;
4547 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4548 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4549 }
4550
4551 if (ReductionInfos.size() == 0)
4552 return Builder.saveIP();
4553
4554 BasicBlock *ContinuationBlock = nullptr;
4556 // Copied code from createReductions
4557 BasicBlock *InsertBlock = Loc.IP.getBlock();
4558 ContinuationBlock =
4559 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4560 InsertBlock->getTerminator()->eraseFromParent();
4561 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4562 }
4563
4564 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4565 AttributeList FuncAttrs;
4566 AttrBuilder AttrBldr(Ctx);
4567 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4568 AttrBldr.addAttribute(Attr);
4569 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4570 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4571
4572 CodeGenIP = Builder.saveIP();
4573 Expected<Function *> ReductionResult = createReductionFunction(
4574 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4575 ReductionGenCBKind, FuncAttrs);
4576 if (!ReductionResult)
4577 return ReductionResult.takeError();
4578 Function *ReductionFunc = *ReductionResult;
4579 Builder.restoreIP(CodeGenIP);
4580
4581 // Set the grid value in the config needed for lowering later on
4582 if (GridValue.has_value())
4583 Config.setGridValue(GridValue.value());
4584 else
4585 Config.setGridValue(getGridValue(T, ReductionFunc));
4586
4587 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4588 // RedList, shuffle_reduce_func, interwarp_copy_func);
4589 // or
4590 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4591 Value *Res;
4592
4593 // 1. Build a list of reduction variables.
4594 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4595 auto Size = ReductionInfos.size();
4596 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4597 Type *FuncPtrTy =
4598 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4599 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4600 CodeGenIP = Builder.saveIP();
4601 Builder.restoreIP(AllocaIP);
4602 Value *ReductionListAlloca =
4603 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4604 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4605 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4606 Builder.restoreIP(CodeGenIP);
4607 Type *IndexTy = Builder.getIndexTy(
4608 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4609 for (auto En : enumerate(ReductionInfos)) {
4610 const ReductionInfo &RI = En.value();
4611 Value *ElemPtr = Builder.CreateInBoundsGEP(
4612 RedArrayTy, ReductionList,
4613 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4614
4615 Value *PrivateVar = RI.PrivateVariable;
4616 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4617 if (IsByRefElem)
4618 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4619
4620 Value *CastElem =
4621 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4622 Builder.CreateStore(CastElem, ElemPtr);
4623 }
4624 CodeGenIP = Builder.saveIP();
4625 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4626 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4627
4628 if (!SarFunc)
4629 return SarFunc.takeError();
4630
4631 Expected<Function *> CopyResult =
4632 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4633 if (!CopyResult)
4634 return CopyResult.takeError();
4635 Function *WcFunc = *CopyResult;
4636 Builder.restoreIP(CodeGenIP);
4637
4638 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4639
4640 // NOTE: ReductionDataSize is passed as the reduce_data_size
4641 // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
4642 // the runtime implementations do not currently use it. The teams
4643 // runtime reads ReductionDataSize from KernelEnvironmentTy instead
4644 // (set separately via TargetKernelDefaultAttrs). It is computed
4645 // here conservatively as max(element sizes) * N rather than the
4646 // exact sum, which over-calculates the size for mixed reduction
4647 // types but is harmless given the argument is unused.
4648 // TODO: Consider dropping this computation if the runtime API is
4649 // ever revised to remove the unused parameter.
4650 unsigned MaxDataSize = 0;
4651 SmallVector<Type *> ReductionTypeArgs;
4652 for (auto En : enumerate(ReductionInfos)) {
4653 // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
4654 // the actual data size stored in the global reduction buffer, consistent
4655 // with the ReductionsBufferTy struct used for GEP offsets below.
4656 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4657 ? En.value().ByRefElementType
4658 : En.value().ElementType;
4659 auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
4660 if (Size > MaxDataSize)
4661 MaxDataSize = Size;
4662 ReductionTypeArgs.emplace_back(RedTypeArg);
4663 }
4664 Value *ReductionDataSize =
4665 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4666 if (!IsTeamsReduction) {
4667 Value *SarFuncCast =
4668 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4669 Value *WcFuncCast =
4670 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4671 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4672 WcFuncCast};
4674 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4675 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4676 } else {
4677 CodeGenIP = Builder.saveIP();
4678 StructType *ReductionsBufferTy = StructType::create(
4679 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4680 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4681 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4682
4683 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4684 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4685 if (!LtGCFunc)
4686 return LtGCFunc.takeError();
4687
4688 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4689 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4690 if (!LtGRFunc)
4691 return LtGRFunc.takeError();
4692
4693 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4694 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4695 if (!GtLCFunc)
4696 return GtLCFunc.takeError();
4697
4698 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4699 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4700 if (!GtLRFunc)
4701 return GtLRFunc.takeError();
4702
4703 Builder.restoreIP(CodeGenIP);
4704
4705 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4706 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4707
4708 Value *Args3[] = {SrcLocInfo,
4709 KernelTeamsReductionPtr,
4710 Builder.getInt32(ReductionBufNum),
4711 ReductionDataSize,
4712 RL,
4713 *SarFunc,
4714 WcFunc,
4715 *LtGCFunc,
4716 *LtGRFunc,
4717 *GtLCFunc,
4718 *GtLRFunc};
4719
4720 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4721 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4722 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4723 }
4724
4725 // 5. Build if (res == 1)
4726 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4727 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4728 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4729 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4730
4731 // 6. Build then branch: where we have reduced values in the master
4732 // thread in each team.
4733 // __kmpc_end_reduce{_nowait}(<gtid>);
4734 // break;
4735 emitBlock(ThenBB, CurFunc);
4736
4737 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4738 for (auto En : enumerate(ReductionInfos)) {
4739 const ReductionInfo &RI = En.value();
4741 Value *RedValue = RI.Variable;
4742 Value *RHS =
4743 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4744
4746 Value *LHSPtr, *RHSPtr;
4747 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4748 &LHSPtr, &RHSPtr, CurFunc));
4749
4750 // Fix the CallBack code genereated to use the correct Values for the LHS
4751 // and RHS. Cast to match types before replacing (necessary to handle
4752 // different address spaces).
4753 if (LHSPtr->getType() != RedValue->getType())
4754 RedValue = Builder.CreatePointerBitCastOrAddrSpaceCast(
4755 RedValue, LHSPtr->getType());
4756 if (RHSPtr->getType() != RHS->getType())
4757 RHS =
4758 Builder.CreatePointerBitCastOrAddrSpaceCast(RHS, RHSPtr->getType());
4759
4760 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4761 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4762 ReductionFunc;
4763 });
4764 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4765 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4766 ReductionFunc;
4767 });
4768 } else {
4769 if (IsByRef.empty() || !IsByRef[En.index()]) {
4770 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4771 "red.value." + Twine(En.index()));
4772 }
4773 Value *PrivateRedValue = Builder.CreateLoad(
4774 ValueType, RHS, "red.private.value" + Twine(En.index()));
4775 Value *Reduced;
4776 InsertPointOrErrorTy AfterIP =
4777 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4778 if (!AfterIP)
4779 return AfterIP.takeError();
4780 Builder.restoreIP(*AfterIP);
4781
4782 if (!IsByRef.empty() && !IsByRef[En.index()])
4783 Builder.CreateStore(Reduced, RI.Variable);
4784 }
4785 }
4786 emitBlock(ExitBB, CurFunc);
4787 if (ContinuationBlock) {
4788 Builder.CreateBr(ContinuationBlock);
4789 Builder.SetInsertPoint(ContinuationBlock);
4790 }
4791 Config.setEmitLLVMUsed();
4792
4793 return Builder.saveIP();
4794}
4795
4797 Type *VoidTy = Type::getVoidTy(M.getContext());
4798 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4799 auto *FuncTy =
4800 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4802 ".omp.reduction.func", &M);
4803}
4804
4806 Function *ReductionFunc,
4808 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4809 Module *Module = ReductionFunc->getParent();
4810 BasicBlock *ReductionFuncBlock =
4811 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4812 Builder.SetInsertPoint(ReductionFuncBlock);
4813 Value *LHSArrayPtr = nullptr;
4814 Value *RHSArrayPtr = nullptr;
4815 if (IsGPU) {
4816 // Need to alloca memory here and deal with the pointers before getting
4817 // LHS/RHS pointers out
4818 //
4819 Argument *Arg0 = ReductionFunc->getArg(0);
4820 Argument *Arg1 = ReductionFunc->getArg(1);
4821 Type *Arg0Type = Arg0->getType();
4822 Type *Arg1Type = Arg1->getType();
4823
4824 Value *LHSAlloca =
4825 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4826 Value *RHSAlloca =
4827 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4828 Value *LHSAddrCast =
4829 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4830 Value *RHSAddrCast =
4831 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4832 Builder.CreateStore(Arg0, LHSAddrCast);
4833 Builder.CreateStore(Arg1, RHSAddrCast);
4834 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4835 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4836 } else {
4837 LHSArrayPtr = ReductionFunc->getArg(0);
4838 RHSArrayPtr = ReductionFunc->getArg(1);
4839 }
4840
4841 unsigned NumReductions = ReductionInfos.size();
4842 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4843
4844 for (auto En : enumerate(ReductionInfos)) {
4845 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4846 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4847 RedArrayTy, LHSArrayPtr, 0, En.index());
4848 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4849 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4850 LHSI8Ptr, RI.Variable->getType());
4851 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4852 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4853 RedArrayTy, RHSArrayPtr, 0, En.index());
4854 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4855 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4856 RHSI8Ptr, RI.PrivateVariable->getType());
4857 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4858 Value *Reduced;
4860 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4861 if (!AfterIP)
4862 return AfterIP.takeError();
4863
4864 Builder.restoreIP(*AfterIP);
4865 // TODO: Consider flagging an error.
4866 if (!Builder.GetInsertBlock())
4867 return Error::success();
4868
4869 // store is inside of the reduction region when using by-ref
4870 if (!IsByRef[En.index()])
4871 Builder.CreateStore(Reduced, LHSPtr);
4872 }
4873 Builder.CreateRetVoid();
4874 return Error::success();
4875}
4876
4878 const LocationDescription &Loc, InsertPointTy AllocaIP,
4879 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4880 bool IsNoWait, bool IsTeamsReduction) {
4881 assert(ReductionInfos.size() == IsByRef.size());
4882 if (Config.isGPU())
4883 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4884 IsByRef, IsNoWait, IsTeamsReduction);
4885
4886 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4887
4888 if (!updateToLocation(Loc))
4889 return InsertPointTy();
4890
4891 if (ReductionInfos.size() == 0)
4892 return Builder.saveIP();
4893
4894 BasicBlock *InsertBlock = Loc.IP.getBlock();
4895 BasicBlock *ContinuationBlock =
4896 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4897 InsertBlock->getTerminator()->eraseFromParent();
4898
4899 // Create and populate array of type-erased pointers to private reduction
4900 // values.
4901 unsigned NumReductions = ReductionInfos.size();
4902 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4903 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4904 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4905
4906 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4907
4908 for (auto En : enumerate(ReductionInfos)) {
4909 unsigned Index = En.index();
4910 const ReductionInfo &RI = En.value();
4911 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4912 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4913 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4914 }
4915
4916 // Emit a call to the runtime function that orchestrates the reduction.
4917 // Declare the reduction function in the process.
4918 Type *IndexTy = Builder.getIndexTy(
4919 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4920 Function *Func = Builder.GetInsertBlock()->getParent();
4921 Module *Module = Func->getParent();
4922 uint32_t SrcLocStrSize;
4923 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4924 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4925 return RI.AtomicReductionGen;
4926 });
4927 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4928 CanGenerateAtomic
4929 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4930 : IdentFlag(0));
4931 Value *ThreadId = getOrCreateThreadID(Ident);
4932 Constant *NumVariables = Builder.getInt32(NumReductions);
4933 const DataLayout &DL = Module->getDataLayout();
4934 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4935 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4936 Function *ReductionFunc = getFreshReductionFunc(*Module);
4937 Value *Lock = getOMPCriticalRegionLock(".reduction");
4939 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4940 : RuntimeFunction::OMPRTL___kmpc_reduce);
4941 CallInst *ReduceCall =
4942 createRuntimeFunctionCall(ReduceFunc,
4943 {Ident, ThreadId, NumVariables, RedArraySize,
4944 RedArray, ReductionFunc, Lock},
4945 "reduce");
4946
4947 // Create final reduction entry blocks for the atomic and non-atomic case.
4948 // Emit IR that dispatches control flow to one of the blocks based on the
4949 // reduction supporting the atomic mode.
4950 BasicBlock *NonAtomicRedBlock =
4951 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4952 BasicBlock *AtomicRedBlock =
4953 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4954 SwitchInst *Switch =
4955 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4956 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4957 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4958
4959 // Populate the non-atomic reduction using the elementwise reduction function.
4960 // This loads the elements from the global and private variables and reduces
4961 // them before storing back the result to the global variable.
4962 Builder.SetInsertPoint(NonAtomicRedBlock);
4963 for (auto En : enumerate(ReductionInfos)) {
4964 const ReductionInfo &RI = En.value();
4966 // We have one less load for by-ref case because that load is now inside of
4967 // the reduction region
4968 Value *RedValue = RI.Variable;
4969 if (!IsByRef[En.index()]) {
4970 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4971 "red.value." + Twine(En.index()));
4972 }
4973 Value *PrivateRedValue =
4974 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4975 "red.private.value." + Twine(En.index()));
4976 Value *Reduced;
4977 InsertPointOrErrorTy AfterIP =
4978 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4979 if (!AfterIP)
4980 return AfterIP.takeError();
4981 Builder.restoreIP(*AfterIP);
4982
4983 if (!Builder.GetInsertBlock())
4984 return InsertPointTy();
4985 // for by-ref case, the load is inside of the reduction region
4986 if (!IsByRef[En.index()])
4987 Builder.CreateStore(Reduced, RI.Variable);
4988 }
4989 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4990 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4991 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4992 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4993 Builder.CreateBr(ContinuationBlock);
4994
4995 // Populate the atomic reduction using the atomic elementwise reduction
4996 // function. There are no loads/stores here because they will be happening
4997 // inside the atomic elementwise reduction.
4998 Builder.SetInsertPoint(AtomicRedBlock);
4999 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
5000 for (const ReductionInfo &RI : ReductionInfos) {
5002 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
5003 if (!AfterIP)
5004 return AfterIP.takeError();
5005 Builder.restoreIP(*AfterIP);
5006 if (!Builder.GetInsertBlock())
5007 return InsertPointTy();
5008 }
5009 Builder.CreateBr(ContinuationBlock);
5010 } else {
5011 Builder.CreateUnreachable();
5012 }
5013
5014 // Populate the outlined reduction function using the elementwise reduction
5015 // function. Partial values are extracted from the type-erased array of
5016 // pointers to private variables.
5017 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
5018 IsByRef, /*isGPU=*/false);
5019 if (Err)
5020 return Err;
5021
5022 if (!Builder.GetInsertBlock())
5023 return InsertPointTy();
5024
5025 Builder.SetInsertPoint(ContinuationBlock);
5026 return Builder.saveIP();
5027}
5028
5031 BodyGenCallbackTy BodyGenCB,
5032 FinalizeCallbackTy FiniCB) {
 // Builds an inlined region for Directive::OMPD_master. The region is
 // bracketed by calls to __kmpc_master / __kmpc_end_master, and BodyGenCB and
 // FiniCB are forwarded to EmitOMPInlinedRegion.
 // If updateToLocation(Loc) fails, Loc.IP is returned before any of the calls
 // below are created.
5033 if (!updateToLocation(Loc))
5034 return Loc.IP;
5035
5036 Directive OMPD = Directive::OMPD_master;
5037 uint32_t SrcLocStrSize;
5038 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5039 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5040 Value *ThreadId = getOrCreateThreadID(Ident);
 // The entry and the exit runtime call share the same (ident, thread id)
 // argument list.
5041 Value *Args[] = {Ident, ThreadId};
5042
5043 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
5044 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5045
5046 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
5047 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
5048
 // The region is requested as conditional and with finalization enabled.
5049 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5050 /*Conditional*/ true, /*hasFinalize*/ true);
5051}
5052
5055 BodyGenCallbackTy BodyGenCB,
5056 FinalizeCallbackTy FiniCB, Value *Filter) {
 // Builds an inlined region for Directive::OMPD_masked. The entry call is
 // __kmpc_masked(ident, thread id, Filter); the exit call is
 // __kmpc_end_masked(ident, thread id). BodyGenCB and FiniCB are forwarded to
 // EmitOMPInlinedRegion.
 // If updateToLocation(Loc) fails, Loc.IP is returned before any of the calls
 // below are created.
5057 if (!updateToLocation(Loc))
5058 return Loc.IP;
5059
5060 Directive OMPD = Directive::OMPD_masked;
5061 uint32_t SrcLocStrSize;
5062 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
5063 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5064 Value *ThreadId = getOrCreateThreadID(Ident);
 // Only the entry call receives the filter value; the exit call does not.
5065 Value *Args[] = {Ident, ThreadId, Filter};
5066 Value *ArgsEnd[] = {Ident, ThreadId};
5067
5068 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
5069 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
5070
5071 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
5072 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
5073
 // The region is requested as conditional and with finalization enabled.
5074 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
5075 /*Conditional*/ true, /*hasFinalize*/ true);
5076}
5077
5079 llvm::FunctionCallee Callee,
5081 const llvm::Twine &Name) {
 // Creates a call to Callee with Args through Builder, passing an empty
 // operand-bundle list and naming the result Name, then marks the call as
 // non-throwing before returning it.
5082 llvm::CallInst *Call = Builder.CreateCall(
5083 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
5084 Call->setDoesNotThrow();
5085 return Call;
5086}
5087
5088// Expects the input basic block to be dominated by BeforeScanBB.
5089// Once the scan directive is encountered, the code after the scan directive
5090// should be dominated by AfterScanBB. The scan directive splits the code
5091// sequence into a scan phase and an input phase. Based on whether the inclusive
5092// or exclusive clause is used in the scan directive and whether the input loop
5093// or the scan loop is lowered, it adds jumps to the input and scan phase. The
5094// first scan loop is the input loop and the second is the scan loop. The code
5095// generated handles only inclusive scans now.
5097 const LocationDescription &Loc, InsertPointTy AllocaIP,
5098 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
5099 bool IsInclusive, ScanInfo *ScanRedInfo) {
 // While lowering the first (input) loop, emit the buffer declarations and
 // allocations before doing anything else; any error is propagated.
5100 if (ScanRedInfo->OMPFirstScanLoop) {
5101 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
5102 ScanVarsType, ScanRedInfo);
5103 if (Err)
5104 return Err;
5105 }
5106 if (!updateToLocation(Loc))
5107 return Loc.IP;
5108
5109 llvm::Value *IV = ScanRedInfo->IV;
5110
5111 if (ScanRedInfo->OMPFirstScanLoop) {
5112 // Emit buffer[i] = red; at the end of the input phase.
5113 for (size_t i = 0; i < ScanVars.size(); i++) {
 // ScanBuffPtrs maps each scan variable to the slot holding its buffer
 // pointer; load the buffer and index it with the current IV.
5114 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5115 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5116 Type *DestTy = ScanVarsType[i];
5117 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5118 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
5119
5120 Builder.CreateStore(Src, Val);
5121 }
5122 }
 // Terminate the current block with a jump to the loop-exit block, then start
 // emitting into the dispatch block.
5123 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5124 emitBlock(ScanRedInfo->OMPScanDispatch,
5125 Builder.GetInsertBlock()->getParent());
5126
5127 if (!ScanRedInfo->OMPFirstScanLoop) {
5128 IV = ScanRedInfo->IV;
5129 // Emit red = buffer[i]; at the entrance to the scan phase.
5130 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
5131 for (size_t i = 0; i < ScanVars.size(); i++) {
5132 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
5133 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5134 Type *DestTy = ScanVarsType[i];
5135 Value *SrcPtr =
5136 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5137 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
5138 Builder.CreateStore(Src, ScanVars[i]);
5139 }
5140 }
5141
5142 // TODO: Update it to CreateBr and remove dead blocks
 // The condition is the constant true, so only the first successor is ever
 // taken: the before-scan block when OMPFirstScanLoop == IsInclusive, and the
 // after-scan block otherwise.
5143 llvm::Value *CmpI = Builder.getInt1(true);
5144 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
5145 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
5146 ScanRedInfo->OMPAfterScanBlock);
5147 } else {
5148 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
5149 ScanRedInfo->OMPBeforeScanBlock);
5150 }
 // Continue code generation in the after-scan block and return that position.
5151 emitBlock(ScanRedInfo->OMPAfterScanBlock,
5152 Builder.GetInsertBlock()->getParent());
5153 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
5154 return Builder.saveIP();
5155}
5156
// Emits the declarations needed by a scan-based directive: one pointer-sized
// alloca per scan variable at AllocaIP (recorded in ScanRedInfo->ScanBuffPtrs),
// a masked region (filter value 0) that mallocs a buffer of Span + 1 elements
// per variable and stores it into that alloca, and a barrier after the region.
// Returns the first error produced by createMasked or createBarrier, otherwise
// success.
5157Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
5158 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
5159 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
5160
5161 Builder.restoreIP(AllocaIP);
5162 // Create the shared pointer at alloca IP.
5163 for (size_t i = 0; i < ScanVars.size(); i++) {
5164 llvm::Value *BuffPtr =
5165 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
5166 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
5167 }
5168
5169 // Allocate temporary buffer by master thread
5170 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5171 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5172 Builder.restoreIP(CodeGenIP);
 // One element more than the span is allocated for every scan variable.
5173 Value *AllocSpan =
5174 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
5175 for (size_t i = 0; i < ScanVars.size(); i++) {
 // Despite its name, IntPtrTy is the 32-bit integer type here; the element
 // size constant is truncated/bitcast to it before the malloc is built.
5176 Type *IntPtrTy = Builder.getInt32Ty();
5177 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
5178 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
5179 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
5180 AllocSpan, nullptr, "arr");
5181 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
5182 }
5183 return Error::success();
5184 };
5185 // TODO: Perform finalization actions for variables. This has to be
5186 // called for variables which have destructors/finalizers.
5187 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5188
 // The masked region is inserted right before the terminator of the scan-init
 // block.
5189 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
5190 llvm::Value *FilterVal = Builder.getInt32(0);
5192 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5193
5194 if (!AfterIP)
5195 return AfterIP.takeError();
5196 Builder.restoreIP(*AfterIP);
 // If the block after the masked region already ends in a terminator, emit
 // the barrier in front of that terminator rather than after it.
5197 BasicBlock *InputBB = Builder.GetInsertBlock();
5198 if (InputBB->hasTerminator())
5199 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5200 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5201 if (!AfterIP)
5202 return AfterIP.takeError();
5203 Builder.restoreIP(*AfterIP);
5204
5205 return Error::success();
5206}
5207
// Emits the finalization code of a scan-based directive. Inside a masked
// region (filter value 0), for every reduction the element at index Span of
// its scan buffer is copied to the original variable and the buffer is freed.
// A barrier follows the region. Returns the first error produced by
// createMasked or createBarrier, otherwise success.
5208Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
5209 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
5210 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5211 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5212 Builder.restoreIP(CodeGenIP);
5213 for (ReductionInfo RedInfo : ReductionInfos) {
 // The buffer slot is looked up by the reduction's private variable.
5214 Value *PrivateVar = RedInfo.PrivateVariable;
5215 Value *OrigVar = RedInfo.Variable;
5216 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
5217 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5218
5219 Type *SrcTy = RedInfo.ElementType;
5220 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
5221 "arrayOffset");
5222 Value *Src = Builder.CreateLoad(SrcTy, Val);
5223
5224 Builder.CreateStore(Src, OrigVar);
5225 Builder.CreateFree(Buff);
5226 }
5227 return Error::success();
5228 };
5229 // TODO: Perform finalization actions for variables. This has to be
5230 // called for variables which have destructors/finalizers.
5231 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5232
 // Insert before the terminator of the scan-finish block when it has one,
 // otherwise append to the block.
5233 if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
5234 Builder.SetInsertPoint(TI);
5235 else
5236 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
5237
5238 llvm::Value *FilterVal = Builder.getInt32(0);
5240 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5241
5242 if (!AfterIP)
5243 return AfterIP.takeError();
5244 Builder.restoreIP(*AfterIP);
 // If the block after the masked region already ends in a terminator, emit
 // the barrier in front of that terminator.
5245 BasicBlock *InputBB = Builder.GetInsertBlock();
5246 if (InputBB->hasTerminator())
5247 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
5248 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5249 if (!AfterIP)
5250 return AfterIP.takeError();
5251 Builder.restoreIP(*AfterIP);
5252 return Error::success();
5253}
5254
5256 const LocationDescription &Loc,
5258 ScanInfo *ScanRedInfo) {
5259
 // Emits, inside a masked region (filter value 0), a two-level loop nest that
 // combines the elements of every scan buffer in place, followed by a barrier
 // and the scan finalization code (emitScanBasedDirectiveFinalsIR).
5260 if (!updateToLocation(Loc))
5261 return Loc.IP;
5262 auto BodyGenCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
5263 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
5264 Builder.restoreIP(CodeGenIP);
5265 Function *CurFn = Builder.GetInsertBlock()->getParent();
5266 // for (int k = 0; k <= ceil(log2(n)); ++k)
 // Note: as emitted below, the outer loop body runs at least once and is
 // left when the incremented counter equals ceil(log2(Span)).
5267 llvm::BasicBlock *LoopBB =
5268 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
5269 llvm::BasicBlock *ExitBB =
5270 splitBB(Builder, false, "omp.outer.log.scan.exit");
5272 Builder.GetInsertBlock()->getModule(),
5273 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5274 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
 // LogVal = (i32) ceil(log2((double) Span)), computed with the log2 and ceil
 // intrinsics on doubles.
5275 llvm::Value *Arg =
5276 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5277 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
5279 Builder.GetInsertBlock()->getModule(),
5280 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5281 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5282 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
 // NMin1 = Span - 1, flagged as no-unsigned-wrap.
5283 llvm::Value *NMin1 = Builder.CreateNUWSub(
5284 ScanRedInfo->Span,
5285 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5286 Builder.SetInsertPoint(InputBB);
5287 Builder.CreateBr(LoopBB);
5288 emitBlock(LoopBB, CurFn);
5289 Builder.SetInsertPoint(LoopBB);
5290
 // Outer-loop state: Counter starts at 0 and Pow2K at 1 when entered from
 // InputBB; the back-edge values are added at the bottom of the loop.
5291 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5292 // size pow2k = 1;
5293 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5294 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5295 InputBB);
5296 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5297 InputBB);
5298 // for (size i = n - 1; i >= 2 ^ k; --i)
5299 // tmp[i] op= tmp[i-pow2k];
5300 llvm::BasicBlock *InnerLoopBB =
5301 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5302 llvm::BasicBlock *InnerExitBB =
5303 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
 // Skip the inner loop entirely when Span - 1 < Pow2K (unsigned compare).
5304 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5305 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5306 emitBlock(InnerLoopBB, CurFn);
5307 Builder.SetInsertPoint(InnerLoopBB);
5308 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5309 IVal->addIncoming(NMin1, LoopBB);
5310 for (ReductionInfo RedInfo : ReductionInfos) {
5311 Value *ReductionVal = RedInfo.PrivateVariable;
5312 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5313 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5314 Type *DestTy = RedInfo.ElementType;
 // The buffer is indexed at IVal + 1 (left operand and destination) and at
 // IVal + 1 - Pow2K (right operand).
5315 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5316 Value *LHSPtr =
5317 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5318 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5319 Value *RHSPtr =
5320 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5321 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5322 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5323 llvm::Value *Result;
 // NOTE(review): the insertion point returned by ReductionGen is only
 // checked for an error here; Builder is not restored to *AfterIP before
 // the store below. Confirm ReductionGen leaves Builder positioned
 // correctly when it creates new blocks.
5324 InsertPointOrErrorTy AfterIP =
5325 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5326 if (!AfterIP)
5327 return AfterIP.takeError();
5328 Builder.CreateStore(Result, LHSPtr);
5329 }
 // Inner-loop latch: decrement IVal and continue while it is still >= Pow2K.
5330 llvm::Value *NextIVal = Builder.CreateNUWSub(
5331 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5332 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5333 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5334 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5335 emitBlock(InnerExitBB, CurFn);
 // Outer-loop latch: ++Counter, Pow2K <<= 1, repeat until Counter == LogVal.
5336 llvm::Value *Next = Builder.CreateNUWAdd(
5337 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5338 Counter->addIncoming(Next, Builder.GetInsertBlock());
5339 // pow2k <<= 1;
5340 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5341 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5342 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5343 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5344 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5345 return Error::success();
5346 };
5347
5348 // TODO: Perform finalization actions for variables. This has to be
5349 // called for variables which have destructors/finalizers.
5350 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5351
5352 llvm::Value *FilterVal = Builder.getInt32(0);
5354 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5355
5356 if (!AfterIP)
5357 return AfterIP.takeError();
5358 Builder.restoreIP(*AfterIP);
5359 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5360
5361 if (!AfterIP)
5362 return AfterIP.takeError();
5363 Builder.restoreIP(*AfterIP);
 // The finalization code is emitted last; the value returned to the caller is
 // the insertion point produced by the barrier above.
5364 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5365 if (Err)
5366 return Err;
5367
5368 return AfterIP;
5369}
5370
// Drives the two-loop lowering of a scan-based directive: InputLoopGen is run
// with ScanRedInfo->OMPFirstScanLoop set to true, then ScanLoopGen is run with
// the flag set to false and the builder's current insertion point as its
// location. The first error returned by either callback is propagated and
// stops the sequence; otherwise success is returned.
5371Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5372 llvm::function_ref<Error()> InputLoopGen,
5373 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5374 ScanInfo *ScanRedInfo) {
5375
5376 {
5377 // Emit loop with input phase:
5378 // for (i: 0..<num_iters>) {
5379 // <input phase>;
5380 // buffer[i] = red;
5381 // }
5382 ScanRedInfo->OMPFirstScanLoop = true;
5383 Error Err = InputLoopGen();
5384 if (Err)
5385 return Err;
5386 }
5387 {
5388 // Emit loop with scan phase:
5389 // for (i: 0..<num_iters>) {
5390 // red = buffer[i];
5391 // <scan phase>;
5392 // }
5393 ScanRedInfo->OMPFirstScanLoop = false;
5394 Error Err = ScanLoopGen(Builder.saveIP());
5395 if (Err)
5396 return Err;
5397 }
5398 return Error::success();
5399}
5400
// Creates the four basic blocks used by the scan lowering (dispatch, after
// scan, before scan, loop exit) and records them in ScanRedInfo. The blocks
// are created in the context of the function currently being built but are
// not given a parent function here.
5401void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5402 Function *Fun = Builder.GetInsertBlock()->getParent();
5403 ScanRedInfo->OMPScanDispatch =
5404 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5405 ScanRedInfo->OMPAfterScanBlock =
5406 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5407 ScanRedInfo->OMPBeforeScanBlock =
5408 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5409 ScanRedInfo->OMPScanLoopExit =
5410 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5411}
5413 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5414 BasicBlock *PostInsertBefore, const Twine &Name) {
 // Builds an empty loop skeleton in F that counts an induction variable of
 // TripCount's type from 0 while it is unsigned-less-than TripCount:
 //   preheader -> header -> cond -> body -> inc (latch) -> header
 //                          cond -> exit -> after
 // The resulting blocks are recorded in a new CanonicalLoopInfo, which is
 // returned.
5415 Module *M = F->getParent();
5416 LLVMContext &Ctx = M->getContext();
5417 Type *IndVarTy = TripCount->getType();
5418
5419 // Create the basic block structure.
 // Preheader, header, cond and body are inserted before PreInsertBefore; the
 // latch, exit and after blocks are inserted before PostInsertBefore.
5420 BasicBlock *Preheader =
5421 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5422 BasicBlock *Header =
5423 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5424 BasicBlock *Cond =
5425 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5426 BasicBlock *Body =
5427 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5428 BasicBlock *Latch =
5429 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5430 BasicBlock *Exit =
5431 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5432 BasicBlock *After =
5433 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5434
5435 // Use specified DebugLoc for new instructions.
5436 Builder.SetCurrentDebugLocation(DL);
5437
5438 Builder.SetInsertPoint(Preheader);
5439 Builder.CreateBr(Header);
5440
 // The induction variable is a PHI in the header: 0 from the preheader, and
 // the incremented value from the latch (added further down).
5441 Builder.SetInsertPoint(Header);
5442 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5443 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5444 Builder.CreateBr(Cond);
5445
5446 Builder.SetInsertPoint(Cond);
5447 Value *Cmp =
5448 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5449 Builder.CreateCondBr(Cmp, Body, Exit);
5450
 // The body is left empty apart from its branch to the latch.
5451 Builder.SetInsertPoint(Body);
5452 Builder.CreateBr(Latch);
5453
5454 Builder.SetInsertPoint(Latch);
5455 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5456 "omp_" + Name + ".next", /*HasNUW=*/true);
5457 Builder.CreateBr(Header);
5458 IndVarPHI->addIncoming(Next, Latch);
5459
5460 Builder.SetInsertPoint(Exit);
5461 Builder.CreateBr(After);
5462
5463 // Remember and return the canonical control flow.
 // Only header, cond, latch and exit are stored explicitly on the new entry.
5464 LoopInfos.emplace_front();
5465 CanonicalLoopInfo *CL = &LoopInfos.front();
5466
5467 CL->Header = Header;
5468 CL->Cond = Cond;
5469 CL->Latch = Latch;
5470 CL->Exit = Exit;
5471
5472#ifndef NDEBUG
5473 CL->assertOK();
5474#endif
5475 return CL;
5476}
5477
5480 LoopBodyGenCallbackTy BodyGenCB,
5481 Value *TripCount, const Twine &Name) {
 // Creates a canonical loop with the given trip count at Loc: builds the loop
 // skeleton, connects it to the CFG at the insertion point (if Loc is set),
 // and then lets BodyGenCB fill in the body. An error from BodyGenCB is
 // returned to the caller; otherwise the new CanonicalLoopInfo is returned.
5482 BasicBlock *BB = Loc.IP.getBlock();
5483 BasicBlock *NextBB = BB->getNextNode();
5484
 // All skeleton blocks are placed in front of the block that currently
 // follows BB (NextBB is used for both insertion positions).
5485 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5486 NextBB, NextBB, Name);
5487 BasicBlock *After = CL->getAfter();
5488
5489 // If location is not set, don't connect the loop.
5490 if (updateToLocation(Loc)) {
5491 // Split the loop at the insertion point: Branch to the preheader and move
5492 // every following instruction to after the loop (the After BB). Also, the
5493 // new successor is the loop's after block.
5494 spliceBB(Builder, After, /*CreateBranch=*/false);
5495 Builder.CreateBr(CL->getPreheader());
5496 }
5497
5498 // Emit the body content. We do it after connecting the loop to the CFG to
5499 // avoid that the callback encounters degenerate BBs.
5500 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5501 return Err;
5502
5503#ifndef NDEBUG
5504 CL->assertOK();
5505#endif
5506 return CL;
5507}
5508
 // Default-constructs a new ScanInfo at the front of ScanInfos and returns a
 // pointer to it.
5510 ScanInfos.emplace_front();
5511 ScanInfo *Result = &ScanInfos.front();
5512 return Result;
5513}
5514
5518 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5519 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
 // Creates the two canonical loops of a scan-based directive (input loop and
 // scan loop) over the same Start/Stop/Step range and collects them in Result.
 // The trip count is stored in ScanRedInfo->Span and a "scan.init" block is
 // split off for later use.
 // The trip count is computed at ComputeIP when it is set, otherwise at Loc.
5520 LocationDescription ComputeLoc =
5521 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5522 updateToLocation(ComputeLoc);
5523
5525
5527 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5528 ScanRedInfo->Span = TripCount;
5529 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5530 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5531
 // Body generator shared by both loops. It creates a fresh set of scan blocks,
 // redirects the body block's single successor to the dispatch block, chains
 // before-scan -> loop-exit -> original successor, and then invokes the user
 // callback at the start of the before-scan block.
5532 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5533 Builder.restoreIP(CodeGenIP);
5534 ScanRedInfo->IV = IV;
5535 createScanBBs(ScanRedInfo);
5536 BasicBlock *InputBlock = Builder.GetInsertBlock();
5537 Instruction *Terminator = InputBlock->getTerminator();
5538 assert(Terminator->getNumSuccessors() == 1);
5539 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5540 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5541 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5542 Builder.GetInsertBlock()->getParent());
5543 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5544 emitBlock(ScanRedInfo->OMPScanLoopExit,
5545 Builder.GetInsertBlock()->getParent());
5546 Builder.CreateBr(ContinueBlock);
5547 Builder.SetInsertPoint(
5548 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5549 return BodyGenCB(Builder.saveIP(), IV);
5550 };
5551
 // First loop: created at the builder's current position; afterwards the
 // builder continues after that loop.
5552 const auto &&InputLoopGen = [&]() -> Error {
5554 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5555 ComputeIP, Name, true, ScanRedInfo);
5556 if (!LoopInfo)
5557 return LoopInfo.takeError();
5558 Result.push_back(*LoopInfo);
5559 Builder.restoreIP((*LoopInfo)->getAfterIP());
5560 return Error::success();
5561 };
 // Second loop: created at the location handed in by the driver; the block
 // following it is remembered as the scan-finish block.
5562 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5564 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5565 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5566 if (!LoopInfo)
5567 return LoopInfo.takeError();
5568 Result.push_back(*LoopInfo);
5569 Builder.restoreIP((*LoopInfo)->getAfterIP());
5570 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5571 return Error::success();
5572 };
5573 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5574 if (Err)
5575 return Err;
5576 return Result;
5577}
5578
5580 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5581 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5582
 // Emits IR computing the number of iterations of a loop running from Start
 // to Stop in increments of Step, and returns that value (named
 // "omp_<Name>.tripcount"). The result is 0 when the range is empty.
5583 // Consider the following difficulties (assuming 8-bit signed integers):
5584 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5585 // DO I = 1, 100, 50
5586 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
5587 // DO I = 100, 0, -128
5588
5589 // Start, Stop and Step must be of the same integer type.
5590 auto *IndVarTy = cast<IntegerType>(Start->getType());
5591 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5592 assert(IndVarTy == Step->getType() && "Step type mismatch");
5593
5595
5596 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5597 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5598
5599 // Like Step, but always positive.
5600 Value *Incr = Step;
5601
5602 // Distance between Start and Stop; always positive.
5603 Value *Span;
5604
5605 // Condition that is true when no iterations are executed at all, e.g.
5606 // because UB < LB.
5607 Value *ZeroCmp;
5608
5609 if (IsSigned) {
5610 // Ensure that increment is positive. If not, negate and invert LB and UB.
5611 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5612 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5613 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5614 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5615 Span = Builder.CreateSub(UB, LB, "", false, true);
 // With an inclusive stop the range is empty only if UB < LB; with an
 // exclusive stop it is also empty when UB == LB.
5616 ZeroCmp = Builder.CreateICmp(
5617 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5618 } else {
5619 Span = Builder.CreateSub(Stop, Start, "", true);
5620 ZeroCmp = Builder.CreateICmp(
5621 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5622 }
5623
 // Iteration count under the assumption that the loop runs at least once.
5624 Value *CountIfLooping;
5625 if (InclusiveStop) {
5626 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5627 } else {
5628 // Avoid incrementing past stop since it could overflow.
 // (Span - 1) / Incr + 1, except that exactly one iteration is reported when
 // Span <= Incr.
5629 Value *CountIfTwo = Builder.CreateAdd(
5630 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5631 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5632 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5633 }
5634
5635 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5636 "omp_" + Name + ".tripcount");
5637}
5638
5641 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5642 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5643 ScanInfo *ScanRedInfo) {
 // Creates a canonical loop for a Start/Stop/Step range by computing its trip
 // count and delegating to the trip-count based createCanonicalLoop. The user
 // callback receives Start + IV * Step rather than the normalized counter.
 // The trip count is computed at ComputeIP when it is set, otherwise at Loc.
5644 LocationDescription ComputeLoc =
5645 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5646
5648 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5649
5650 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5651 Builder.restoreIP(CodeGenIP);
 // Map the normalized induction variable back to the user's loop variable.
5652 Value *Span = Builder.CreateMul(IV, Step);
5653 Value *IndVar = Builder.CreateAdd(Span, Start);
 // In scan mode the user-visible induction value is also published through
 // ScanRedInfo->IV.
5654 if (InScan)
5655 ScanRedInfo->IV = IndVar;
5656 return BodyGenCB(Builder.saveIP(), IndVar);
5657 };
 // When the trip count was computed at a separate ComputeIP the loop itself
 // goes at Loc; otherwise it goes at the builder's current position, i.e.
 // after the trip-count computation.
5658 LocationDescription LoopLoc =
5659 ComputeIP.isSet()
5660 ? Loc
5661 : LocationDescription(Builder.saveIP(),
5662 Builder.getCurrentDebugLocation());
5663 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5664}
5665
5666// Returns an LLVM function to call for initializing loop bounds using OpenMP
5667// static scheduling for composite `distribute parallel for` depending on
5668// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5669// integers as unsigned similarly to CanonicalLoopInfo.
// Any other bit width reaches llvm_unreachable.
5670static FunctionCallee
5672 OpenMPIRBuilder &OMPBuilder) {
5673 unsigned Bitwidth = Ty->getIntegerBitWidth();
5674 if (Bitwidth == 32)
5675 return OMPBuilder.getOrCreateRuntimeFunction(
5676 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5677 if (Bitwidth == 64)
5678 return OMPBuilder.getOrCreateRuntimeFunction(
5679 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5680 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5681}
5682
5683// Returns an LLVM function to call for initializing loop bounds using OpenMP
5684// static scheduling depending on `type`. Only i32 and i64 are supported by the
5685// runtime. Always interpret integers as unsigned similarly to
5686// CanonicalLoopInfo.
// Any other bit width reaches llvm_unreachable.
5688 OpenMPIRBuilder &OMPBuilder) {
5689 unsigned Bitwidth = Ty->getIntegerBitWidth();
5690 if (Bitwidth == 32)
5691 return OMPBuilder.getOrCreateRuntimeFunction(
5692 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5693 if (Bitwidth == 64)
5694 return OMPBuilder.getOrCreateRuntimeFunction(
5695 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5696 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5697}
5698
// Rewrites the canonical loop CLI so that it iterates only over the bounds
// computed by the OpenMP static-init runtime function: the loop bounds are
// written to stack slots, the "init" function is called in the preheader, the
// loop's trip count and induction variable are adjusted to the returned
// bounds, the "fini" function is called in the exit block, and a barrier is
// added when NeedsBarrier is set. CLI is invalidated; the insertion point
// after the loop is returned, or the error produced while creating the
// barrier.
5699OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5700 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5701 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5702 OMPScheduleType DistScheduleSchedType) {
5703 assert(CLI->isValid() && "Requires a valid canonical loop");
5704 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5705 "Require dedicated allocate IP");
5706
5707 // Set up the source location value for OpenMP runtime.
5708 Builder.restoreIP(CLI->getPreheaderIP());
5709 Builder.SetCurrentDebugLocation(DL);
5710
5711 uint32_t SrcLocStrSize;
5712 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
 // Pick the ident flag(s) that describe the kind of work-sharing construct.
5714 switch (LoopType) {
5715 case WorksharingLoopType::ForStaticLoop:
5716 Flag = OMP_IDENT_FLAG_WORK_LOOP;
5717 break;
5718 case WorksharingLoopType::DistributeStaticLoop:
5719 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5720 break;
5721 case WorksharingLoopType::DistributeForStaticLoop:
5722 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
5723 break;
5724 }
5725 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5726
5727 // Declare useful OpenMP runtime functions.
5728 Value *IV = CLI->getIndVar();
5729 Type *IVTy = IV->getType();
 // The composite distribute-for loop uses the "dist" init variant; the "fini"
 // function is the same for all loop types.
5730 FunctionCallee StaticInit =
5731 LoopType == WorksharingLoopType::DistributeForStaticLoop
5732 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5733 : getKmpcForStaticInitForType(IVTy, M, *this);
5734 FunctionCallee StaticFini =
5735 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5736
5737 // Allocate space for computed loop bounds as expected by the "init" function.
5738 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5739
5740 Type *I32Type = Type::getInt32Ty(M.getContext());
5741 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5742 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5743 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5744 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5745 CLI->setLastIter(PLastIter);
5746
5747 // At the end of the preheader, prepare for calling the "init" function by
5748 // storing the current loop bounds into the allocated space. A canonical loop
5749 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5750 // and produces an inclusive upper bound.
5751 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5752 Constant *Zero = ConstantInt::get(IVTy, 0);
5753 Constant *One = ConstantInt::get(IVTy, 1);
5754 Builder.CreateStore(Zero, PLowerBound);
5755 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5756 Builder.CreateStore(UpperBound, PUpperBound);
5757 Builder.CreateStore(One, PStride);
5758
 // The thread id is obtained through an ident created without the work-sharing
 // flag used for SrcLoc.
5759 Value *ThreadNum =
5760 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5761
5762 OMPScheduleType SchedType =
5763 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5764 ? OMPScheduleType::OrderedDistribute
5766 Constant *SchedulingType =
5767 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5768
5769 // Call the "init" function and update the trip count of the loop with the
5770 // value it produced.
 // Argument order: ident, thread id, schedule type, &lastiter, &lower, &upper,
 // [&distupper for the composite distribute-for loop], &stride, 1, 0.
5771 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5772 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5773 this](Value *SchedulingType, auto &Builder) {
5774 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5775 PLowerBound, PUpperBound});
5776 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
 // NOTE(review): this alloca is created wherever Builder currently points
 // when the lambda runs, not at the AllocaIP used for the other bound
 // slots - confirm this is intended.
5777 Value *PDistUpperBound =
5778 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5779 Args.push_back(PDistUpperBound);
5780 }
5781 Args.append({PStride, One, Zero});
5782 createRuntimeFunctionCall(StaticInit, Args);
5783 };
5784 BuildInitCall(SchedulingType, Builder);
5785 if (HasDistSchedule &&
5786 LoopType != WorksharingLoopType::DistributeStaticLoop) {
 // Note: this local constant shadows the DistScheduleSchedType parameter.
5787 Constant *DistScheduleSchedType = ConstantInt::get(
5788 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5789 // We want to emit a second init function call for the dist_schedule clause
5790 // to the Distribute construct. This should only be done however if a
5791 // Workshare Loop is nested within a Distribute Construct
5792 BuildInitCall(DistScheduleSchedType, Builder);
5793 }
 // Read back the bounds written by "init"; the new trip count is
 // (upper - lower) + 1 because the upper bound is inclusive.
5794 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5795 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5796 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5797 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5798 CLI->setTripCount(TripCount);
5799
5800 // Update all uses of the induction variable except the one in the condition
5801 // block that compares it with the actual upper bound, and the increment in
5802 // the latch block.
5803
 // The replacement value is OldIV + LowerBound, computed at the start of the
 // loop body.
5804 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5805 Builder.SetInsertPoint(CLI->getBody(),
5806 CLI->getBody()->getFirstInsertionPt());
5807 Builder.SetCurrentDebugLocation(DL);
5808 return Builder.CreateAdd(OldIV, LowerBound);
5809 });
5810
5811 // In the "exit" block, call the "fini" function.
5812 Builder.SetInsertPoint(CLI->getExit(),
5813 CLI->getExit()->getTerminator()->getIterator());
5814 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5815
5816 // Add the barrier if requested.
5817 if (NeedsBarrier) {
5818 InsertPointOrErrorTy BarrierIP =
5820 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5821 /* CheckCancelFlag */ false);
5822 if (!BarrierIP)
5823 return BarrierIP.takeError();
5824 }
5825
 // Capture the after-loop position before the loop info is invalidated.
5826 InsertPointTy AfterIP = CLI->getAfterIP();
5827 CLI->invalidate();
5828
5829 return AfterIP;
5830}
5831
5832static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5833 LoopInfo &LI);
5834static void addLoopMetadata(CanonicalLoopInfo *Loop,
5835 ArrayRef<Metadata *> Properties);
5836
5838 LLVMContext &Ctx, Loop *Loop,
5840 SmallVector<Metadata *> &LoopMDList) {
5841 SmallSet<BasicBlock *, 8> Reachable;
5842
5843 // Get the basic blocks from the loop in which memref instructions
5844 // can be found.
5845 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5846 // preferably without running any passes.
5847 for (BasicBlock *Block : Loop->getBlocks()) {
5848 if (Block == CLI->getCond() || Block == CLI->getHeader())
5849 continue;
5850 Reachable.insert(Block);
5851 }
5852
5853 // Add access group metadata to memory-access instructions.
5854 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5855 for (BasicBlock *BB : Reachable)
5856 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5857 // TODO: If the loop has existing parallel access metadata, have
5858 // to combine two lists.
5859 LoopMDList.push_back(MDNode::get(
5860 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5861}
5862
5864OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5865 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5866 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5867 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5868 assert(CLI->isValid() && "Requires a valid canonical loop");
5869 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5870
5871 LLVMContext &Ctx = CLI->getFunction()->getContext();
5872 Value *IV = CLI->getIndVar();
5873 Value *OrigTripCount = CLI->getTripCount();
5874 Type *IVTy = IV->getType();
5875 assert(IVTy->getIntegerBitWidth() <= 64 &&
5876 "Max supported tripcount bitwidth is 64 bits");
5877 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5878 : Type::getInt64Ty(Ctx);
5879 Type *I32Type = Type::getInt32Ty(M.getContext());
5880 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5881 Constant *One = ConstantInt::get(InternalIVTy, 1);
5882
5883 Function *F = CLI->getFunction();
5884 // Blocks must have terminators.
5885 // FIXME: Don't run analyses on incomplete/invalid IR.
5887 for (BasicBlock &BB : *F)
5888 if (!BB.hasTerminator())
5889 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5891 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5892 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5893 LoopAnalysis LIA;
5894 LoopInfo &&LI = LIA.run(*F, FAM);
5895 for (Instruction *I : UIs)
5896 I->eraseFromParent();
5897 Loop *L = LI.getLoopFor(CLI->getHeader());
5898 SmallVector<Metadata *> LoopMDList;
5899 if (ChunkSize || DistScheduleChunkSize)
5900 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5901 addLoopMetadata(CLI, LoopMDList);
5902
5903 // Declare useful OpenMP runtime functions.
5904 FunctionCallee StaticInit =
5905 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5906 FunctionCallee StaticFini =
5907 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5908
5909 // Allocate space for computed loop bounds as expected by the "init" function.
5910 Builder.restoreIP(AllocaIP);
5911 Builder.SetCurrentDebugLocation(DL);
5912 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5913 Value *PLowerBound =
5914 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5915 Value *PUpperBound =
5916 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5917 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5918 CLI->setLastIter(PLastIter);
5919
5920 // Set up the source location value for the OpenMP runtime.
5921 Builder.restoreIP(CLI->getPreheaderIP());
5922 Builder.SetCurrentDebugLocation(DL);
5923
5924 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5925 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5926 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5927 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5928 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5929 "distschedulechunksize");
5930 Value *CastedTripCount =
5931 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5932
5933 Constant *SchedulingType =
5934 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5935 Constant *DistSchedulingType =
5936 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5937 Builder.CreateStore(Zero, PLowerBound);
5938 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5939 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5940 Value *UpperBound =
5941 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5942 Builder.CreateStore(UpperBound, PUpperBound);
5943 Builder.CreateStore(One, PStride);
5944
5945 // Call the "init" function and update the trip count of the loop with the
5946 // value it produced.
5947 uint32_t SrcLocStrSize;
5948 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5949 IdentFlag Flag = OMP_IDENT_FLAG_WORK_LOOP;
5950 if (DistScheduleSchedType != OMPScheduleType::None) {
5951 Flag |= OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5952 }
5953 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5954 Value *ThreadNum =
5955 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5956 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5957 PUpperBound, PStride, One,
5958 this](Value *SchedulingType, Value *ChunkSize,
5959 auto &Builder) {
5961 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5962 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5963 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5964 /*pstride=*/PStride, /*incr=*/One,
5965 /*chunk=*/ChunkSize});
5966 };
5967 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5968 if (DistScheduleSchedType != OMPScheduleType::None &&
5969 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5970 SchedType != OMPScheduleType::OrderedDistribute) {
5971 // We want to emit a second init function call for the dist_schedule clause
5972 // to the Distribute construct. This should only be done however if a
5973 // Workshare Loop is nested within a Distribute Construct
5974 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5975 }
5976
5977 // Load values written by the "init" function.
5978 Value *FirstChunkStart =
5979 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5980 Value *FirstChunkStop =
5981 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5982 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5983 Value *ChunkRange =
5984 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5985 Value *NextChunkStride =
5986 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5987
5988 // Create outer "dispatch" loop for enumerating the chunks.
5989 BasicBlock *DispatchEnter = splitBB(Builder, true);
5990 Value *DispatchCounter;
5991
5992 // It is safe to assume this didn't return an error because the callback
5993 // passed into createCanonicalLoop is the only possible error source, and it
5994 // always returns success.
5995 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5996 {Builder.saveIP(), DL},
5997 [&](InsertPointTy BodyIP, Value *Counter) {
5998 DispatchCounter = Counter;
5999 return Error::success();
6000 },
6001 FirstChunkStart, CastedTripCount, NextChunkStride,
6002 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
6003 "dispatch"));
6004
6005 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
6006 // not have to preserve the canonical invariant.
6007 BasicBlock *DispatchBody = DispatchCLI->getBody();
6008 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
6009 BasicBlock *DispatchExit = DispatchCLI->getExit();
6010 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
6011 DispatchCLI->invalidate();
6012
6013 // Rewire the original loop to become the chunk loop inside the dispatch loop.
6014 redirectTo(DispatchAfter, CLI->getAfter(), DL);
6015 redirectTo(CLI->getExit(), DispatchLatch, DL);
6016 redirectTo(DispatchBody, DispatchEnter, DL);
6017
6018 // Prepare the prolog of the chunk loop.
6019 Builder.restoreIP(CLI->getPreheaderIP());
6020 Builder.SetCurrentDebugLocation(DL);
6021
6022 // Compute the number of iterations of the chunk loop.
6023 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
6024 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
6025 Value *IsLastChunk =
6026 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
6027 Value *CountUntilOrigTripCount =
6028 Builder.CreateSub(CastedTripCount, DispatchCounter);
6029 Value *ChunkTripCount = Builder.CreateSelect(
6030 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
6031 Value *BackcastedChunkTC =
6032 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
6033 CLI->setTripCount(BackcastedChunkTC);
6034
6035 // Update all uses of the induction variable except the one in the condition
6036 // block that compares it with the actual upper bound, and the increment in
6037 // the latch block.
6038 Value *BackcastedDispatchCounter =
6039 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
6040 CLI->mapIndVar([&](Instruction *) -> Value * {
6041 Builder.restoreIP(CLI->getBodyIP());
6042 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
6043 });
6044
6045 // In the "exit" block, call the "fini" function.
6046 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
6047 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
6048
6049 // Add the barrier if requested.
6050 if (NeedsBarrier) {
6051 InsertPointOrErrorTy AfterIP =
6052 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
6053 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
6054 if (!AfterIP)
6055 return AfterIP.takeError();
6056 }
6057
6058#ifndef NDEBUG
6059 // Even though we currently do not support applying additional methods to it,
6060 // the chunk loop should remain a canonical loop.
6061 CLI->assertOK();
6062#endif
6063
6064 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
6065}
6066
6067// Returns an LLVM function to call for executing an OpenMP static worksharing
6068// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
6069// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
6070static FunctionCallee
6072 WorksharingLoopType LoopType) {
6073 unsigned Bitwidth = Ty->getIntegerBitWidth();
6074 Module &M = OMPBuilder->M;
6075 switch (LoopType) {
6076 case WorksharingLoopType::ForStaticLoop:
6077 if (Bitwidth == 32)
6078 return OMPBuilder->getOrCreateRuntimeFunction(
6079 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
6080 if (Bitwidth == 64)
6081 return OMPBuilder->getOrCreateRuntimeFunction(
6082 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
6083 break;
6084 case WorksharingLoopType::DistributeStaticLoop:
6085 if (Bitwidth == 32)
6086 return OMPBuilder->getOrCreateRuntimeFunction(
6087 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
6088 if (Bitwidth == 64)
6089 return OMPBuilder->getOrCreateRuntimeFunction(
6090 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
6091 break;
6092 case WorksharingLoopType::DistributeForStaticLoop:
6093 if (Bitwidth == 32)
6094 return OMPBuilder->getOrCreateRuntimeFunction(
6095 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
6096 if (Bitwidth == 64)
6097 return OMPBuilder->getOrCreateRuntimeFunction(
6098 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
6099 break;
6100 }
6101 if (Bitwidth != 32 && Bitwidth != 64) {
6102 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
6103 }
6104 llvm_unreachable("Unknown type of OpenMP worksharing loop");
6105}
6106
6107// Inserts a call to proper OpenMP Device RTL function which handles
6108// loop worksharing.
6110 WorksharingLoopType LoopType,
6111 BasicBlock *InsertBlock, Value *Ident,
6112 Value *LoopBodyArg, Value *TripCount,
6113 Function &LoopBodyFn, bool NoLoop) {
6114 Type *TripCountTy = TripCount->getType();
6115 Module &M = OMPBuilder->M;
6116 IRBuilder<> &Builder = OMPBuilder->Builder;
6117 FunctionCallee RTLFn =
6118 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
6119 SmallVector<Value *, 8> RealArgs;
6120 RealArgs.push_back(Ident);
6121 RealArgs.push_back(&LoopBodyFn);
6122 RealArgs.push_back(LoopBodyArg);
6123 RealArgs.push_back(TripCount);
6124 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
6125 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6126 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6127 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6128 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6129 return;
6130 }
6131 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
6132 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
6133 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
6134 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
6135
6136 RealArgs.push_back(
6137 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
6138 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6139 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
6140 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
6141 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
6142 } else {
6143 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
6144 }
6145
6146 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
6147}
6148
6150 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
6151 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
6152 WorksharingLoopType LoopType, bool NoLoop) {
6153 IRBuilder<> &Builder = OMPIRBuilder->Builder;
6154 BasicBlock *Preheader = CLI->getPreheader();
6155 Value *TripCount = CLI->getTripCount();
6156
6157 // After loop body outling, the loop body contains only set up
6158 // of loop body argument structure and the call to the outlined
6159 // loop body function. Firstly, we need to move setup of loop body args
6160 // into loop preheader.
6161 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
6162 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
6163
6164 // The next step is to remove the whole loop. We do not it need anymore.
6165 // That's why make an unconditional branch from loop preheader to loop
6166 // exit block
6167 Builder.restoreIP({Preheader, Preheader->end()});
6168 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
6169 Preheader->getTerminator()->eraseFromParent();
6170 Builder.CreateBr(CLI->getExit());
6171
6172 // Delete dead loop blocks
6173 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
6174 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
6175 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
6176 CleanUpInfo.EntryBB = CLI->getHeader();
6177 CleanUpInfo.ExitBB = CLI->getExit();
6178 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
6179 DeleteDeadBlocks(BlocksToBeRemoved);
6180
6181 // Find the instruction which corresponds to loop body argument structure
6182 // and remove the call to loop body function instruction.
6183 Value *LoopBodyArg;
6184 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
6185 assert(OutlinedFnUser &&
6186 "Expected unique undroppable user of outlined function");
6187 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
6188 assert(OutlinedFnCallInstruction && "Expected outlined function call");
6189 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
6190 "Expected outlined function call to be located in loop preheader");
6191 // Check in case no argument structure has been passed.
6192 if (OutlinedFnCallInstruction->arg_size() > 1)
6193 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
6194 else
6195 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
6196 OutlinedFnCallInstruction->eraseFromParent();
6197
6198 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
6199 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
6200
6201 for (auto &ToBeDeletedItem : ToBeDeleted)
6202 ToBeDeletedItem->eraseFromParent();
6203 CLI->invalidate();
6204}
6205
6206OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
6207 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
6208 WorksharingLoopType LoopType, bool NoLoop) {
6209 uint32_t SrcLocStrSize;
6210 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6212 switch (LoopType) {
6213 case WorksharingLoopType::ForStaticLoop:
6214 Flag = OMP_IDENT_FLAG_WORK_LOOP;
6215 break;
6216 case WorksharingLoopType::DistributeStaticLoop:
6217 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
6218 break;
6219 case WorksharingLoopType::DistributeForStaticLoop:
6220 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
6221 break;
6222 }
6223 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
6224
6225 auto OI = std::make_unique<OutlineInfo>();
6226 OI->OuterAllocBB = CLI->getPreheader();
6227 Function *OuterFn = CLI->getPreheader()->getParent();
6228
6229 // Instructions which need to be deleted at the end of code generation
6230 SmallVector<Instruction *, 4> ToBeDeleted;
6231
6232 OI->OuterAllocBB = AllocaIP.getBlock();
6233
6234 // Mark the body loop as region which needs to be extracted
6235 OI->EntryBB = CLI->getBody();
6236 OI->ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
6237 "omp.prelatch");
6238
6239 // Prepare loop body for extraction
6240 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
6241
6242 // Insert new loop counter variable which will be used only in loop
6243 // body.
6244 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
6245 Instruction *NewLoopCntLoad =
6246 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
6247 // New loop counter instructions are redundant in the loop preheader when
6248 // code generation for workshare loop is finshed. That's why mark them as
6249 // ready for deletion.
6250 ToBeDeleted.push_back(NewLoopCntLoad);
6251 ToBeDeleted.push_back(NewLoopCnt);
6252
6253 // Analyse loop body region. Find all input variables which are used inside
6254 // loop body region.
6255 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
6257 OI->collectBlocks(ParallelRegionBlockSet, Blocks);
6258
6259 CodeExtractorAnalysisCache CEAC(*OuterFn);
6260 CodeExtractor Extractor(Blocks,
6261 /* DominatorTree */ nullptr,
6262 /* AggregateArgs */ true,
6263 /* BlockFrequencyInfo */ nullptr,
6264 /* BranchProbabilityInfo */ nullptr,
6265 /* AssumptionCache */ nullptr,
6266 /* AllowVarArgs */ true,
6267 /* AllowAlloca */ true,
6268 /* AllocationBlock */ CLI->getPreheader(),
6269 /* DeallocationBlocks */ {},
6270 /* Suffix */ ".omp_wsloop",
6271 /* AggrArgsIn0AddrSpace */ true);
6272
6273 BasicBlock *CommonExit = nullptr;
6274 SetVector<Value *> SinkingCands, HoistingCands;
6275
6276 // Find allocas outside the loop body region which are used inside loop
6277 // body
6278 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
6279
6280 // We need to model loop body region as the function f(cnt, loop_arg).
6281 // That's why we replace loop induction variable by the new counter
6282 // which will be one of loop body function argument
6284 CLI->getIndVar()->user_end());
6285 for (auto Use : Users) {
6286 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
6287 if (ParallelRegionBlockSet.count(Inst->getParent())) {
6288 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
6289 }
6290 }
6291 }
6292 // Make sure that loop counter variable is not merged into loop body
6293 // function argument structure and it is passed as separate variable
6294 OI->ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
6295
6296 // PostOutline CB is invoked when loop body function is outlined and
6297 // loop body is replaced by call to outlined function. We need to add
6298 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
6299 // function will handle loop control logic.
6300 //
6301 OI->PostOutlineCB = [=, ToBeDeletedVec =
6302 std::move(ToBeDeleted)](Function &OutlinedFn) {
6303 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6304 LoopType, NoLoop);
6305 };
6306 addOutlineInfo(std::move(OI));
6307 return CLI->getAfterIP();
6308}
6309
6312 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6313 bool HasSimdModifier, bool HasMonotonicModifier,
6314 bool HasNonmonotonicModifier, bool HasOrderedClause,
6315 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6316 Value *DistScheduleChunkSize) {
6317 if (Config.isTargetDevice())
6318 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6319 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6320 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6321 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6322
6323 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6324 OMPScheduleType::ModifierOrdered;
6325 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6326 if (HasDistSchedule) {
6327 DistScheduleSchedType = DistScheduleChunkSize
6328 ? OMPScheduleType::OrderedDistributeChunked
6329 : OMPScheduleType::OrderedDistribute;
6330 }
6331 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6332 case OMPScheduleType::BaseStatic:
6333 case OMPScheduleType::BaseDistribute:
6334 assert((!ChunkSize || !DistScheduleChunkSize) &&
6335 "No chunk size with static-chunked schedule");
6336 if (IsOrdered && !HasDistSchedule)
6337 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6338 NeedsBarrier, ChunkSize);
6339 // FIXME: Monotonicity ignored?
6340 if (DistScheduleChunkSize)
6341 return applyStaticChunkedWorkshareLoop(
6342 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6343 DistScheduleChunkSize, DistScheduleSchedType);
6344 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6345 HasDistSchedule);
6346
6347 case OMPScheduleType::BaseStaticChunked:
6348 case OMPScheduleType::BaseDistributeChunked:
6349 if (IsOrdered && !HasDistSchedule)
6350 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6351 NeedsBarrier, ChunkSize);
6352 // FIXME: Monotonicity ignored?
6353 return applyStaticChunkedWorkshareLoop(
6354 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6355 DistScheduleChunkSize, DistScheduleSchedType);
6356
6357 case OMPScheduleType::BaseRuntime:
6358 case OMPScheduleType::BaseAuto:
6359 case OMPScheduleType::BaseGreedy:
6360 case OMPScheduleType::BaseBalanced:
6361 case OMPScheduleType::BaseSteal:
6362 case OMPScheduleType::BaseRuntimeSimd:
6363 assert(!ChunkSize &&
6364 "schedule type does not support user-defined chunk sizes");
6365 [[fallthrough]];
6366 case OMPScheduleType::BaseGuidedSimd:
6367 case OMPScheduleType::BaseDynamicChunked:
6368 case OMPScheduleType::BaseGuidedChunked:
6369 case OMPScheduleType::BaseGuidedIterativeChunked:
6370 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6371 case OMPScheduleType::BaseStaticBalancedChunked:
6372 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6373 NeedsBarrier, ChunkSize);
6374
6375 default:
6376 llvm_unreachable("Unknown/unimplemented schedule kind");
6377 }
6378}
6379
6380/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6381/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6382/// the runtime. Always interpret integers as unsigned similarly to
6383/// CanonicalLoopInfo.
6384static FunctionCallee
6386 unsigned Bitwidth = Ty->getIntegerBitWidth();
6387 if (Bitwidth == 32)
6388 return OMPBuilder.getOrCreateRuntimeFunction(
6389 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6390 if (Bitwidth == 64)
6391 return OMPBuilder.getOrCreateRuntimeFunction(
6392 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6393 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6394}
6395
6396/// Returns an LLVM function to call for updating the next loop using OpenMP
6397/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6398/// the runtime. Always interpret integers as unsigned similarly to
6399/// CanonicalLoopInfo.
6400static FunctionCallee
6402 unsigned Bitwidth = Ty->getIntegerBitWidth();
6403 if (Bitwidth == 32)
6404 return OMPBuilder.getOrCreateRuntimeFunction(
6405 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6406 if (Bitwidth == 64)
6407 return OMPBuilder.getOrCreateRuntimeFunction(
6408 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6409 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6410}
6411
6412/// Returns an LLVM function to call for finalizing the dynamic loop using
6413/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6414/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6415static FunctionCallee
6417 unsigned Bitwidth = Ty->getIntegerBitWidth();
6418 if (Bitwidth == 32)
6419 return OMPBuilder.getOrCreateRuntimeFunction(
6420 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6421 if (Bitwidth == 64)
6422 return OMPBuilder.getOrCreateRuntimeFunction(
6423 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6424 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6425}
6426
6428OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6429 InsertPointTy AllocaIP,
6430 OMPScheduleType SchedType,
6431 bool NeedsBarrier, Value *Chunk) {
6432 assert(CLI->isValid() && "Requires a valid canonical loop");
6433 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6434 "Require dedicated allocate IP");
6436 "Require valid schedule type");
6437
6438 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6439 OMPScheduleType::ModifierOrdered;
6440
6441 // Set up the source location value for OpenMP runtime.
6442 Builder.SetCurrentDebugLocation(DL);
6443
6444 uint32_t SrcLocStrSize;
6445 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6446 Value *SrcLoc =
6447 getOrCreateIdent(SrcLocStr, SrcLocStrSize, OMP_IDENT_FLAG_WORK_LOOP);
6448
6449 // Declare useful OpenMP runtime functions.
6450 Value *IV = CLI->getIndVar();
6451 Type *IVTy = IV->getType();
6452 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6453 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6454
6455 // Allocate space for computed loop bounds as expected by the "init" function.
6456 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6457 Type *I32Type = Type::getInt32Ty(M.getContext());
6458 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6459 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6460 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6461 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6462 CLI->setLastIter(PLastIter);
6463
6464 // At the end of the preheader, prepare for calling the "init" function by
6465 // storing the current loop bounds into the allocated space. A canonical loop
6466 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6467 // and produces an inclusive upper bound.
6468 BasicBlock *PreHeader = CLI->getPreheader();
6469 Builder.SetInsertPoint(PreHeader->getTerminator());
6470 Constant *One = ConstantInt::get(IVTy, 1);
6471 Builder.CreateStore(One, PLowerBound);
6472 Value *UpperBound = CLI->getTripCount();
6473 Builder.CreateStore(UpperBound, PUpperBound);
6474 Builder.CreateStore(One, PStride);
6475
6476 BasicBlock *Header = CLI->getHeader();
6477 BasicBlock *Exit = CLI->getExit();
6478 BasicBlock *Cond = CLI->getCond();
6479 BasicBlock *Latch = CLI->getLatch();
6480 InsertPointTy AfterIP = CLI->getAfterIP();
6481
6482 // The CLI will be "broken" in the code below, as the loop is no longer
6483 // a valid canonical loop.
6484
6485 if (!Chunk)
6486 Chunk = One;
6487
6488 Value *ThreadNum =
6489 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
6490
6491 Constant *SchedulingType =
6492 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6493
6494 // Call the "init" function.
6495 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6496 /* LowerBound */ One, UpperBound,
6497 /* step */ One, Chunk});
6498
6499 // An outer loop around the existing one.
6500 BasicBlock *OuterCond = BasicBlock::Create(
6501 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6502 PreHeader->getParent());
6503 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6504 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6506 DynamicNext,
6507 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6508 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6509 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6510 Value *LowerBound =
6511 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6512 Builder.CreateCondBr(MoreWork, Header, Exit);
6513
6514 // Change PHI-node in loop header to use outer cond rather than preheader,
6515 // and set IV to the LowerBound.
6516 Instruction *Phi = &Header->front();
6517 auto *PI = cast<PHINode>(Phi);
6518 PI->setIncomingBlock(0, OuterCond);
6519 PI->setIncomingValue(0, LowerBound);
6520
6521 // Then set the pre-header to jump to the OuterCond
6522 Instruction *Term = PreHeader->getTerminator();
6523 auto *Br = cast<UncondBrInst>(Term);
6524 Br->setSuccessor(OuterCond);
6525
6526 // Modify the inner condition:
6527 // * Use the UpperBound returned from the DynamicNext call.
6528 // * jump to the loop outer loop when done with one of the inner loops.
6529 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6530 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6531 Instruction *Comp = &*Builder.GetInsertPoint();
6532 auto *CI = cast<CmpInst>(Comp);
6533 CI->setOperand(1, UpperBound);
6534 // Redirect the inner exit to branch to outer condition.
6535 Instruction *Branch = &Cond->back();
6536 auto *BI = cast<CondBrInst>(Branch);
6537 assert(BI->getSuccessor(1) == Exit);
6538 BI->setSuccessor(1, OuterCond);
6539
6540 // Call the "fini" function if "ordered" is present in wsloop directive.
6541 if (Ordered) {
6542 Builder.SetInsertPoint(&Latch->back());
6543 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6544 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6545 }
6546
6547 // Add the barrier if requested.
6548 if (NeedsBarrier) {
6549 Builder.SetInsertPoint(&Exit->back());
6550 InsertPointOrErrorTy BarrierIP =
6552 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6553 /* CheckCancelFlag */ false);
6554 if (!BarrierIP)
6555 return BarrierIP.takeError();
6556 }
6557
6558 CLI->invalidate();
6559 return AfterIP;
6560}
6561
6562/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6563/// after this \p OldTarget will be orphaned. \p DL is forwarded unchanged to
/// redirectTo for every predecessor that gets rewired.
// NOTE(review): the first line of this signature (the function name and the
// OldTarget parameter) is not present in this listing.
6565 BasicBlock *NewTarget, DebugLoc DL) {
 // Early-increment iteration: presumably redirectTo rewrites the terminator of
 // Pred, which would change OldTarget's predecessor list while we walk it.
6566 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6567 redirectTo(Pred, NewTarget, DL);
6568}
6569
// Delete every block of BBs that is not referenced, directly or through
// another kept block of BBs, by an instruction outside of BBs.
// NOTE(review): the signature line is not present in this listing; judging by
// the callers in this file it is presumably removeUnusedBlocksFromParent(BBs).
6571 SmallPtrSet<BasicBlock *, 8> InternalBBs(from_range, BBs);
6572 // We add a block to BBsToKeep iff we have proven it has an external use.
 // NOTE(review): the declaration of BBsToKeep is not visible in this listing.
6574
 // Fixed-point iteration: keeping one block may turn the uses it contains into
 // "external" uses of other blocks, so repeat until a full pass adds nothing.
6575 while (true) {
6576 bool Changed = false;
6577
6578 for (BasicBlock *BB : BBs) {
6579 if (BBsToKeep.contains(BB))
6580 continue;
6581
6582 for (Use &U : BB->uses()) {
 // Only users that are instructions are considered; other users are
 // skipped.
6583 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6584 if (!UseInst)
6585 continue;
6586 BasicBlock *UseBB = UseInst->getParent();
 // The use is external if it sits outside the candidate set or inside a
 // block that has already been proven to stay.
6587 if (!InternalBBs.contains(UseBB) || BBsToKeep.contains(UseBB)) {
6588 BBsToKeep.insert(BB);
6589 Changed = true;
6590 break;
6591 }
6592 }
6593 }
6594
6595 if (!Changed)
6596 break;
6597 }
6598
 // Everything that was not proven to be kept is handed to DeleteDeadBlocks.
 // NOTE(review): the line that starts this statement (declaring BBsToDelete)
 // is not visible in this listing.
6600 BBs, [&BBsToKeep](BasicBlock *BB) { return !BBsToKeep.contains(BB); });
6601 DeleteDeadBlocks(BBsToDelete);
6602}
6603
// Collapse the loop nest \p Loops (outermost first) into a single canonical
// loop whose trip count is the product of the input trip counts, and return
// it. A single input loop is returned unchanged. All input loops are
// invalidated when more than one loop is passed.
// NOTE(review): the signature line carrying the function name and the DL and
// Loops parameters is not present in this listing.
6604CanonicalLoopInfo *
6606 InsertPointTy ComputeIP) {
6607 assert(Loops.size() >= 1 && "At least one loop required");
6608 size_t NumLoops = Loops.size();
6609
6610 // Nothing to do if there is already just one loop.
6611 if (NumLoops == 1)
6612 return Loops.front();
6613
6614 CanonicalLoopInfo *Outermost = Loops.front();
6615 CanonicalLoopInfo *Innermost = Loops.back();
6616 BasicBlock *OrigPreheader = Outermost->getPreheader();
6617 BasicBlock *OrigAfter = Outermost->getAfter();
6618 Function *F = OrigPreheader->getParent();
6619
6620 // Loop control blocks that may become orphaned later.
6621 SmallVector<BasicBlock *, 12> OldControlBBs;
6622 OldControlBBs.reserve(6 * Loops.size());
 // NOTE(review): the loop header that introduces `Loop` for the statement
 // below is not visible in this listing.
6624 Loop->collectControlBlocks(OldControlBBs);
6625
6626 // Setup the IRBuilder for inserting the trip count computation.
 // An explicitly set ComputeIP wins; otherwise the outermost preheader is used.
6627 Builder.SetCurrentDebugLocation(DL);
6628 if (ComputeIP.isSet())
6629 Builder.restoreIP(ComputeIP);
6630 else
6631 Builder.restoreIP(Outermost->getPreheaderIP());
6632
6633 // Derive the collapsed loop's trip count.
6634 // TODO: Find common/largest indvar type.
6635 Value *CollapsedTripCount = nullptr;
6636 for (CanonicalLoopInfo *L : Loops) {
6637 assert(L->isValid() &&
6638 "All loops to collapse must be valid canonical loops");
6639 Value *OrigTripCount = L->getTripCount();
6640 if (!CollapsedTripCount) {
6641 CollapsedTripCount = OrigTripCount;
6642 continue;
6643 }
6644
6645 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
 // The product is emitted with the no-unsigned-wrap flag.
6646 CollapsedTripCount =
6647 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6648 }
6649
6650 // Create the collapsed loop control flow.
6651 CanonicalLoopInfo *Result =
6652 createLoopSkeleton(DL, CollapsedTripCount, F,
6653 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6654
6655 // Build the collapsed loop body code.
6656 // Start with deriving the input loop induction variables from the collapsed
6657 // one, using a divmod scheme. To preserve the original loops' order, the
6658 // innermost loop uses the least significant bits.
6659 Builder.restoreIP(Result->getBodyIP());
6660
6661 Value *Leftover = Result->getIndVar();
6662 SmallVector<Value *> NewIndVars;
6663 NewIndVars.resize(NumLoops);
 // Walk from the innermost loop outwards: each level takes the remainder by
 // its own trip count and passes the quotient on to the next outer level.
6664 for (int i = NumLoops - 1; i >= 1; --i) {
6665 Value *OrigTripCount = Loops[i]->getTripCount();
6666
6667 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6668 NewIndVars[i] = NewIndVar;
6669
6670 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6671 }
6672 // Outermost loop gets all the remaining bits.
6673 NewIndVars[0] = Leftover;
6674
6675 // Construct the loop body control flow.
6676 // We progressively construct the branch structure following in direction of
6677 // the control flow, from the leading in-between code, the loop nest body, the
6678 // trailing in-between code, and rejoining the collapsed loop's latch.
6679 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6680 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6681 // its predecessors as sources.
6682 BasicBlock *ContinueBlock = Result->getBody();
6683 BasicBlock *ContinuePred = nullptr;
6684 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6685 BasicBlock *NextSrc) {
6686 if (ContinueBlock)
6687 redirectTo(ContinueBlock, Dest, DL);
6688 else
6689 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6690
 // After the first call only ContinuePred is used as the edge source.
6691 ContinueBlock = nullptr;
6692 ContinuePred = NextSrc;
6693 };
6694
6695 // The code before the nested loop of each level.
6696 // Because we are sinking it into the nest, it will be executed more often
6697 // than the original loop. More sophisticated schemes could keep track of what
6698 // the in-between code is and instantiate it only once per thread.
6699 for (size_t i = 0; i < NumLoops - 1; ++i)
6700 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6701
6702 // Connect the loop nest body.
6703 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6704
6705 // The code after the nested loop at each level.
6706 for (size_t i = NumLoops - 1; i > 0; --i)
6707 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6708
6709 // Connect the finished loop to the collapsed loop latch.
6710 ContinueWith(Result->getLatch(), nullptr);
6711
6712 // Replace the input loops with the new collapsed loop.
6713 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6714 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6715
6716 // Replace the input loop indvars with the derived ones.
6717 for (size_t i = 0; i < NumLoops; ++i)
6718 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6719
6720 // Remove unused parts of the input loops.
6721 removeUnusedBlocksFromParent(OldControlBBs);
6722
6723 for (CanonicalLoopInfo *L : Loops)
6724 L->invalidate();
6725
6726#ifndef NDEBUG
6727 Result->assertOK();
6728#endif
6729 return Result;
6730}
6731
// Tile the loop nest \p Loops (outermost first) using one tile size per loop.
// The returned vector holds the generated "floor" loops followed by the
// generated "tile" loops, i.e. twice as many loops as were passed in. The
// input loops are invalidated.
// NOTE(review): the signature line carrying the function name and the DL and
// Loops parameters is not present in this listing.
6732std::vector<CanonicalLoopInfo *>
6734 ArrayRef<Value *> TileSizes) {
6735 assert(TileSizes.size() == Loops.size() &&
6736 "Must pass as many tile sizes as there are loops");
6737 int NumLoops = Loops.size();
6738 assert(NumLoops >= 1 && "At least one loop to tile required");
6739
6740 CanonicalLoopInfo *OutermostLoop = Loops.front();
6741 CanonicalLoopInfo *InnermostLoop = Loops.back();
6742 Function *F = OutermostLoop->getBody()->getParent();
6743 BasicBlock *InnerEnter = InnermostLoop->getBody();
6744 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6745
6746 // Loop control blocks that may become orphaned later.
6747 SmallVector<BasicBlock *, 12> OldControlBBs;
6748 OldControlBBs.reserve(6 * Loops.size());
 // NOTE(review): the loop header that introduces `Loop` for the statement
 // below is not visible in this listing.
6750 Loop->collectControlBlocks(OldControlBBs);
6751
6752 // Collect original trip counts and induction variable to be accessible by
6753 // index. Also, the structure of the original loops is not preserved during
6754 // the construction of the tiled loops, so do it before we scavenge the BBs of
6755 // any original CanonicalLoopInfo.
6756 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6757 for (CanonicalLoopInfo *L : Loops) {
6758 assert(L->isValid() && "All input loops must be valid canonical loops");
6759 OrigTripCounts.push_back(L->getTripCount());
6760 OrigIndVars.push_back(L->getIndVar());
6761 }
6762
6763 // Collect the code between loop headers. These may contain SSA definitions
6764 // that are used in the loop nest body. To be usable within the innermost
6765 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6766 // these instructions may be executed more often than before the tiling.
6767 // TODO: It would be sufficient to only sink them into body of the
6768 // corresponding tile loop.
 // NOTE(review): the declaration of InbetweenCode is not visible in this
 // listing; it is later iterated as pairs of (enter block, exit block).
6770 for (int i = 0; i < NumLoops - 1; ++i) {
6771 CanonicalLoopInfo *Surrounding = Loops[i];
6772 CanonicalLoopInfo *Nested = Loops[i + 1];
6773
6774 BasicBlock *EnterBB = Surrounding->getBody();
6775 BasicBlock *ExitBB = Nested->getHeader();
6776 InbetweenCode.emplace_back(EnterBB, ExitBB);
6777 }
6778
6779 // Compute the trip counts of the floor loops.
6780 Builder.SetCurrentDebugLocation(DL);
6781 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6782 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6783 for (int i = 0; i < NumLoops; ++i) {
6784 Value *TileSize = TileSizes[i];
6785 Value *OrigTripCount = OrigTripCounts[i];
6786 Type *IVType = OrigTripCount->getType();
6787
6788 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6789 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6790
6791 // 0 if the tilesize divides the tripcount evenly, 1 otherwise.
6792 // 1 means we need an additional iteration for a partial tile.
6793 //
6794 // Unfortunately we cannot just use the roundup-formula
6795 // (tripcount + tilesize - 1)/tilesize
6796 // because the summation might overflow. We do not want to introduce undefined
6797 // behavior when the untiled loop nest did not.
6798 Value *FloorTripOverflow =
6799 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6800
6801 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6802 Value *FloorTripCount =
6803 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6804 "omp_floor" + Twine(i) + ".tripcount", true);
6805
6806 // Remember some values for later use.
6807 FloorCompleteCount.push_back(FloorCompleteTripCount);
6808 FloorCount.push_back(FloorTripCount);
6809 FloorRems.push_back(FloorTripRem);
6810 }
6811
6812 // Generate the new loop nest, from the outermost to the innermost.
6813 std::vector<CanonicalLoopInfo *> Result;
6814 Result.reserve(NumLoops * 2);
6815
6816 // The basic block of the surrounding loop that enters the generated loop
6817 // nest.
6818 BasicBlock *Enter = OutermostLoop->getPreheader();
6819
6820 // The basic block of the surrounding loop where the inner code should
6821 // continue.
6822 BasicBlock *Continue = OutermostLoop->getAfter();
6823
6824 // Where the next loop basic block should be inserted.
6825 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6826
 // Creates one loop skeleton, wires it between the current Enter and Continue
 // blocks, and then moves Enter/Continue/OutroInsertBefore into the new loop
 // so that the next call nests inside it.
6827 auto EmbeddNewLoop =
6828 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6829 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6830 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6831 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6832 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6833 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6834
6835 // Setup the position where the next embedded loop connects to this loop.
6836 Enter = EmbeddedLoop->getBody();
6837 Continue = EmbeddedLoop->getLatch();
6838 OutroInsertBefore = EmbeddedLoop->getLatch();
6839 return EmbeddedLoop;
6840 };
6841
 // Embeds one loop per trip count, named NameBase followed by its index, and
 // appends each to Result.
6842 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6843 const Twine &NameBase) {
6844 for (auto P : enumerate(TripCounts)) {
6845 CanonicalLoopInfo *EmbeddedLoop =
6846 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6847 Result.push_back(EmbeddedLoop);
6848 }
6849 };
6850
6851 EmbeddNewLoops(FloorCount, "floor");
6852
6853 // Within the innermost floor loop, emit the code that computes the tile
6854 // sizes.
6855 Builder.SetInsertPoint(Enter->getTerminator());
6856 SmallVector<Value *, 4> TileCounts;
6857 for (int i = 0; i < NumLoops; ++i) {
6858 CanonicalLoopInfo *FloorLoop = Result[i];
6859 Value *TileSize = TileSizes[i];
6860
 // The floor iteration whose index equals the number of complete tiles is
 // the partial one; it runs only the remainder instead of a full tile.
6861 Value *FloorIsEpilogue =
6862 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6863 Value *TileTripCount =
6864 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6865
6866 TileCounts.push_back(TileTripCount);
6867 }
6868
6869 // Create the tile loops.
6870 EmbeddNewLoops(TileCounts, "tile");
6871
6872 // Insert the inbetween code into the body.
6873 BasicBlock *BodyEnter = Enter;
6874 BasicBlock *BodyEntered = nullptr;
6875 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6876 BasicBlock *EnterBB = P.first;
6877 BasicBlock *ExitBB = P.second;
6878
6879 if (BodyEnter)
6880 redirectTo(BodyEnter, EnterBB, DL);
6881 else
6882 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6883
6884 BodyEnter = nullptr;
6885 BodyEntered = ExitBB;
6886 }
6887
6888 // Append the original loop nest body into the generated loop nest body.
6889 if (BodyEnter)
6890 redirectTo(BodyEnter, InnerEnter, DL);
6891 else
6892 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
 // NOTE(review): the listing numbering skips a line here, and InnerLatch has
 // no other visible use; presumably the missing statement reconnects the
 // original innermost latch to Continue - confirm against the real source.
6894
6895 // Replace the original induction variable with an induction variable computed
6896 // from the tile and floor induction variables.
6897 Builder.restoreIP(Result.back()->getBodyIP());
6898 for (int i = 0; i < NumLoops; ++i) {
6899 CanonicalLoopInfo *FloorLoop = Result[i];
6900 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6901 Value *OrigIndVar = OrigIndVars[i];
6902 Value *Size = TileSizes[i];
6903
 // original index = tile size * floor index + tile index
6904 Value *Scale =
6905 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6906 Value *Shift =
6907 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6908 OrigIndVar->replaceAllUsesWith(Shift);
6909 }
6910
6911 // Remove unused parts of the original loops.
6912 removeUnusedBlocksFromParent(OldControlBBs);
6913
6914 for (CanonicalLoopInfo *L : Loops)
6915 L->invalidate();
6916
6917#ifndef NDEBUG
6918 for (CanonicalLoopInfo *GenL : Result)
6919 GenL->assertOK();
6920#endif
6921 return Result;
6922}
6923
6924/// Attach metadata \p Properties to the basic block described by \p BB. If the
6925/// basic block already has metadata, the basic block properties are appended.
/// The node is stored under the MD_loop kind on the terminator of \p BB.
// NOTE(review): the first line of this signature (function name and the BB
// parameter) is not present in this listing.
6927 ArrayRef<Metadata *> Properties) {
6928 // Nothing to do if no property to attach.
6929 if (Properties.empty())
6930 return;
6931
6932 LLVMContext &Ctx = BB->getContext();
6933 SmallVector<Metadata *> NewProperties;
 // Placeholder for operand 0; it is replaced by a reference to the node itself
 // once the node has been created below.
6934 NewProperties.push_back(nullptr);
6935
6936 // If the basic block already has metadata, prepend it to the new metadata.
 // The first operand of the existing node is dropped (presumably its own
 // self-reference), since the new node gets a fresh one.
6937 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6938 if (Existing)
6939 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6940
6941 append_range(NewProperties, Properties);
6942 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6943 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6944
6945 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6946}
6947
6948/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6949/// loop already has metadata, the loop properties are appended. The metadata
/// is attached via the loop's latch block.
// NOTE(review): the first line of this signature (function name and the Loop
// parameter) is not present in this listing.
6951 ArrayRef<Metadata *> Properties) {
6952 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6953
6954 // Attach metadata to the loop's latch
6955 BasicBlock *Latch = Loop->getLatch();
6956 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6957 addBasicBlockMetadata(Latch, Properties);
6958}
6959
6960/// Attach llvm.access.group metadata to the memref instructions of \p Block
/// Every instruction that may read or write memory gets \p AccessGroup set as
/// its MD_access_group metadata, replacing any access group it already had.
// NOTE(review): the first line of this signature (function name and the Block
// and AccessGroup parameters) is not present in this listing. The LI parameter
// is not used in the visible body.
6962 LoopInfo &LI) {
6963 for (Instruction &I : *Block) {
6964 if (I.mayReadOrWriteMemory()) {
6965 // TODO: This instruction may already have access group from
6966 // other pragmas e.g. #pragma clang loop vectorize. Append
6967 // so that the existing metadata is not overwritten.
6968 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6969 }
6970 }
6971}
6972
// Fuse the sequence of loops in \p Loops into one canonical loop that runs
// from 0 to the largest of the input trip counts. Each original body is
// guarded by a check that the fused induction variable is still below that
// loop's own trip count. The input loops are invalidated and the fused loop is
// returned.
// NOTE(review): the signature line (function name and parameters) is not
// present in this listing.
6973CanonicalLoopInfo *
 // NOTE(review): front()/back() and the `Loops.size() - 1` bounds further down
 // assume Loops is non-empty; there is no visible assert for that.
6975 CanonicalLoopInfo *firstLoop = Loops.front();
6976 CanonicalLoopInfo *lastLoop = Loops.back();
6977 Function *F = firstLoop->getPreheader()->getParent();
6978
6979 // Loop control blocks that will become orphaned later
6980 SmallVector<BasicBlock *> oldControlBBs;
 // NOTE(review): the loop header that introduces `Loop` for the statement
 // below is not visible in this listing.
6982 Loop->collectControlBlocks(oldControlBBs);
6983
6984 // Collect original trip counts
6985 SmallVector<Value *> origTripCounts;
6986 for (CanonicalLoopInfo *L : Loops) {
6987 assert(L->isValid() && "All input loops must be valid canonical loops");
6988 origTripCounts.push_back(L->getTripCount());
6989 }
6990
6991 Builder.SetCurrentDebugLocation(DL);
6992
6993 // Compute max trip count.
6994 // The fused loop will be from 0 to max(origTripCounts)
6995 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6996 F, firstLoop->getHeader());
6997 Builder.SetInsertPoint(TCBlock);
6998 Value *fusedTripCount = nullptr;
6999 for (CanonicalLoopInfo *L : Loops) {
7000 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
7001 Value *origTripCount = L->getTripCount();
7002 if (!fusedTripCount) {
7003 fusedTripCount = origTripCount;
7004 continue;
7005 }
 // NOTE(review): this is a signed comparison, whereas the tiling code in
 // this file divides trip counts with unsigned operations - confirm that
 // signed is intended for trip counts here.
7006 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
7007 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
7008 ".omp.fuse.tc");
7009 }
7010
7011 // Generate new loop
7012 CanonicalLoopInfo *fused =
7013 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
7014 lastLoop->getLatch(), "fused");
7015
7016 // Replace original loops with the fused loop
7017 // Preheader and After are not considered inside the CLI.
7018 // These are used to compute the individual TCs of the loops
7019 // so they have to be put before the resulting fused loop.
7020 // Moving them up for readability.
7021 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7022 Loops[i]->getPreheader()->moveBefore(TCBlock);
7023 Loops[i]->getAfter()->moveBefore(TCBlock);
7024 }
7025 lastLoop->getPreheader()->moveBefore(TCBlock);
7026
 // Chain the blocks: preheader(i) -> after(i) -> preheader(i+1) -> ... ->
 // preheader(last) -> TCBlock -> fused preheader; the fused loop then exits
 // into the last loop's after block.
7027 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7028 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
7029 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
7030 }
7031 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
7032 redirectTo(TCBlock, fused->getPreheader(), DL);
7033 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
7034
7035 // Build the fused body
7036 // Create new Blocks with conditions that jump to the original loop bodies
 // NOTE(review): the declaration of condBBs is not visible in this listing.
7038 SmallVector<Value *> condValues;
7039 for (size_t i = 0; i < Loops.size(); ++i) {
7040 BasicBlock *condBlock = BasicBlock::Create(
7041 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
7042 Builder.SetInsertPoint(condBlock);
 // NOTE(review): signed comparison of the fused IV against the trip count;
 // same signedness question as for the max computation above.
7043 Value *condValue =
7044 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
7045 condBBs.push_back(condBlock);
7046 condValues.push_back(condValue);
7047 }
7048 // Join the condition blocks with the bodies of the original loops
7049 redirectTo(fused->getBody(), condBBs[0], DL);
7050 for (size_t i = 0; i < Loops.size() - 1; ++i) {
7051 Builder.SetInsertPoint(condBBs[i]);
7052 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
7053 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
7054 // Replace the IV with the fused IV
7055 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7056 }
7057 // Last body jumps to the created end body block
7058 Builder.SetInsertPoint(condBBs.back());
7059 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
7060 fused->getLatch());
7061 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
7062 // Replace the IV with the fused IV
7063 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
7064
7065 // The loop latch must have only one predecessor. Currently it is branched to
7066 // from both the last condition block and the last loop body
7067 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
7068 "omp.fused.pre_latch");
7069
7070 // Remove unused parts
7071 removeUnusedBlocksFromParent(oldControlBBs);
7072
7073 // Invalidate old CLIs
7074 for (CanonicalLoopInfo *L : Loops)
7075 L->invalidate();
7076
7077#ifndef NDEBUG
7078 fused->assertOK();
7079#endif
7080 return fused;
7081}
7082
// Builds two metadata nodes, "llvm.loop.unroll.enable" and
// "llvm.loop.unroll.full", and passes them together with Loop to a call.
// NOTE(review): neither the signature line nor the line naming the callee is
// present in this listing; the callee is presumably addLoopMetadata.
7084 LLVMContext &Ctx = Builder.getContext();
7086 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7087 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
7088}
7089
// Builds a single "llvm.loop.unroll.enable" metadata node and passes it
// together with Loop to a call; no unroll factor is specified here.
// NOTE(review): neither the signature line nor the line naming the callee is
// present in this listing; the callee is presumably addLoopMetadata.
7091 LLVMContext &Ctx = Builder.getContext();
7093 Loop, {
7094 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7095 });
7096}
7097
// Version the body of \p CanonicalLoop on \p IfCond: inside the loop body a
// conditional branch selects between the original body blocks ("then") and a
// clone of them ("else"). \p VMap receives the original-to-clone block
// mapping, \p L / \p LI describe the loop being versioned, and \p NamePrefix
// is used for the names of the new blocks. \p LIA is not used in the visible
// body.
7098void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
7099 Value *IfCond, ValueToValueMapTy &VMap,
7100 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
7101 const Twine &NamePrefix) {
7102 Function *F = CanonicalLoop->getFunction();
7103
7104 // We can't do
7105 // if (cond) {
7106 // simd_loop;
7107 // } else {
7108 // non_simd_loop;
7109 // }
7110 // because then the CanonicalLoopInfo would only point to one of the loops:
7111 // leading to other constructs operating on the same loop to malfunction.
7112 // Instead generate
7113 // while (...) {
7114 // if (cond) {
7115 // simd_body;
7116 // } else {
7117 // not_simd_body;
7118 // }
7119 // }
7120 // At least for simple loops, LLVM seems able to hoist the if out of the loop
7121 // body at -O3
7122
7123 // Define where if branch should be inserted
7124 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
7125
7126 // Create additional blocks for the if statement
7127 BasicBlock *Cond = SplitBeforeIt->getParent();
7128 llvm::LLVMContext &C = Cond->getContext();
 // NOTE(review): the lines that start the two statements below (declaring
 // ThenBlock and ElseBlock) are not visible in this listing.
7130 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
7132 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
7133
7134 // Create if condition branch.
7135 Builder.SetInsertPoint(SplitBeforeIt);
7136 Instruction *BrInstr =
7137 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
 // Everything that follows the new branch in Cond is moved into ThenBlock.
7138 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
7139 // Then block contains branch to omp loop body which needs to be vectorized
7140 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
7141 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
7142
7143 Builder.SetInsertPoint(ElseBlock);
7144
7145 // Clone loop for the else branch
 // NOTE(review): the declaration of NewBlocks is not visible in this listing.
7147
7148 SmallVector<BasicBlock *, 8> ExistingBlocks;
7149 ExistingBlocks.reserve(L->getNumBlocks() + 1);
7150 ExistingBlocks.push_back(ThenBlock);
7151 ExistingBlocks.append(L->block_begin(), L->block_end());
7152 // Cond is the block that has the if clause condition
7153 // LoopCond is omp_loop.cond
7154 // LoopHeader is omp_loop.header
 // NOTE(review): LoopCond is dereferenced on the second line below before the
 // assert that checks it for null; the assert cannot catch a null LoopCond.
7155 BasicBlock *LoopCond = Cond->getUniquePredecessor();
7156 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
7157 assert(LoopCond && LoopHeader && "Invalid loop structure");
 // Clone only the body blocks; the loop's control blocks (preheader, latch,
 // header, cond) and the block holding the new branch stay shared.
7158 for (BasicBlock *Block : ExistingBlocks) {
7159 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
7160 Block == LoopHeader || Block == LoopCond || Block == Cond) {
7161 continue;
7162 }
7163 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
7164
7165 // fix name not to be omp.if.then
7166 if (Block == ThenBlock)
7167 NewBB->setName(NamePrefix + ".if.else");
7168
7169 NewBB->moveBefore(CanonicalLoop->getExit());
7170 VMap[Block] = NewBB;
7171 NewBlocks.push_back(NewBB);
7172 }
7173 remapInstructionsInBlocks(NewBlocks, VMap);
 // ElseBlock (the current insert point) falls through to the first clone,
 // which is the clone of ThenBlock since ThenBlock was pushed first.
7174 Builder.CreateBr(NewBlocks.front());
7175
7176 // The loop latch must have only one predecessor. Currently it is branched to
7177 // from both the 'then' and 'else' branches.
7178 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
7179 NamePrefix + ".pre_latch");
7180
7181 // Ensure that the then block is added to the loop so we add the attributes in
7182 // the next step
7183 L->addBasicBlockToLoop(ThenBlock, LI);
7184}
7185
// Return a default SIMD alignment for the given target triple and feature
// map: for x86, 512 with "avx512f", 256 with "avx", otherwise 128; 128 for
// PPC and Wasm; 0 for any other target.
// NOTE(review): the unit is presumably bits - confirm against the callers.
// The signature line carrying the function name and the TargetTriple
// parameter is not present in this listing.
7186unsigned
7188 const StringMap<bool> &Features) {
7189 if (TargetTriple.isX86()) {
 // Checked from the widest vector extension down.
7190 if (Features.lookup("avx512f"))
7191 return 512;
7192 else if (Features.lookup("avx"))
7193 return 256;
7194 return 128;
7195 }
7196 if (TargetTriple.isPPC())
7197 return 128;
7198 if (TargetTriple.isWasm())
7199 return 128;
7200 return 0;
7201}
7202
// Annotate \p CanonicalLoop for vectorization: emit alignment assumptions for
// \p AlignedVars, version the body on \p IfCond when one is given, and attach
// loop metadata (parallel accesses, vectorize.enable, vectorize.width) derived
// from \p Order, \p Simdlen and \p Safelen.
// NOTE(review): the first line of this signature (function name and the
// CanonicalLoop parameter) is not present in this listing.
7204 MapVector<Value *, Value *> AlignedVars,
7205 Value *IfCond, OrderKind Order,
7206 ConstantInt *Simdlen, ConstantInt *Safelen) {
7207 LLVMContext &Ctx = Builder.getContext();
7208
7209 Function *F = CanonicalLoop->getFunction();
7210
7211 // Blocks must have terminators.
7212 // FIXME: Don't run analyses on incomplete/invalid IR.
 // Temporary unreachable terminators are added here and erased again right
 // after the loop analysis has run.
 // NOTE(review): the declaration of UIs is not visible in this listing.
7214 for (BasicBlock &BB : *F)
7215 if (!BB.hasTerminator())
7216 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7217
7218 // TODO: We should not rely on pass manager. Currently we use pass manager
7219 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
7220 // object. We should have a method which returns all blocks between
7221 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
 // NOTE(review): the declaration of FAM is not visible in this listing.
7223 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7224 FAM.registerPass([]() { return LoopAnalysis(); });
7225 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7226
7227 LoopAnalysis LIA;
7228 LoopInfo &&LI = LIA.run(*F, FAM);
7229
7230 for (Instruction *I : UIs)
7231 I->eraseFromParent();
7232
7233 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
7234 if (AlignedVars.size()) {
 // The builder's insert point is saved and restored around the emission of
 // the alignment assumptions.
7235 InsertPointTy IP = Builder.saveIP();
7236 for (auto &AlignedItem : AlignedVars) {
7237 Value *AlignedPtr = AlignedItem.first;
7238 Value *Alignment = AlignedItem.second;
 // NOTE(review): dyn_cast yields null when AlignedPtr is not an
 // Instruction, yet loadInst is dereferenced unconditionally on the next
 // line - confirm that callers always pass an instruction, or use cast<>.
7239 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
7240 Builder.SetInsertPoint(loadInst->getNextNode());
7241 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
7242 Alignment);
7243 }
7244 Builder.restoreIP(IP);
7245 }
7246
7247 if (IfCond) {
7248 ValueToValueMapTy VMap;
7249 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
7250 }
7251
 // NOTE(review): the declaration of Reachable is not visible in this listing,
 // and the set has no visible reader after it is filled below.
7253
7254 // Get the basic blocks from the loop in which memref instructions
7255 // can be found.
7256 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
7257 // preferably without running any passes.
7258 for (BasicBlock *Block : L->getBlocks()) {
7259 if (Block == CanonicalLoop->getCond() ||
7260 Block == CanonicalLoop->getHeader())
7261 continue;
7262 Reachable.insert(Block);
7263 }
7264
7265 SmallVector<Metadata *> LoopMDList;
7266
7267 // In presence of finite 'safelen', it may be unsafe to mark all
7268 // the memory instructions parallel, because loop-carried
7269 // dependences of 'safelen' iterations are possible.
7270 // If clause order(concurrent) is specified then the memory instructions
7271 // are marked parallel even if 'safelen' is finite.
7272 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
7273 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
7274
7275 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
7276 // versions so we can't add the loop attributes in that case.
7277 if (IfCond) {
7278 // we can still add llvm.loop.parallel_access
7279 addLoopMetadata(CanonicalLoop, LoopMDList);
7280 return;
7281 }
7282
7283 // Use the above access group metadata to create loop level
7284 // metadata, which should be distinct for each loop.
 // NOTE(review): the initializer of BoolConst is not visible in this listing.
7285 ConstantAsMetadata *BoolConst =
7287 LoopMDList.push_back(MDNode::get(
7288 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7289
7290 if (Simdlen || Safelen) {
7291 // If both simdlen and safelen clauses are specified, the value of the
7292 // simdlen parameter must be less than or equal to the value of the safelen
7293 // parameter. Therefore, use safelen only in the absence of simdlen.
7294 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7295 LoopMDList.push_back(
7296 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7297 ConstantAsMetadata::get(VectorizeWidth)}));
7298 }
7299
7300 addLoopMetadata(CanonicalLoop, LoopMDList);
7301}
7302
7303/// Create the TargetMachine object to query the backend for optimization
7304/// preferences.
7305///
7306/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7307/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7308/// needed for the LLVM pass pipeline. We use some default options to avoid
7309/// having to pass too many settings from the frontend that probably do not
7310/// matter.
7311///
7312/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7313/// method. If we are going to use TargetMachine for more purposes, especially
7314/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7315/// might become worth requiring front-ends to pass on their TargetMachine,
7316/// or at least cache it between methods. Note that while frontends such as Clang
7317/// have just a single main TargetMachine per translation unit, "target-cpu" and
7318/// "target-features" that determine the TargetMachine are per-function and can
7319/// be overridden using __attribute__((target("OPTIONS"))).
///
/// Returns an empty pointer when no target is found for the module's triple.
// NOTE(review): the signature line (function name and parameters; the body
// uses F and OptLevel) is not present in this listing.
7320static std::unique_ptr<TargetMachine>
7322 Module *M = F->getParent();
7323
 // CPU and features come from the function's own attributes; the triple comes
 // from the enclosing module.
7324 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7325 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7326 const llvm::Triple &Triple = M->getTargetTriple();
7327
7328 std::string Error;
 // NOTE(review): the line that initializes TheTarget is not visible in this
 // listing; presumably a target lookup that reports failures via Error.
7330 if (!TheTarget)
7331 return {};
7332
 // NOTE(review): the declaration of Options is not visible in this listing.
7334 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7335 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7336 /*CodeModel=*/std::nullopt, OptLevel));
7337}
7338
7339/// Heuristically determine the best-performant unroll factor for \p CLI. This
7340/// depends on the target processor. We are re-using the same heuristics as the
7341/// LoopUnrollPass.
7343 Function *F = CLI->getFunction();
7344
7345 // Assume the user requests the most aggressive unrolling, even if the rest of
7346 // the code is optimized using a lower setting.
7348 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7349
7350 // Blocks must have terminators.
7351 // FIXME: Don't run analyses on incomplete/invalid IR.
7353 for (BasicBlock &BB : *F)
7354 if (!BB.hasTerminator())
7355 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7356
7358 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7359 FAM.registerPass([]() { return AssumptionAnalysis(); });
7360 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7361 FAM.registerPass([]() { return LoopAnalysis(); });
7362 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7363 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7364 TargetIRAnalysis TIRA;
7365 if (TM)
7366 TIRA = TargetIRAnalysis(
7367 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7368 FAM.registerPass([&]() { return TIRA; });
7369
7370 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7372 ScalarEvolution &&SE = SEA.run(*F, FAM);
7374 DominatorTree &&DT = DTA.run(*F, FAM);
7375 LoopAnalysis LIA;
7376 LoopInfo &&LI = LIA.run(*F, FAM);
7378 AssumptionCache &&AC = ACT.run(*F, FAM);
7380
7381 for (Instruction *I : UIs)
7382 I->eraseFromParent();
7383
7384 Loop *L = LI.getLoopFor(CLI->getHeader());
7385 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7386
7388 L, SE, TTI,
7389 /*BlockFrequencyInfo=*/nullptr,
7390 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7391 /*UserThreshold=*/std::nullopt,
7392 /*UserCount=*/std::nullopt,
7393 /*UserAllowPartial=*/true,
7394 /*UserAllowRuntime=*/true,
7395 /*UserUpperBound=*/std::nullopt,
7396 /*UserFullUnrollMaxCount=*/std::nullopt);
7397
7398 UP.Force = true;
7399
7400 // Account for additional optimizations taking place before the LoopUnrollPass
7401 // would unroll the loop.
7404
7405 // Use normal unroll factors even if the rest of the code is optimized for
7406 // size.
7409
7410 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7411 << " Threshold=" << UP.Threshold << "\n"
7412 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7413 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7414 << " PartialOptSizeThreshold="
7415 << UP.PartialOptSizeThreshold << "\n");
7416
7417 // Disable peeling.
7420 /*UserAllowPeeling=*/false,
7421 /*UserAllowProfileBasedPeeling=*/false,
7422 /*UnrollingSpecficValues=*/false);
7423
7425 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7426
7427 // Assume that reads and writes to stack variables can be eliminated by
7428 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7429 // size.
7430 for (BasicBlock *BB : L->blocks()) {
7431 for (Instruction &I : *BB) {
7432 Value *Ptr;
7433 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7434 Ptr = Load->getPointerOperand();
7435 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7436 Ptr = Store->getPointerOperand();
7437 } else
7438 continue;
7439
7440 Ptr = Ptr->stripPointerCasts();
7441
7442 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7443 if (Alloca->getParent() == &F->getEntryBlock())
7444 EphValues.insert(&I);
7445 }
7446 }
7447 }
7448
7449 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7450
7451 // Loop is not unrollable if the loop contains certain instructions.
7452 if (!UCE.canUnroll()) {
7453 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7454 return 1;
7455 }
7456
7457 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7458 << "\n");
7459
7460 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7461 // be able to use it.
7462 int TripCount = 0;
7463 int MaxTripCount = 0;
7464 bool MaxOrZero = false;
7465 unsigned TripMultiple = 0;
7466
7467 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7468 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7469 unsigned Factor = UP.Count;
7470 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7471
7472 // This function returns 1 to signal to not unroll a loop.
7473 if (Factor == 0)
7474 return 1;
7475 return Factor;
7476}
7477
7479 int32_t Factor,
7480 CanonicalLoopInfo **UnrolledCLI) {
7481 assert(Factor >= 0 && "Unroll factor must not be negative");
7482
7483 Function *F = Loop->getFunction();
7484 LLVMContext &Ctx = F->getContext();
7485
7486 // If the unrolled loop is not used for another loop-associated directive, it
7487 // is sufficient to add metadata for the LoopUnrollPass.
7488 if (!UnrolledCLI) {
7489 SmallVector<Metadata *, 2> LoopMetadata;
7490 LoopMetadata.push_back(
7491 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7492
7493 if (Factor >= 1) {
7495 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7496 LoopMetadata.push_back(MDNode::get(
7497 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7498 }
7499
7500 addLoopMetadata(Loop, LoopMetadata);
7501 return;
7502 }
7503
7504 // Heuristically determine the unroll factor.
7505 if (Factor == 0)
7507
7508 // No change required with unroll factor 1.
7509 if (Factor == 1) {
7510 *UnrolledCLI = Loop;
7511 return;
7512 }
7513
7514 assert(Factor >= 2 &&
7515 "unrolling only makes sense with a factor of 2 or larger");
7516
7517 Type *IndVarTy = Loop->getIndVarType();
7518
7519 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7520 // unroll the inner loop.
7521 Value *FactorVal =
7522 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7523 /*isSigned=*/false));
7524 std::vector<CanonicalLoopInfo *> LoopNest =
7525 tileLoops(DL, {Loop}, {FactorVal});
7526 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7527 *UnrolledCLI = LoopNest[0];
7528 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7529
7530 // LoopUnrollPass can only fully unroll loops with constant trip count.
7531 // Unroll by the unroll factor with a fallback epilog for the remainder
7532 // iterations if necessary.
7534 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7536 InnerLoop,
7537 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7539 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7540
7541#ifndef NDEBUG
7542 (*UnrolledCLI)->assertOK();
7543#endif
7544}
7545
7548 llvm::Value *BufSize, llvm::Value *CpyBuf,
7549 llvm::Value *CpyFn, llvm::Value *DidIt) {
7550 if (!updateToLocation(Loc))
7551 return Loc.IP;
7552
7553 uint32_t SrcLocStrSize;
7554 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7555 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7556 Value *ThreadId = getOrCreateThreadID(Ident);
7557
7558 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7559
7560 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7561
7562 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7563 createRuntimeFunctionCall(Fn, Args);
7564
7565 return Builder.saveIP();
7566}
7567
7569 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7570 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7572
7573 if (!updateToLocation(Loc))
7574 return Loc.IP;
7575
7576 // If needed allocate and initialize `DidIt` with 0.
7577 // DidIt: flag variable: 1=single thread; 0=not single thread.
7578 llvm::Value *DidIt = nullptr;
7579 if (!CPVars.empty()) {
7580 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7581 Builder.CreateStore(Builder.getInt32(0), DidIt);
7582 }
7583
7584 Directive OMPD = Directive::OMPD_single;
7585 uint32_t SrcLocStrSize;
7586 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7587 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7588 Value *ThreadId = getOrCreateThreadID(Ident);
7589 Value *Args[] = {Ident, ThreadId};
7590
7591 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7592 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7593
7594 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7595 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7596
7597 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7598 if (Error Err = FiniCB(IP))
7599 return Err;
7600
7601 // The thread that executes the single region must set `DidIt` to 1.
7602 // This is used by __kmpc_copyprivate, to know if the caller is the
7603 // single thread or not.
7604 if (DidIt)
7605 Builder.CreateStore(Builder.getInt32(1), DidIt);
7606
7607 return Error::success();
7608 };
7609
7610 // generates the following:
7611 // if (__kmpc_single()) {
7612 // .... single region ...
7613 // __kmpc_end_single
7614 // }
7615 // __kmpc_copyprivate
7616 // __kmpc_barrier
7617
7618 InsertPointOrErrorTy AfterIP =
7619 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7620 /*Conditional*/ true,
7621 /*hasFinalize*/ true);
7622 if (!AfterIP)
7623 return AfterIP.takeError();
7624
7625 if (DidIt) {
7626 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7627 // NOTE BufSize is currently unused, so just pass 0.
7629 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7630 CPFuncs[I], DidIt);
7631 // NOTE __kmpc_copyprivate already inserts a barrier
7632 } else if (!IsNowait) {
7633 InsertPointOrErrorTy AfterIP =
7635 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7636 /* CheckCancelFlag */ false);
7637 if (!AfterIP)
7638 return AfterIP.takeError();
7639 }
7640 return Builder.saveIP();
7641}
7642
7645 BodyGenCallbackTy BodyGenCB,
7646 FinalizeCallbackTy FiniCB, bool IsNowait) {
7647
7648 if (!updateToLocation(Loc))
7649 return Loc.IP;
7650
7651 // All threads execute the scope body — no conditional entry.
7652 InsertPointOrErrorTy AfterIP = EmitOMPInlinedRegion(
7653 Directive::OMPD_scope, /*EntryCall=*/nullptr, /*ExitCall=*/nullptr,
7654 BodyGenCB, FiniCB, /*Conditional=*/false, /*HasFinalize=*/true,
7655 /*IsCancellable=*/false);
7656 if (!AfterIP)
7657 return AfterIP.takeError();
7658
7659 Builder.restoreIP(*AfterIP);
7660 if (!IsNowait) {
7661 AfterIP = createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7662 omp::Directive::OMPD_unknown,
7663 /*ForceSimpleCall=*/false,
7664 /*CheckCancelFlag=*/false);
7665 if (!AfterIP)
7666 return AfterIP.takeError();
7667 }
7668 return Builder.saveIP();
7669}
7670
7672 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7673 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7674
7675 if (!updateToLocation(Loc))
7676 return Loc.IP;
7677
7678 Directive OMPD = Directive::OMPD_critical;
7679 uint32_t SrcLocStrSize;
7680 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7681 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7682 Value *ThreadId = getOrCreateThreadID(Ident);
7683 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7684 Value *Args[] = {Ident, ThreadId, LockVar};
7685
7686 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7687 Function *RTFn = nullptr;
7688 if (HintInst) {
7689 // Add Hint to entry Args and create call
7690 EnterArgs.push_back(HintInst);
7691 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7692 } else {
7693 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7694 }
7695 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7696
7697 Function *ExitRTLFn =
7698 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7699 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7700
7701 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7702 /*Conditional*/ false, /*hasFinalize*/ true);
7703}
7704
7707 InsertPointTy AllocaIP, unsigned NumLoops,
7708 ArrayRef<llvm::Value *> StoreValues,
7709 const Twine &Name, bool IsDependSource) {
7710 assert(
7711 llvm::all_of(StoreValues,
7712 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7713 "OpenMP runtime requires depend vec with i64 type");
7714
7715 if (!updateToLocation(Loc))
7716 return Loc.IP;
7717
7718 // Allocate space for vector and generate alloc instruction.
7719 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7720 Builder.restoreIP(AllocaIP);
7721 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7722 ArgsBase->setAlignment(Align(8));
7724
7725 // Store the index value with offset in depend vector.
7726 for (unsigned I = 0; I < NumLoops; ++I) {
7727 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7728 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7729 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7730 STInst->setAlignment(Align(8));
7731 }
7732
7733 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7734 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7735
7736 uint32_t SrcLocStrSize;
7737 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7738 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7739 Value *ThreadId = getOrCreateThreadID(Ident);
7740 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7741
7742 Function *RTLFn = nullptr;
7743 if (IsDependSource)
7744 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7745 else
7746 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7747 createRuntimeFunctionCall(RTLFn, Args);
7748
7749 return Builder.saveIP();
7750}
7751
7753 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7754 FinalizeCallbackTy FiniCB, bool IsThreads) {
7755 if (!updateToLocation(Loc))
7756 return Loc.IP;
7757
7758 Directive OMPD = Directive::OMPD_ordered;
7759 Instruction *EntryCall = nullptr;
7760 Instruction *ExitCall = nullptr;
7761
7762 if (IsThreads) {
7763 uint32_t SrcLocStrSize;
7764 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7765 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7766 Value *ThreadId = getOrCreateThreadID(Ident);
7767 Value *Args[] = {Ident, ThreadId};
7768
7769 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7770 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7771
7772 Function *ExitRTLFn =
7773 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7774 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7775 }
7776
7777 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7778 /*Conditional*/ false, /*hasFinalize*/ true);
7779}
7780
7781OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7782 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7783 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7784 bool HasFinalize, bool IsCancellable) {
7785
7786 if (HasFinalize)
7787 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7788
7789 // Create inlined region's entry and body blocks, in preparation
7790 // for conditional creation
7791 BasicBlock *EntryBB = Builder.GetInsertBlock();
7792 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
7794 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7795 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7796 BasicBlock *FiniBB =
7797 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7798
7799 Builder.SetInsertPoint(EntryBB->getTerminator());
7800 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7801
7802 // generate body
7803 if (Error Err =
7804 BodyGenCB(/* AllocaIP */ InsertPointTy(),
7805 /* CodeGenIP */ Builder.saveIP(), /* DeallocBlocks */ {}))
7806 return Err;
7807
7808 // emit exit call and do any needed finalization.
7809 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7810 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7811 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7812 "Unexpected control flow graph state!!");
7813 InsertPointOrErrorTy AfterIP =
7814 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7815 if (!AfterIP)
7816 return AfterIP.takeError();
7817
7818 // If we are skipping the region of a non conditional, remove the exit
7819 // block, and clear the builder's insertion point.
7820 assert(SplitPos->getParent() == ExitBB &&
7821 "Unexpected Insertion point location!");
7822 auto merged = MergeBlockIntoPredecessor(ExitBB);
7823 BasicBlock *ExitPredBB = SplitPos->getParent();
7824 auto InsertBB = merged ? ExitPredBB : ExitBB;
7826 SplitPos->eraseFromParent();
7827 Builder.SetInsertPoint(InsertBB);
7828
7829 return Builder.saveIP();
7830}
7831
7832OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7833 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7834 // if nothing to do, Return current insertion point.
7835 if (!Conditional || !EntryCall)
7836 return Builder.saveIP();
7837
7838 BasicBlock *EntryBB = Builder.GetInsertBlock();
7839 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7840 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7841 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7842
7843 // Emit thenBB and set the Builder's insertion point there for
7844 // body generation next. Place the block after the current block.
7845 Function *CurFn = EntryBB->getParent();
7846 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7847
7848 // Move Entry branch to end of ThenBB, and replace with conditional
7849 // branch (If-stmt)
7850 Instruction *EntryBBTI = EntryBB->getTerminator();
7851 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7852 EntryBBTI->removeFromParent();
7853 Builder.SetInsertPoint(UI);
7854 Builder.Insert(EntryBBTI);
7855 UI->eraseFromParent();
7856 Builder.SetInsertPoint(ThenBB->getTerminator());
7857
7858 // return an insertion point to ExitBB.
7859 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7860}
7861
7862OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7863 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7864 bool HasFinalize) {
7865
7866 Builder.restoreIP(FinIP);
7867
7868 // If there is finalization to do, emit it before the exit call
7869 if (HasFinalize) {
7870 assert(!FinalizationStack.empty() &&
7871 "Unexpected finalization stack state!");
7872
7873 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7874 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7875
7876 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7877 return std::move(Err);
7878
7879 // Exit condition: insertion point is before the terminator of the new Fini
7880 // block
7881 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7882 }
7883
7884 if (!ExitCall)
7885 return Builder.saveIP();
7886
7887 // place the Exitcall as last instruction before Finalization block terminator
7888 ExitCall->removeFromParent();
7889 Builder.Insert(ExitCall);
7890
7891 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7892 ExitCall->getIterator());
7893}
7894
7896 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7897 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7898 if (!IP.isSet())
7899 return IP;
7900
7902
7903 // creates the following CFG structure
7904 // OMP_Entry : (MasterAddr != PrivateAddr)?
7905 // F T
7906 // | \
7907 // | copyin.not.master
7908 // | /
7909 // v /
7910 // copyin.not.master.end
7911 // |
7912 // v
7913 // OMP.Entry.Next
7914
7915 BasicBlock *OMP_Entry = IP.getBlock();
7916 Function *CurFn = OMP_Entry->getParent();
7917 BasicBlock *CopyBegin =
7918 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7919 BasicBlock *CopyEnd = nullptr;
7920
7921 // If entry block is terminated, split to preserve the branch to following
7922 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7924 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7925 "copyin.not.master.end");
7926 OMP_Entry->getTerminator()->eraseFromParent();
7927 } else {
7928 CopyEnd =
7929 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7930 }
7931
7932 Builder.SetInsertPoint(OMP_Entry);
7933 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7934 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7935 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7936 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7937
7938 Builder.SetInsertPoint(CopyBegin);
7939 if (BranchtoEnd)
7940 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7941
7942 return Builder.saveIP();
7943}
7944
7946 Value *Size, Value *Allocator,
7947 std::string Name) {
7949 if (!updateToLocation(Loc))
7950 return nullptr;
7951
7952 uint32_t SrcLocStrSize;
7953 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7954 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7955 Value *ThreadId = getOrCreateThreadID(Ident);
7956 Value *Args[] = {ThreadId, Size, Allocator};
7957
7958 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7959
7960 return createRuntimeFunctionCall(Fn, Args, Name);
7961}
7962
7964 Value *Align, Value *Size,
7965 Value *Allocator,
7966 std::string Name) {
7968 if (!updateToLocation(Loc))
7969 return nullptr;
7970
7971 uint32_t SrcLocStrSize;
7972 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7973 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7974 Value *ThreadId = getOrCreateThreadID(Ident);
7975 Value *Args[] = {ThreadId, Align, Size, Allocator};
7976
7977 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_aligned_alloc);
7978
7979 return Builder.CreateCall(Fn, Args, Name);
7980}
7981
7983 Value *Addr, Value *Allocator,
7984 std::string Name) {
7986 if (!updateToLocation(Loc))
7987 return nullptr;
7988
7989 uint32_t SrcLocStrSize;
7990 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7991 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7992 Value *ThreadId = getOrCreateThreadID(Ident);
7993 Value *Args[] = {ThreadId, Addr, Allocator};
7994 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7995 return createRuntimeFunctionCall(Fn, Args, Name);
7996}
7997
7999 Value *Size,
8000 const Twine &Name) {
8003
8004 Value *Args[] = {Size};
8005 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc_shared);
8006 CallInst *Call = Builder.CreateCall(Fn, Args, Name);
8008 M.getContext(), M.getDataLayout().getPrefTypeAlign(Int64)));
8009 return Call;
8010}
8011
8013 Type *VarType,
8014 const Twine &Name) {
8015 return createOMPAllocShared(
8016 Loc, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)), Name);
8017}
8018
8020 Value *Addr, Value *Size,
8021 const Twine &Name) {
8024
8025 Value *Args[] = {Addr, Size};
8026 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free_shared);
8027 return Builder.CreateCall(Fn, Args, Name);
8028}
8029
8031 Value *Addr, Type *VarType,
8032 const Twine &Name) {
8033 return createOMPFreeShared(
8034 Loc, Addr, Builder.getInt64(M.getDataLayout().getTypeAllocSize(VarType)),
8035 Name);
8036}
8037
8039 const LocationDescription &Loc, Value *InteropVar,
8040 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
8041 Value *DependenceAddress, bool HaveNowaitClause) {
8044
8045 uint32_t SrcLocStrSize;
8046 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8047 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8048 Value *ThreadId = getOrCreateThreadID(Ident);
8049 if (Device == nullptr)
8050 Device = Constant::getAllOnesValue(Int32);
8051 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
8052 if (NumDependences == nullptr) {
8053 NumDependences = ConstantInt::get(Int32, 0);
8054 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8055 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8056 }
8057 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8058 Value *Args[] = {
8059 Ident, ThreadId, InteropVar, InteropTypeVal,
8060 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
8061
8062 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
8063
8064 return createRuntimeFunctionCall(Fn, Args);
8065}
8066
8068 const LocationDescription &Loc, Value *InteropVar, Value *Device,
8069 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
8072
8073 uint32_t SrcLocStrSize;
8074 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8075 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8076 Value *ThreadId = getOrCreateThreadID(Ident);
8077 if (Device == nullptr)
8078 Device = Constant::getAllOnesValue(Int32);
8079 if (NumDependences == nullptr) {
8080 NumDependences = ConstantInt::get(Int32, 0);
8081 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8082 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8083 }
8084 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8085 Value *Args[] = {
8086 Ident, ThreadId, InteropVar, Device,
8087 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8088
8089 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
8090
8091 return createRuntimeFunctionCall(Fn, Args);
8092}
8093
8095 Value *InteropVar, Value *Device,
8096 Value *NumDependences,
8097 Value *DependenceAddress,
8098 bool HaveNowaitClause) {
8101 uint32_t SrcLocStrSize;
8102 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8103 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8104 Value *ThreadId = getOrCreateThreadID(Ident);
8105 if (Device == nullptr)
8106 Device = Constant::getAllOnesValue(Int32);
8107 if (NumDependences == nullptr) {
8108 NumDependences = ConstantInt::get(Int32, 0);
8109 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
8110 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
8111 }
8112 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
8113 Value *Args[] = {
8114 Ident, ThreadId, InteropVar, Device,
8115 NumDependences, DependenceAddress, HaveNowaitClauseVal};
8116
8117 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
8118
8119 return createRuntimeFunctionCall(Fn, Args);
8120}
8121
8124 llvm::ConstantInt *Size, const llvm::Twine &Name) {
8127
8128 uint32_t SrcLocStrSize;
8129 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8130 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8131 Value *ThreadId = getOrCreateThreadID(Ident);
8132 Constant *ThreadPrivateCache =
8133 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
8134 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
8135
8136 Function *Fn =
8137 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
8138
8139 return createRuntimeFunctionCall(Fn, Args);
8140}
8141
8143 const LocationDescription &Loc,
8145 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
8146 "expected num_threads and num_teams to be specified");
8147
8148 if (!updateToLocation(Loc))
8149 return Loc.IP;
8150
8151 uint32_t SrcLocStrSize;
8152 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8153 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8154 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
8155 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
8156 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD &&
8157 Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD_NO_LOOP);
8158 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
8159 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
8160
8161 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
8162 Function *Kernel = DebugKernelWrapper;
8163
8164 // We need to strip the debug prefix to get the correct kernel name.
8165 StringRef KernelName = Kernel->getName();
8166 const std::string DebugPrefix = "_debug__";
8167 if (KernelName.ends_with(DebugPrefix)) {
8168 KernelName = KernelName.drop_back(DebugPrefix.length());
8169 Kernel = M.getFunction(KernelName);
8170 assert(Kernel && "Expected the real kernel to exist");
8171 }
8172
8173 // Manifest the launch configuration in the metadata matching the kernel
8174 // environment.
8175 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
8176 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
8177
8178 // If MaxThreads is not set and needs adjustment, select the maximum between
8179 // the default workgroup size and the MinThreads value.
8180 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
8181 if (MaxThreadsVal < 0 && UseDefaultMaxThreads) {
8182 if (hasGridValue(T)) {
8183 MaxThreadsVal =
8184 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
8185 Attrs.MinThreads);
8186 } else {
8187 MaxThreadsVal = Attrs.MinThreads;
8188 }
8189 }
8190
8191 if (MaxThreadsVal > 0)
8192 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
8193
8194 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
8195 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
8196 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
8197 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
8198 Constant *ReductionDataSize =
8199 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
8200 Constant *ReductionBufferLength =
8201 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
8202
8204 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
8205 const DataLayout &DL = Fn->getDataLayout();
8206
8207 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
8208 Constant *DynamicEnvironmentInitializer =
8209 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
8210 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
8211 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
8212 DynamicEnvironmentInitializer, DynamicEnvironmentName,
8213 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8214 DL.getDefaultGlobalsAddressSpace());
8215 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8216
8217 Constant *DynamicEnvironment =
8218 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
8219 ? DynamicEnvironmentGV
8220 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
8221 DynamicEnvironmentPtr);
8222
8223 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
8224 ConfigurationEnvironment, {
8225 UseGenericStateMachineVal,
8226 MayUseNestedParallelismVal,
8227 IsSPMDVal,
8228 MinThreads,
8229 MaxThreads,
8230 MinTeams,
8231 MaxTeams,
8232 ReductionDataSize,
8233 ReductionBufferLength,
8234 });
8235 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
8236 KernelEnvironment, {
8237 ConfigurationEnvironmentInitializer,
8238 Ident,
8239 DynamicEnvironment,
8240 });
8241 std::string KernelEnvironmentName =
8242 (KernelName + "_kernel_environment").str();
8243 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
8244 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
8245 KernelEnvironmentInitializer, KernelEnvironmentName,
8246 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
8247 DL.getDefaultGlobalsAddressSpace());
8248 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
8249
8250 Constant *KernelEnvironment =
8251 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
8252 ? KernelEnvironmentGV
8253 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
8254 KernelEnvironmentPtr);
8255 Value *KernelLaunchEnvironment =
8256 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
8257 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
8258 KernelLaunchEnvironment =
8259 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
8260 ? KernelLaunchEnvironment
8261 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
8262 KernelLaunchEnvParamTy);
8263 CallInst *ThreadKind = createRuntimeFunctionCall(
8264 Fn, {KernelEnvironment, KernelLaunchEnvironment});
8265
8266 Value *ExecUserCode = Builder.CreateICmpEQ(
8267 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
8268 "exec_user_code");
8269
8270 // ThreadKind = __kmpc_target_init(...)
8271 // if (ThreadKind == -1)
8272 // user_code
8273 // else
8274 // return;
8275
8276 auto *UI = Builder.CreateUnreachable();
8277 BasicBlock *CheckBB = UI->getParent();
8278 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
8279
8280 BasicBlock *WorkerExitBB = BasicBlock::Create(
8281 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
8282 Builder.SetInsertPoint(WorkerExitBB);
8283 Builder.CreateRetVoid();
8284
8285 auto *CheckBBTI = CheckBB->getTerminator();
8286 Builder.SetInsertPoint(CheckBBTI);
8287 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
8288
8289 CheckBBTI->eraseFromParent();
8290 UI->eraseFromParent();
8291
8292 // Continue in the "user_code" block, see diagram above and in
8293 // openmp/libomptarget/deviceRTLs/common/include/target.h .
8294 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
8295}
8296
// NOTE(review): the first line of this definition (return type and name) and
// two statements in the body are absent from this listing; the comments here
// describe only what the visible lines do.
//
// Returns immediately when updateToLocation(Loc) fails. When both
// TeamsReductionDataSize and TeamsReductionBufferLength are non-zero, the two
// values are folded into the initializer of the kernel environment global at
// aggregate indices {0, 7} and {0, 8} respectively.
8298 int32_t TeamsReductionDataSize,
8299 int32_t TeamsReductionBufferLength) {
8300 if (!updateToLocation(Loc))
8301 return;
8302
8304 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
8305
8307
// Nothing has to be recorded in the kernel environment if either value is 0.
8308 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
8309 return;
8310
8311 Function *Kernel = Builder.GetInsertBlock()->getParent();
8312 // We need to strip the debug prefix to get the correct kernel name.
// Although it is called a prefix, "_debug__" is matched and dropped at the
// end of the function name.
8313 StringRef KernelName = Kernel->getName();
8314 const std::string DebugPrefix = "_debug__";
8315 if (KernelName.ends_with(DebugPrefix))
8316 KernelName = KernelName.drop_back(DebugPrefix.length());
// The kernel environment global is looked up by name:
// "<kernel name>_kernel_environment".
8317 auto *KernelEnvironmentGV =
8318 M.getNamedGlobal((KernelName + "_kernel_environment").str());
8319 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
8320 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
// Build a new constant initializer with the two fields replaced, then install
// it on the global.
8321 auto *NewInitializer = ConstantFoldInsertValueInstruction(
8322 KernelEnvironmentInitializer,
8323 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
8324 NewInitializer = ConstantFoldInsertValueInstruction(
8325 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
8326 {0, 8});
8327 KernelEnvironmentGV->setInitializer(NewInitializer);
8328}
8329
8330static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
8331 bool Min) {
8332 if (Kernel.hasFnAttribute(Name)) {
8333 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
8334 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
8335 }
8336 Kernel.addFnAttr(Name, llvm::utostr(Value));
8337}
8338
// NOTE(review): the line carrying this function's name and parameter list is
// absent from this listing; the body refers to a triple `T` and a function
// `Kernel`.
//
// Returns a {lower, upper} pair of thread bounds read from attributes on
// Kernel. The "omp_target_thread_limit" attribute acts as a cap on the upper
// bound whenever it is non-zero.
8339 std::pair<int32_t, int32_t>
8341 int32_t ThreadLimit =
8342 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
8343
// AMDGPU: the bounds are stored as "LB,UB" in "amdgpu-flat-work-group-size".
// Fall back to {0, ThreadLimit} if the attribute is missing or its upper bound
// does not parse, and to a lower bound of 0 if only the lower bound fails.
8344 if (T.isAMDGPU()) {
8345 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
8346 if (!Attr.isValid() || !Attr.isStringAttribute())
8347 return {0, ThreadLimit};
8348 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
8349 int32_t LB, UB;
8350 if (!llvm::to_integer(UBStr, UB, 10))
8351 return {0, ThreadLimit};
8352 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
8353 if (!llvm::to_integer(LBStr, LB, 10))
8354 return {0, UB};
8355 return {LB, UB};
8356 }
8357
// Other targets: use the NVVMAttr::MaxNTID attribute as upper bound if present.
8358 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
8359 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
8360 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
8361 }
8362 return {0, ThreadLimit};
8363}
8364
// NOTE(review): the first line of this definition (return type, name and first
// parameter) and the final statement of the body are absent from this listing.
//
// Records UB as the "omp_target_thread_limit" attribute on Kernel. For AMDGPU
// the pair is additionally written as "LB,UB" to "amdgpu-flat-work-group-size"
// and the function returns; the handling for other targets is on the missing
// line.
8366 Function &Kernel, int32_t LB,
8367 int32_t UB) {
8368 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
8369
8370 if (T.isAMDGPU()) {
8371 Kernel.addFnAttr("amdgpu-flat-work-group-size",
8372 llvm::utostr(LB) + "," + llvm::utostr(UB));
8373 return;
8374 }
8375
8377}
8378
// NOTE(review): the line carrying this function's name and parameter list is
// absent from this listing.
//
// Returns a {lower, upper} pair of team bounds for Kernel. The lower bound is
// always 0; the upper bound is the parsed "omp_target_num_teams" attribute.
8379 std::pair<int32_t, int32_t>
8381 // TODO: Read from backend annotations if available.
8382 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8383}
8384
// NOTE(review): the first line of this definition (return type, name and
// leading parameters) and the statement guarded by T.isNVPTX() are absent from
// this listing.
//
// For a positive UB a target specific attribute is written; on AMDGPU this is
// "amdgpu-max-num-workgroups" with the value "UB,1,1". The generic
// "omp_target_num_teams" attribute is always written.
8386 int32_t LB, int32_t UB) {
8387 if (UB > 0) {
8388 if (T.isNVPTX())
8390 if (T.isAMDGPU())
8391 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(UB) + ",1,1");
8392 }
8393
// NOTE(review): this stores LB, while the code in this file that reads
// "omp_target_num_teams" returns it as the upper element of the pair -
// confirm that the asymmetry is intended.
8394 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8395}
8396
// Adjusts attributes of an outlined target region function. Nothing is done
// unless the builder is configured for the target device.
//
// NOTE(review): several statements of this body are absent from this listing,
// including the ones selected by the AMDGCN / NVPTX / SPIRV triple checks; only
// the visible DSO-local update is described here.
8397 void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8398 Function *OutlinedFn) {
8399 if (Config.isTargetDevice()) {
8401 // TODO: Determine if DSO local can be set to true.
8402 OutlinedFn->setDSOLocal(false);
8404 if (T.isAMDGCN())
8406 else if (T.isNVPTX())
8408 else if (T.isSPIRV())
8410 }
8411}
8412
8413Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8414 StringRef EntryFnIDName) {
8415 if (Config.isTargetDevice()) {
8416 assert(OutlinedFn && "The outlined function must exist if embedded");
8417 return OutlinedFn;
8418 }
8419
8420 return new GlobalVariable(
8421 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8422 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8423}
8424
8425Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8426 StringRef EntryFnName) {
8427 if (OutlinedFn)
8428 return OutlinedFn;
8429
8430 assert(!M.getGlobalVariable(EntryFnName, true) &&
8431 "Named kernel already exists?");
8432 return new GlobalVariable(
8433 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8434 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8435}
8436
// NOTE(review): the first line of this definition (return type and name) is
// absent from this listing.
//
// Derives the entry function name from EntryInfo and, when required, generates
// the outlined function through GenerateFunctionCallback. OutlinedFn receives
// the generated function or nullptr. When IsOffloadEntry is true, OutlinedFnID
// receives the value returned by registerTargetRegionFunction; otherwise it is
// left untouched. Returns the callback's error if generation fails, success
// otherwise.
8438 TargetRegionEntryInfo &EntryInfo,
8439 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8440 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8441
8442 SmallString<64> EntryFnName;
8443 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8444
// The function body is only generated on the device, or on the host when
// offloading is not mandatory.
8445 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8446 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8447 if (!CBResult)
8448 return CBResult.takeError();
8449 OutlinedFn = *CBResult;
8450 } else {
8451 OutlinedFn = nullptr;
8452 }
8453
8454 // If this target outline function is not an offload entry, we don't need to
8455 // register it. This may be in the case of a false if clause, or if there are
8456 // no OpenMP targets.
8457 if (!IsOffloadEntry)
8458 return Error::success();
8459
// On the device the ID name is the entry name itself; on the host it is a
// platform specific name built from the entry name and "region_id".
8460 std::string EntryFnIDName =
8461 Config.isTargetDevice()
8462 ? std::string(EntryFnName)
8463 : createPlatformSpecificName({EntryFnName, "region_id"});
8464
8465 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8466 EntryFnName, EntryFnIDName);
8467 return Error::success();
8468}
8469
// NOTE(review): the first line of this definition (return type and name) and
// the last argument line of the registerTargetRegionEntryInfo call are absent
// from this listing.
//
// Applies the outlined-function attributes when OutlinedFn is non-null,
// creates the function ID and entry address constants, registers them with
// the offload info manager under EntryInfo, and returns the function ID.
8471 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8472 StringRef EntryFnName, StringRef EntryFnIDName) {
8473 if (OutlinedFn)
8474 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8475 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8476 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8477 OffloadInfoManager.registerTargetRegionEntryInfo(
8478 EntryInfo, EntryAddr, OutlinedFnID,
8480 return OutlinedFnID;
8481}
8482
// NOTE(review): the first line of this definition (return type and name) and
// several lines inside the body are absent from this listing; the comments
// describe only the visible code.
//
// Emits the code that opens and closes a data environment around an optional
// body. A null BodyGenCB marks the construct as standalone, in which case
// MapperFunc must be provided. When IfCond is given, the begin and end code is
// emitted through emitIfClause. Returns a default constructed InsertPointTy if
// updateToLocation(Loc) fails, the first error reported while emitting, or the
// builder's insertion point on success.
8484 const LocationDescription &Loc, InsertPointTy AllocaIP,
8485 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
8486 Value *DeviceID, Value *IfCond, TargetDataInfo &Info,
8487 GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB,
8488 omp::RuntimeFunction *MapperFunc,
8490 BodyGenTy BodyGenType)>
8491 BodyGenCB,
8492 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8493 if (!updateToLocation(Loc))
8494 return InsertPointTy();
8495
8496 Builder.restoreIP(CodeGenIP);
8497
8498 bool IsStandAlone = !BodyGenCB;
// Assigned inside BeginThenGen and read again by EndThenGen.
8499 MapInfosTy *MapInfo;
8500 // Generate the code for the opening of the data environment. Capture all the
8501 // arguments of the runtime call by reference because they are used in the
8502 // closing of the region.
8503 auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8504 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8505 MapInfo = &GenMapInfoCB(Builder.saveIP());
8506 if (Error Err = emitOffloadingArrays(
8507 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8508 /*IsNonContiguous=*/true, DeviceAddrCB))
8509 return Err;
8510
8511 TargetDataRTArgs RTArgs;
8513
8514 // Emit the number of elements in the offloading arrays.
8515 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8516
8517 // Source location for the ident struct
8518 if (!SrcLocInfo) {
8519 uint32_t SrcLocStrSize;
8520 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8521 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8522 }
8523
8524 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8525 SrcLocInfo, DeviceID,
8526 PointerNum, RTArgs.BasePointersArray,
8527 RTArgs.PointersArray, RTArgs.SizesArray,
8528 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8529 RTArgs.MappersArray};
8530
8531 if (IsStandAlone) {
8532 assert(MapperFunc && "MapperFunc missing for standalone target data");
8533
8534 auto TaskBodyCB = [&](Value *, Value *,
8536 if (Info.HasNoWait) {
8537 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8541 }
8542
8544 OffloadingArgs);
8545
8546 if (Info.HasNoWait) {
8547 BasicBlock *OffloadContBlock =
8548 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8549 Function *CurFn = Builder.GetInsertBlock()->getParent();
8550 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8551 Builder.restoreIP(Builder.saveIP());
8552 }
8553 return Error::success();
8554 };
8555
// Only a nowait construct is routed through emitTargetTask; otherwise the
// task body callback is invoked directly.
8556 bool RequiresOuterTargetTask = Info.HasNoWait;
8557 if (!RequiresOuterTargetTask)
8558 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8559 /*TargetTaskAllocaIP=*/{}));
8560 else
8561 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8562 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8563 } else {
8564 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8565 omp::OMPRTL___tgt_target_data_begin_mapper);
8566
8567 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8568
// For every entry whose second slot is an alloca, copy the pointer loaded
// from the first slot into that alloca.
8569 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8570 if (isa<AllocaInst>(DeviceMap.second.second)) {
8571 auto *LI =
8572 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8573 Builder.CreateStore(LI, DeviceMap.second.second);
8574 }
8575 }
8576
8577 // If device pointer privatization is required, emit the body of the
8578 // region here. It will have to be duplicated: with and without
8579 // privatization.
8580 InsertPointOrErrorTy AfterIP =
8581 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8582 if (!AfterIP)
8583 return AfterIP.takeError();
8584 Builder.restoreIP(*AfterIP);
8585 }
8586 return Error::success();
8587 };
8588
8589 // If we need device pointer privatization, we need to emit the body of the
8590 // region with no privatization in the 'else' branch of the conditional.
8591 // Otherwise, we don't have to do anything.
8592 auto BeginElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8593 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
8594 InsertPointOrErrorTy AfterIP =
8595 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8596 if (!AfterIP)
8597 return AfterIP.takeError();
8598 Builder.restoreIP(*AfterIP);
8599 return Error::success();
8600 };
8601
8602 // Generate code for the closing of the data region.
8603 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8604 ArrayRef<BasicBlock *> DeallocBlocks) {
8605 TargetDataRTArgs RTArgs;
8606 Info.EmitDebug = !MapInfo->Names.empty();
8607 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8608
8609 // Emit the number of elements in the offloading arrays.
8610 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8611
8612 // Source location for the ident struct
8613 if (!SrcLocInfo) {
8614 uint32_t SrcLocStrSize;
8615 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8616 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8617 }
8618
8619 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8620 PointerNum, RTArgs.BasePointersArray,
8621 RTArgs.PointersArray, RTArgs.SizesArray,
8622 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8623 RTArgs.MappersArray};
8624 Function *EndMapperFunc =
8625 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8626
8627 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8628 return Error::success();
8629 };
8630
8631 // We don't have to do anything to close the region if the if clause evaluates
8632 // to false.
8633 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
8634 ArrayRef<BasicBlock *> DeallocBlocks) {
8635 return Error::success();
8636 };
8637
// Drive the emission. With a body: begin code, body (NoPriv), end code.
// Without a body: only the begin code, which handles the standalone case.
8638 Error Err = [&]() -> Error {
8639 if (BodyGenCB) {
8640 Error Err = [&]() {
8641 if (IfCond)
8642 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8643 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8644 }();
8645
8646 if (Err)
8647 return Err;
8648
8649 // If we don't require privatization of device pointers, we emit the body
8650 // in between the runtime calls. This avoids duplicating the body code.
8651 InsertPointOrErrorTy AfterIP =
8652 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8653 if (!AfterIP)
8654 return AfterIP.takeError();
8655 restoreIPandDebugLoc(Builder, *AfterIP);
8656
8657 if (IfCond)
8658 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8659 return EndThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8660 }
8661 if (IfCond)
8662 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8663 return BeginThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
8664 }();
8665
8666 if (Err)
8667 return Err;
8668
8669 return Builder.saveIP();
8670}
8671
// NOTE(review): the leading lines of this definition (return type, name and
// the parameters before IsGPUDistribute) are absent from this listing.
//
// Selects a static-init runtime function and returns it via
// getOrCreateRuntimeFunction. The choice depends on three inputs: the
// distribute vs. for family (IsGPUDistribute), the induction variable width
// (IVSize, which must be 32 or 64) and its signedness (IVSigned; the unsigned
// variants carry a "u" suffix).
8674 bool IsGPUDistribute) {
8675 assert((IVSize == 32 || IVSize == 64) &&
8676 "IV size is not compatible with the omp runtime");
8677 RuntimeFunction Name;
8678 if (IsGPUDistribute)
8679 Name = IVSize == 32
8680 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8681 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8682 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8683 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8684 else
8685 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8686 : omp::OMPRTL___kmpc_for_static_init_4u)
8687 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8688 : omp::OMPRTL___kmpc_for_static_init_8u);
8689
8690 return getOrCreateRuntimeFunction(M, Name);
8691}
8692
// NOTE(review): the first line of this definition (return type, name and first
// parameter) is absent from this listing.
//
// Returns the __kmpc_dispatch_init_{4,4u,8,8u} runtime function matching the
// induction variable width (IVSize, 32 or 64) and signedness (IVSigned).
8694 bool IVSigned) {
8695 assert((IVSize == 32 || IVSize == 64) &&
8696 "IV size is not compatible with the omp runtime");
8697 RuntimeFunction Name = IVSize == 32
8698 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8699 : omp::OMPRTL___kmpc_dispatch_init_4u)
8700 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8701 : omp::OMPRTL___kmpc_dispatch_init_8u);
8702
8703 return getOrCreateRuntimeFunction(M, Name);
8704}
8705
// NOTE(review): the first line of this definition (return type, name and first
// parameter) is absent from this listing.
//
// Returns the __kmpc_dispatch_next_{4,4u,8,8u} runtime function matching the
// induction variable width (IVSize, 32 or 64) and signedness (IVSigned).
8707 bool IVSigned) {
8708 assert((IVSize == 32 || IVSize == 64) &&
8709 "IV size is not compatible with the omp runtime");
8710 RuntimeFunction Name = IVSize == 32
8711 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8712 : omp::OMPRTL___kmpc_dispatch_next_4u)
8713 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8714 : omp::OMPRTL___kmpc_dispatch_next_8u);
8715
8716 return getOrCreateRuntimeFunction(M, Name);
8717}
8718
// NOTE(review): the first line of this definition (return type, name and first
// parameter) is absent from this listing.
//
// Returns the __kmpc_dispatch_fini_{4,4u,8,8u} runtime function matching the
// induction variable width (IVSize, 32 or 64) and signedness (IVSigned).
8720 bool IVSigned) {
8721 assert((IVSize == 32 || IVSize == 64) &&
8722 "IV size is not compatible with the omp runtime");
8723 RuntimeFunction Name = IVSize == 32
8724 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8725 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8726 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8727 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8728
8729 return getOrCreateRuntimeFunction(M, Name);
8730}
8731
// NOTE(review): the line carrying this function's signature is absent from
// this listing.
//
// Returns the __kmpc_dispatch_deinit runtime function.
8733 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8734}
8735
// NOTE(review): the first line of this definition and a few lines inside the
// body (among them the declarations of RemappedVariables and DVRsToDelete) are
// absent from this listing.
//
// Updates the debug variable records inside Func after outlining. Location
// operands that appear as keys in ValueReplacementMap are replaced by the
// mapped value, and the record's variable is replaced by one carrying the
// matching argument number. Records are moved into the block that defines
// their location. On the target device a declaration for the trailing
// "dyn_ptr" argument is added. Does nothing if Func has no subprogram.
8737 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8738 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8739
8740 DISubprogram *NewSP = Func->getSubprogram();
8741 if (!NewSP)
8742 return;
8743
8745
// Returns the variable to use for OldVar with argument number arg, creating
// and caching it when there is no cached one with the same argument number.
8746 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8747 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8748 // Only use cached variable if the arg number matches. This is important
8749 // so that DIVariable created for privatized variables are not discarded.
8750 if (NewVar && (arg == NewVar->getArg()))
8751 return NewVar;
8752
8754 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8755 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8756 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8757 return NewVar;
8758 };
8759
// Rewrites the location operands of one debug record. ArgNo stays 0 when no
// operand was replaced; otherwise it is the mapped index plus one.
8760 auto UpdateDebugRecord = [&](auto *DR) {
8761 DILocalVariable *OldVar = DR->getVariable();
8762 unsigned ArgNo = 0;
8763 for (auto Loc : DR->location_ops()) {
8764 auto Iter = ValueReplacementMap.find(Loc);
8765 if (Iter != ValueReplacementMap.end()) {
8766 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8767 ArgNo = std::get<1>(Iter->second) + 1;
8768 }
8769 }
8770 if (ArgNo != 0)
8771 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8772 };
8773
// Records that do not have exactly one location operand get a kill location.
// Otherwise, if the location is defined in another block (the entry block for
// arguments), a clone is inserted before that block's last instruction and
// the original record is queued for deletion.
8775 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8776 if (DVR->getNumVariableLocationOps() != 1u) {
8777 DVR->setKillLocation();
8778 return;
8779 }
8780 Value *Loc = DVR->getVariableLocationOp(0u);
8781 BasicBlock *CurBB = DVR->getParent();
8782 BasicBlock *RequiredBB = nullptr;
8783
8784 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8785 RequiredBB = LocInst->getParent();
8786 else if (isa<llvm::Argument>(Loc))
8787 RequiredBB = &DVR->getFunction()->getEntryBlock();
8788
8789 if (RequiredBB && RequiredBB != CurBB) {
8790 assert(!RequiredBB->empty());
8791 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8792 RequiredBB->back().getIterator());
8793 DVRsToDelete.push_back(DVR);
8794 }
8795 };
8796
8797 // The location and scope of variable intrinsics and records still point to
8798 // the parent function of the target region. Update them.
8799 for (Instruction &I : instructions(Func)) {
8801 "Unexpected debug intrinsic");
8802 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8803 UpdateDebugRecord(&DVR);
8804 MoveDebugRecordToCorrectBlock(&DVR);
8805 }
8806 }
// Deletion is deferred until after the walk over the instructions above.
8807 for (auto *DVR : DVRsToDelete)
8808 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8809 // An extra argument is passed to the device. Create the debug data for it.
8810 if (OMPBuilder.Config.isTargetDevice()) {
8811 DICompileUnit *CU = NewSP->getUnit();
8812 Module *M = Func->getParent();
8813 DIBuilder DB(*M, true, CU);
8814 DIType *VoidPtrTy =
8815 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8816 unsigned ArgNo = Func->arg_size();
8817 DILocalVariable *Var = DB.createParameterVariable(
8818 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8819 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8820 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8821 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8822 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8823 &(*Func->begin()));
8824 }
8825}
8826
// NOTE(review): the line carrying this function's signature is absent from
// this listing.
//
// If V is an address space cast, returns the value being cast (operand 0);
// otherwise returns V unchanged.
8828 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8829 return cast<Operator>(V)->getOperand(0);
8830 return V;
8831}
8832
// NOTE(review): the first line of this definition (return type and name),
// parts of the parameter list and several statements in the body are absent
// from this listing; the comments describe only the visible code.
//
// Creates the internal-linkage function FuncName, which takes one parameter
// per entry of Inputs plus a trailing pointer parameter. The region body is
// generated into it, uses of the inputs inside the new function are rewritten
// to the values produced by ArgAccessorFuncCB, and debug info is fixed up. On
// the target device the kernel execution mode is emitted and the body is
// wrapped in target init / deinit calls. Returns the new function, or the
// error reported by a callback.
8834 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8836 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8839 SmallVector<Type *> ParameterTypes;
8840 if (OMPBuilder.Config.isTargetDevice()) {
8841 // All parameters to target devices are passed as pointers
8842 // or i64. This assumes 64-bit address spaces/pointers.
8843 for (auto &Arg : Inputs)
8844 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8845 ? Arg->getType()
8846 : Type::getInt64Ty(Builder.getContext()));
8847 } else {
8848 for (auto &Arg : Inputs)
8849 ParameterTypes.push_back(Arg->getType());
8850 }
8851
8852 // The implicit dyn_ptr argument is always the last parameter on both host
8853 // and device so the argument counts match without runtime manipulation.
8854 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8855 ParameterTypes.push_back(PtrTy);
8856
8857 auto BB = Builder.GetInsertBlock();
8858 auto M = BB->getModule();
8859 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8860 /*isVarArg*/ false);
8861 auto Func =
8862 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8863
8864 // Forward target-cpu and target-features function attributes from the
8865 // original function to the new outlined function.
8866 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8867
8868 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8869 if (TargetCpuAttr.isStringAttribute())
8870 Func->addFnAttr(TargetCpuAttr);
8871
8872 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8873 if (TargetFeaturesAttr.isStringAttribute())
8874 Func->addFnAttr(TargetFeaturesAttr);
8875
// On the device, emit the execution mode value for this kernel and add it to
// "llvm.compiler.used".
8876 if (OMPBuilder.Config.isTargetDevice()) {
8877 Value *ExecMode =
8878 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8879 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8880 }
8881
8882 // Save insert point.
8883 IRBuilder<>::InsertPointGuard IPG(Builder);
8884 // We will generate the entries in the outlined function but the debug
8885 // location may still be pointing to the parent function. Reset it now.
8886 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8887
8888 // Generate the region into the function.
8889 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8890 Builder.SetInsertPoint(EntryBB);
8891
8892 // Insert target init call in the device compilation pass.
8893 if (OMPBuilder.Config.isTargetDevice())
8894 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8895
8896 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8897
8898 // As we embed the user code in the middle of our target region after we
8899 // generate entry code, we must move what allocas we can into the entry
8900 // block to avoid possible breaking optimisations for device
8901 if (OMPBuilder.Config.isTargetDevice())
8903
8904 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "target.exit");
8905 BasicBlock *OutlinedBodyBB =
8906 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8908 Builder.saveIP(),
8909 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()),
8910 ExitBB);
8911 if (!AfterIP)
8912 return AfterIP.takeError();
8913 Builder.SetInsertPoint(ExitBB);
8914
8915 // Insert target deinit call in the device compilation pass.
8916 if (OMPBuilder.Config.isTargetDevice())
8917 OMPBuilder.createTargetDeinit(Builder);
8918
8919 // Insert return instruction.
8920 Builder.CreateRetVoid();
8921
8922 // New Alloca IP at entry point of created device function.
8923 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8924 auto AllocaIP = Builder.saveIP();
8925
8926 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8927
8928 // Do not include the artificial dyn_ptr argument.
8929 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8930
8932
// Replaces the uses of Input by InputCopy in instructions that belong to Func.
8933 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8934 // Things like GEP's can come in the form of Constants. Constants and
8935 // ConstantExpr's do not have access to the knowledge of what they're
8936 // contained in, so we must dig a little to find an instruction so we
8937 // can tell if they're used inside of the function we're outlining. We
8938 // also replace the original constant expression with a new instruction
8939 // equivalent; an instruction as it allows easy modification in the
8940 // following loop, as we can now know the constant (instruction) is
8941 // owned by our target function and replaceUsesOfWith can now be invoked
8942 // on it (cannot do this with constants it seems). A brand new one also
8943 // allows us to be cautious as it is perhaps possible the old expression
8944 // was used inside of the function but exists and is used externally
8945 // (unlikely by the nature of a Constant, but still).
8946 // NOTE: We cannot remove dead constants that have been rewritten to
8947 // instructions at this stage, we run the risk of breaking later lowering
8948 // by doing so as we could still be in the process of lowering the module
8949 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8950 // constants we have created rewritten versions of.
8951 if (auto *Const = dyn_cast<Constant>(Input))
8952 convertUsersOfConstantsToInstructions(Const, Func, false);
8953
8954 // Collect users before iterating over them to avoid invalidating the
8955 // iteration in case a user uses Input more than once (e.g. a call
8956 // instruction).
8957 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8958 // Collect all the instructions
8960 if (auto *Instr = dyn_cast<Instruction>(User))
8961 if (Instr->getFunction() == Func)
8962 Instr->replaceUsesOfWith(Input, InputCopy);
8963 };
8964
8965 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8966
8967 // Rewrite uses of input values to parameters.
8968 for (auto InArg : zip(Inputs, ArgRange)) {
8969 Value *Input = std::get<0>(InArg);
8970 Argument &Arg = std::get<1>(InArg);
8971 Value *InputCopy = nullptr;
8972
8973 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = ArgAccessorFuncCB(
8974 Arg, Input, InputCopy, AllocaIP, Builder.saveIP(),
8975 OpenMPIRBuilder::InsertPointTy(ExitBB, ExitBB->begin()));
8976 if (!AfterIP)
8977 return AfterIP.takeError();
8978 Builder.restoreIP(*AfterIP);
// Remember the replacement value and argument index for the debug info fixup
// at the end of this function.
8979 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8980
8981 // In certain cases a Global may be set up for replacement, however, this
8982 // Global may be used in multiple arguments to the kernel, just segmented
8983 // apart, for example, if we have a global array, that is sectioned into
8984 // multiple mappings (technically not legal in OpenMP, but there is a case
8985 // in Fortran for Common Blocks where this is necessary), we will end up
8986 // with GEP's into this array inside the kernel, that refer to the Global
8987 // but are technically separate arguments to the kernel for all intents and
8988 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8989 // index, it will fold into a reference to the Global, if we then encounter
8990 // this folded GEP during replacement all of the references to the
8991 // Global in the kernel will be replaced with the argument we have generated
8992 // that corresponds to it, including any other GEP's that refer to the
8993 // Global that may be other arguments. This will invalidate all of the other
8994 // preceding mapped arguments that refer to the same global that may be
8995 // separate segments. To prevent this, we defer global processing until all
8996 // other processing has been performed.
8999 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
9000 continue;
9001 }
9002
9004 continue;
9005
9006 ReplaceValue(Input, InputCopy, Func);
9007 }
9008
9009 // Replace all of our deferred Input values, currently just Globals.
9010 for (auto Deferred : DeferredReplacement)
9011 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
9012
9013 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
9014 ValueReplacementMap);
9015 return Func;
9016}
9017 /// Given a task descriptor, TaskWithPrivates, return the pointer to the block
9018 /// of pointers containing shared data between the parent task and the created
9019 /// task.
// NOTE(review): the first line of this definition (return type and name) is
// absent from this listing.
//
// TaskWithPrivatesTy is the struct type that TaskWithPrivates points to. The
// result is a load of an opaque pointer from the first field of the task
// descriptor.
9021 IRBuilderBase &Builder,
9022 Value *TaskWithPrivates,
9023 Type *TaskWithPrivatesTy) {
9024
9025 Type *TaskTy = OMPIRBuilder.Task;
9026 LLVMContext &Ctx = Builder.getContext();
9027 Value *TaskT =
9028 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
9029 Value *Shareds = TaskT;
9030 // TaskWithPrivatesTy can be one of the following
9031 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9032 // %struct.privates }
9033 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
9034 //
9035 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
9036 // its first member has to be the task descriptor. TaskTy is the type of the
9037 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
9038 // first member of TaskT, gives us the pointer to shared data.
9039 if (TaskWithPrivatesTy != TaskTy)
9040 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
9041 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
9042}
9043 /// Create an entry point for a target task with the following.
9044 /// It'll have the following signature
9045 /// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
9046 /// This function is called from emitTargetTask once the
9047 /// code to launch the target kernel has been outlined already.
9048 /// NumOffloadingArrays is the number of offloading arrays that we need to copy
9049 /// into the task structure so that the deferred target task can access this
9050 /// data even after the stack frame of the generating task has been rolled
9051 /// back. Offloading arrays contain base pointers, pointers, sizes etc
9052 /// of the data that the target kernel will access. These in effect are the
9053 /// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
// NOTE(review): the first line of this definition (return type and name) and
// two lines inside the body are absent from this listing.
//
// StaleCI is the existing call to the kernel launch function; the proxy calls
// the same callee. A positive SharedArgsOperandNo is the index of StaleCI's
// operand that holds the aggregated shared arguments. Returns the newly
// created proxy function.
9055 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
9056 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
9057 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
9058
9059 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
9060 // This is because PrivatesTy is the type of the structure in which
9061 // we pass the offloading arrays to the deferred target task.
9062 assert((!NumOffloadingArrays || PrivatesTy) &&
9063 "PrivatesTy cannot be nullptr when there are offloadingArrays"
9064 "to privatize");
9065
9066 Module &M = OMPBuilder.M;
9067 // KernelLaunchFunction is the target launch function, i.e.
9068 // the function that sets up kernel arguments and calls
9069 // __tgt_target_kernel to launch the kernel on the device.
9070 //
9071 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
9072
9073 // StaleCI is the CallInst which is the call to the outlined
9074 // target kernel launch function. If there are local live-in values
9075 // that the outlined function uses then these are aggregated into a structure
9076 // which is passed as the second argument. If there are no local live-in
9077 // values or if all values used by the outlined kernel are global variables,
9078 // then there's only one argument, the threadID. So, StaleCI can be
9079 //
9080 // %structArg = alloca { ptr, ptr }, align 8
9081 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
9082 // store ptr %20, ptr %gep_, align 8
9083 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
9084 // store ptr %21, ptr %gep_8, align 8
9085 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
9086 //
9087 // OR
9088 //
9089 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
9091 StaleCI->getIterator());
9092
9093 LLVMContext &Ctx = StaleCI->getParent()->getContext();
9094
9095 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
9096 Type *TaskPtrTy = OMPBuilder.TaskPtr;
9097 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
9098
9099 auto ProxyFnTy =
9100 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
9101 /* isVarArg */ false);
9102 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
9103 ".omp_target_task_proxy_func",
9104 Builder.GetInsertBlock()->getModule());
9105 Value *ThreadId = ProxyFn->getArg(0);
9106 Value *TaskWithPrivates = ProxyFn->getArg(1);
9107 ThreadId->setName("thread.id");
9108 TaskWithPrivates->setName("task");
9109
9110 bool HasShareds = SharedArgsOperandNo > 0;
9111 bool HasOffloadingArrays = NumOffloadingArrays > 0;
9112 BasicBlock *EntryBB =
9113 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
9114 Builder.SetInsertPoint(EntryBB);
9115
// Arguments for the launch function: the thread id first, then pointers to
// the privatized offloading arrays, then the copy of the shared arguments.
9116 SmallVector<Value *> KernelLaunchArgs;
9117 KernelLaunchArgs.reserve(StaleCI->arg_size());
9118 KernelLaunchArgs.push_back(ThreadId);
9119
9120 if (HasOffloadingArrays) {
9121 assert(TaskTy != TaskWithPrivatesTy &&
9122 "If there are offloading arrays to pass to the target"
9123 "TaskTy cannot be the same as TaskWithPrivatesTy");
9124 (void)TaskTy;
// The privates struct is field 1 of the task-with-privates struct; each
// offloading array is passed as a pointer to its field inside it.
9125 Value *Privates =
9126 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
9127 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
9128 KernelLaunchArgs.push_back(
9129 Builder.CreateStructGEP(PrivatesTy, Privates, i));
9130 }
9131
9132 if (HasShareds) {
9133 auto *ArgStructAlloca =
9134 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
9135 assert(ArgStructAlloca &&
9136 "Unable to find the alloca instruction corresponding to arguments "
9137 "for extracted function");
9138 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
9139 std::optional<TypeSize> ArgAllocSize =
9140 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9141 assert(ArgStructType && ArgAllocSize &&
9142 "Unable to determine size of arguments for extracted function");
9143 uint64_t StructSize = ArgAllocSize->getFixedValue();
9144
// Allocate a struct of the same type inside the proxy function and fill it
// by copying StructSize bytes from the pointer in LoadShared.
9145 AllocaInst *NewArgStructAlloca =
9146 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
9147
9148 Value *SharedsSize = Builder.getInt64(StructSize);
9149
9151 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
9152
9153 Builder.CreateMemCpy(
9154 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
9155 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
9156 KernelLaunchArgs.push_back(NewArgStructAlloca);
9157 }
9158 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
9159 Builder.CreateRetVoid();
9160 return ProxyFn;
9161}
9163
9164 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
9165 return GEP->getSourceElementType();
9166 if (auto *Alloca = dyn_cast<AllocaInst>(V))
9167 return Alloca->getAllocatedType();
9168
9169 llvm_unreachable("Unhandled Instruction type");
9170 return nullptr;
9171}
9172// This function returns a struct that has at most two members.
9173// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
9174// descriptor. The second member, if needed, is a struct containing arrays
9175// that need to be passed to the offloaded target kernel. For example,
9176// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
9177// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
9178// respectively, then the types created by this function are
9179//
9180// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
9181// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
9182// %struct.privates }
9183// %struct.task_with_privates is returned by this function.
9184// If there aren't any offloading arrays to pass to the target kernel,
9185// %struct.kmp_task_ompbuilder_t is returned.
9186static StructType *
9188 ArrayRef<Value *> OffloadingArraysToPrivatize) {
9189
9190 if (OffloadingArraysToPrivatize.empty())
9191 return OMPIRBuilder.Task;
9192
9193 SmallVector<Type *, 4> StructFieldTypes;
9194 for (Value *V : OffloadingArraysToPrivatize) {
9195 assert(V->getType()->isPointerTy() &&
9196 "Expected pointer to array to privatize. Got a non-pointer value "
9197 "instead");
9198 Type *ArrayTy = getOffloadingArrayType(V);
9199 assert(ArrayTy && "ArrayType cannot be nullptr");
9200 StructFieldTypes.push_back(ArrayTy);
9201 }
9202 StructType *PrivatesStructTy =
9203 StructType::create(StructFieldTypes, "struct.privates");
9204 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
9205 "struct.task_with_privates");
9206}
9208 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
9209 TargetRegionEntryInfo &EntryInfo,
9211 Function *&OutlinedFn, Constant *&OutlinedFnID,
9215
9216 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
9217 [&](StringRef EntryFnName) {
9218 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
9219 EntryFnName, Inputs, CBFunc,
9220 ArgAccessorFuncCB);
9221 };
9222
9223 return OMPBuilder.emitTargetRegionFunction(
9224 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
9225 OutlinedFnID);
9226}
9227
9229 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
9231 const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs,
9232 bool HasNoWait) {
9233
9234 // The following explains the code-gen scenario for the `target` directive. A
9235 // similar scenario is followed for other device-related directives (e.g.
9236 // `target enter data`) in a similar fashion, since we only need to emit a
9237 // task that encapsulates the proper runtime call.
9238 //
9239 // When we arrive at this function, the target region itself has been
9240 // outlined into the function OutlinedFn.
9241 // So at this point, for
9242 // --------------------------------------------------------------
9243 // void user_code_that_offloads(...) {
9244 // omp target depend(..) map(from:a) map(to:b) private(i)
9245 // do i = 1, 10
9246 // a(i) = b(i) + n
9247 // }
9248 //
9249 // --------------------------------------------------------------
9250 //
9251 // we have
9252 //
9253 // --------------------------------------------------------------
9254 //
9255 // void user_code_that_offloads(...) {
9256 // %.offload_baseptrs = alloca [2 x ptr], align 8
9257 // %.offload_ptrs = alloca [2 x ptr], align 8
9258 // %.offload_mappers = alloca [2 x ptr], align 8
9259 // ;; target region has been outlined and now we need to
9260 // ;; offload to it via a target task.
9261 // }
9262 // void outlined_device_function(ptr a, ptr b, ptr n) {
9263 // n = *n_ptr;
9264 // do i = 1, 10
9265 // a(i) = b(i) + n
9266 // }
9267 //
9268 // We have to now do the following
9269 // (i) Make an offloading call to outlined_device_function using the OpenMP
9270 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
9271 // emitted by emitKernelLaunch
9272 // (ii) Create a task entry point function that calls kernel_launch_function
9273 // and is the entry point for the target task. See
9274 // '@.omp_target_task_proxy_func' in the pseudocode below.
9275 // (iii) Create a task with the task entry point created in (ii)
9276 //
9277 // That is we create the following
9278 // struct task_with_privates {
9279 // struct kmp_task_ompbuilder_t task_struct;
9280 // struct privates {
9281 // [2 x ptr] ; baseptrs
9282 // [2 x ptr] ; ptrs
9283 // [2 x i64] ; sizes
9284 // }
9285 // }
9286 // void user_code_that_offloads(...) {
9287 // %.offload_baseptrs = alloca [2 x ptr], align 8
9288 // %.offload_ptrs = alloca [2 x ptr], align 8
9289 // %.offload_sizes = alloca [2 x i64], align 8
9290 //
9291 // %structArg = alloca { ptr, ptr, ptr }, align 8
9292 // %structArg[0] = a
9293 // %structArg[1] = b
9294 // %structArg[2] = &n
9295 //
9296 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
9297 // sizeof(kmp_task_ompbuilder_t),
9298 // sizeof(structArg),
9299 // @.omp_target_task_proxy_func,
9300 // ...)
9301 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
9302 // sizeof(structArg))
9303 // memcpy(target_task_with_privates->privates->baseptrs,
9304 // offload_baseptrs, sizeof(offload_baseptrs))
9305 // memcpy(target_task_with_privates->privates->ptrs,
9306 // offload_ptrs, sizeof(offload_ptrs))
9307 // memcpy(target_task_with_privates->privates->sizes,
9308 // offload_sizes, sizeof(offload_sizes))
9309 // dependencies_array = ...
9310 // ;; if nowait not present
9311 // call @__kmpc_omp_wait_deps(..., dependencies_array)
9312 // call @__kmpc_omp_task_begin_if0(...)
9313 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
9314 // %target_task_with_privates)
9315 // call @__kmpc_omp_task_complete_if0(...)
9316 // }
9317 //
9318 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
9319 // ptr %task) {
9320 // %structArg = alloca {ptr, ptr, ptr}
9321 // %task_ptr = getelementptr(%task, 0, 0)
9322 // %shared_data = load (getelementptr %task_ptr, 0, 0)
9323 // memcpy(%structArg, %shared_data, sizeof(%structArg))
9324 //
9325 // %offloading_arrays = getelementptr(%task, 0, 1)
9326 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
9327 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
9328 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
9329 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
9330 // %offload_sizes, %structArg)
9331 // }
9332 //
9333 // We need the proxy function because the signature of the task entry point
9334 // expected by kmpc_omp_task is always the same and will be different from
9335 // that of the kernel_launch function.
9336 //
9337 // kernel_launch_function is generated by emitKernelLaunch and has the
9338 // always_inline attribute. For this example, it'll look like so:
9339 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
9340 // %offload_sizes, %structArg) alwaysinline {
9341 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
9342 // ; load aggregated data from %structArg
9343 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
9344 // ; offload_sizes
9345 // call i32 @__tgt_target_kernel(...,
9346 // outlined_device_function,
9347 // ptr %kernel_args)
9348 // }
9349 // void outlined_device_function(ptr a, ptr b, ptr n) {
9350 // n = *n_ptr;
9351 // do i = 1, 10
9352 // a(i) = b(i) + n
9353 // }
9354 //
9355 BasicBlock *TargetTaskBodyBB =
9356 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
9357 BasicBlock *TargetTaskAllocaBB =
9358 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
9359
9360 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
9361 TargetTaskAllocaBB->begin());
9362 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
9363
9364 auto OI = std::make_unique<OutlineInfo>();
9365 OI->EntryBB = TargetTaskAllocaBB;
9366 OI->OuterAllocBB = AllocaIP.getBlock();
9367
9368 // Add the thread ID argument.
9370 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
9371 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
9372
9373 // Generate the task body which will subsequently be outlined.
9374 Builder.restoreIP(TargetTaskBodyIP);
9375 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
9376 return Err;
9377
9378 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
9379 // it is given. These blocks are enumerated by
9380 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
9381 // to be outside the region. In other words, OI.ExitBlock is expected to be
9382 // the start of the region after the outlining. We used to set OI.ExitBlock
9383 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
9384 // except when the task body is a single basic block. In that case,
9385 // OI.ExitBlock is set to the single task body block and will get left out of
9386 // the outlining process. So, simply create a new empty block to which we
9387 // unconditionally branch from where TaskBodyCB left off.
9388 OI->ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9389 emitBlock(OI->ExitBB, Builder.GetInsertBlock()->getParent(),
9390 /*IsFinished=*/true);
9391
9392 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9393 bool NeedsTargetTask = HasNoWait && DeviceID;
9394 if (NeedsTargetTask) {
9395 for (auto *V :
9396 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9397 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9398 RTArgs.SizesArray}) {
9400 OffloadingArraysToPrivatize.push_back(V);
9401 OI->ExcludeArgsFromAggregate.push_back(V);
9402 }
9403 }
9404 }
9405 OI->PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9406 DeviceID, OffloadingArraysToPrivatize](
9407 Function &OutlinedFn) mutable {
9408 assert(OutlinedFn.hasOneUse() &&
9409 "there must be a single user for the outlined function");
9410
9411 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9412
9413 // The first argument of StaleCI is always the thread id.
9414 // The next few arguments are the pointers to offloading arrays
9415 // if any. (see OffloadingArraysToPrivatize)
9416 // Finally, all other local values that are live-in into the outlined region
9417 // end up in a structure whose pointer is passed as the last argument. This
9418 // piece of data is passed in the "shared" field of the task structure. So,
9419 // we know we have to pass shareds to the task if the number of arguments is
9420 // greater than OffloadingArraysToPrivatize.size() + 1. The 1 is for the
9421 // thread id. Further, for safety, we assert that, when shareds are present,
9422 // StaleCI has exactly OffloadingArraysToPrivatize.size() + 2 arguments.
9423 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9424 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9425 assert((!HasShareds ||
9426 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9427 "Wrong number of arguments for StaleCI when shareds are present");
9428 int SharedArgOperandNo =
9429 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9430
9431 StructType *TaskWithPrivatesTy =
9432 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9433 StructType *PrivatesTy = nullptr;
9434
9435 if (!OffloadingArraysToPrivatize.empty())
9436 PrivatesTy =
9437 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9438
9440 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9441 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9442
9443 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9444 << "\n");
9445
9446 Builder.SetInsertPoint(StaleCI);
9447
9448 // Gather the arguments for emitting the runtime call.
9449 uint32_t SrcLocStrSize;
9450 Constant *SrcLocStr =
9452 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9453
9454 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9455 //
9456 // If `NeedsTargetTask` is set (`HasNoWait && DeviceID`), we call
9457 // @__kmpc_omp_target_task_alloc, both to provide the DeviceID to the deferred
9458 // task and because @__kmpc_omp_target_task_alloc creates an untied/async task.
9459 Function *TaskAllocFn =
9460 !NeedsTargetTask
9461 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9463 OMPRTL___kmpc_omp_target_task_alloc);
9464
9465 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) of the runtime
9466 // call.
9467 Value *ThreadID = getOrCreateThreadID(Ident);
9468
9469 // Argument - `sizeof_kmp_task_t` (TaskSize)
9470 // Tasksize refers to the size in bytes of kmp_task_t data structure
9471 // plus any other data to be passed to the target task, if any, which
9472 // is packed into a struct. kmp_task_t and the struct so created are
9473 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9474 Value *TaskSize = Builder.getInt64(
9475 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9476
9477 // Argument - `sizeof_shareds` (SharedsSize)
9478 // SharedsSize refers to the shareds array size in the kmp_task_t data
9479 // structure.
9480 Value *SharedsSize = Builder.getInt64(0);
9481 if (HasShareds) {
9482 auto *ArgStructAlloca =
9483 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9484 assert(ArgStructAlloca &&
9485 "Unable to find the alloca instruction corresponding to arguments "
9486 "for extracted function");
9487 std::optional<TypeSize> ArgAllocSize =
9488 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9489 assert(ArgAllocSize &&
9490 "Unable to determine size of arguments for extracted function");
9491 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9492 }
9493
9494 // Argument - `flags`
9495 // Task is tied iff (Flags & 1) == 1.
9496 // Task is untied iff (Flags & 1) == 0.
9497 // Task is final iff (Flags & 2) == 2.
9498 // Task is not final iff (Flags & 2) == 0.
9499 // A target task is not final and is untied.
9500 Value *Flags = Builder.getInt32(0);
9501
9502 // Emit the @__kmpc_omp_task_alloc runtime call
9503 // The runtime call returns a pointer to an area where the task captured
9504 // variables must be copied before the task is run (TaskData)
9505 CallInst *TaskData = nullptr;
9506
9507 SmallVector<llvm::Value *> TaskAllocArgs = {
9508 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9509 /*flags=*/Flags,
9510 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9511 /*task_func=*/ProxyFn};
9512
9513 if (NeedsTargetTask) {
9514 assert(DeviceID && "Expected non-empty device ID.");
9515 TaskAllocArgs.push_back(DeviceID);
9516 }
9517
9518 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9519
9520 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9521 if (HasShareds) {
9522 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9524 *this, Builder, TaskData, TaskWithPrivatesTy);
9525 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9526 SharedsSize);
9527 }
9528 if (!OffloadingArraysToPrivatize.empty()) {
9529 Value *Privates =
9530 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9531 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9532 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9533 [[maybe_unused]] Type *ArrayType =
9534 getOffloadingArrayType(PtrToPrivatize);
9535 assert(ArrayType && "ArrayType cannot be nullptr");
9536
9537 Type *ElementType = PrivatesTy->getElementType(i);
9538 assert(ElementType == ArrayType &&
9539 "ElementType should match ArrayType");
9540 (void)ArrayType;
9541
9542 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9543 Builder.CreateMemCpy(
9544 Dst, Alignment, PtrToPrivatize, Alignment,
9545 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9546 }
9547 }
9548
9549 Value *DepArray = nullptr;
9550 Value *NumDeps = nullptr;
9551 if (Dependencies.DepArray) {
9552 DepArray = Dependencies.DepArray;
9553 NumDeps = Dependencies.NumDeps;
9554 } else if (!Dependencies.Deps.empty()) {
9555 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
9556 NumDeps = Builder.getInt32(Dependencies.Deps.size());
9557 }
9558
9559 // ---------------------------------------------------------------
9560 // V5.2 13.8 target construct
9561 // If the nowait clause is present, execution of the target task
9562 // may be deferred. If the nowait clause is not present, the target task is
9563 // an included task.
9564 // ---------------------------------------------------------------
9565 // The above means that the lack of a nowait on the target construct
9566 // translates to '#pragma omp task if(0)'
9567 if (!NeedsTargetTask) {
9568 if (DepArray) {
9569 Function *TaskWaitFn =
9570 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9572 TaskWaitFn,
9573 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9574 /*ndeps=*/NumDeps,
9575 /*dep_list=*/DepArray,
9576 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9577 /*noalias_dep_list=*/
9579 }
9580 // Included task.
9581 Function *TaskBeginFn =
9582 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9583 Function *TaskCompleteFn =
9584 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9585 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9586 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9587 CI->setDebugLoc(StaleCI->getDebugLoc());
9588 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9589 } else if (DepArray) {
9590 // HasNoWait - meaning the task may be deferred. Call
9591 // __kmpc_omp_task_with_deps if there are dependencies,
9592 // else call __kmpc_omp_task
9593 Function *TaskFn =
9594 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9596 TaskFn,
9597 {Ident, ThreadID, TaskData, NumDeps, DepArray,
9598 ConstantInt::get(Builder.getInt32Ty(), 0),
9600 } else {
9601 // Emit the @__kmpc_omp_task runtime call to spawn the task
9602 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9603 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9604 }
9605
9606 StaleCI->eraseFromParent();
9607 for (Instruction *I : llvm::reverse(ToBeDeleted))
9608 I->eraseFromParent();
9609 };
9610 addOutlineInfo(std::move(OI));
9611
9612 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9613 << *(Builder.GetInsertBlock()) << "\n");
9614 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9615 << *(Builder.GetInsertBlock()->getParent()->getParent())
9616 << "\n");
9617 return Builder.saveIP();
9618}
9619
9621 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9622 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9623 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9624 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9625 if (Error Err =
9626 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9627 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9628 return Err;
9629 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9630 return Error::success();
9631}
9632
9633static void emitTargetCall(
9634 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9639 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9643 const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait,
9644 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9645 // Generate a function call to the host fallback implementation of the target
9646 // region. This is called by the host when no offload entry was generated for
9647 // the target region and when the offloading call fails at runtime.
9648 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9650 Builder.restoreIP(IP);
9651 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9652 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9653 FallbackArgs.push_back(
9654 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9655 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9656 return Builder.saveIP();
9657 };
9658
9659 bool HasDependencies = !Dependencies.empty();
9660 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9661
9663
9664 auto TaskBodyCB =
9665 [&](Value *DeviceID, Value *RTLoc,
9666 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9667 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9668 // produce any.
9670 // emitKernelLaunch makes the necessary runtime call to offload the
9671 // kernel. We then outline all that code into a separate function
9672 // ('kernel_launch_function' in the pseudo code above). This function is
9673 // then called by the target task proxy function (see
9674 // '@.omp_target_task_proxy_func' in the pseudo code above)
9675 // '@.omp_target_task_proxy_func' is generated by
9676 // emitTargetTaskProxyFunction.
9677 if (OutlinedFnID && DeviceID)
9678 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9679 EmitTargetCallFallbackCB, KArgs,
9680 DeviceID, RTLoc, TargetTaskAllocaIP);
9681
9682 // We only need to do the outlining if `DeviceID` is set to avoid calling
9683 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9684 // generating the `else` branch of an `if` clause.
9685 //
9686 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9687 // In this case, we execute the host implementation directly.
9688 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9689 }());
9690
9691 OMPBuilder.Builder.restoreIP(AfterIP);
9692 return Error::success();
9693 };
9694
9695 auto &&EmitTargetCallElse =
9696 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9698 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9699 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9700 // produce any.
9702 if (RequiresOuterTargetTask) {
9703 // Arguments that are intended to be directly forwarded to an
9704 // emitKernelLaunch call are passed as nullptr, since
9705 // OutlinedFnID=nullptr results in that call not being done.
9707 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9708 /*RTLoc=*/nullptr, AllocaIP,
9709 Dependencies, EmptyRTArgs, HasNoWait);
9710 }
9711 return EmitTargetCallFallbackCB(Builder.saveIP());
9712 }());
9713
9714 Builder.restoreIP(AfterIP);
9715 return Error::success();
9716 };
9717
9718 auto &&EmitTargetCallThen =
9719 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9721 ArrayRef<BasicBlock *> DeallocBlocks) -> Error {
9722 Info.HasNoWait = HasNoWait;
9723 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9724
9726 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9727 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9728 /*IsNonContiguous=*/true,
9729 /*ForEndCall=*/false))
9730 return Err;
9731
9732 SmallVector<Value *, 3> NumTeamsC;
9733 for (auto [DefaultVal, RuntimeVal] :
9734 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9735 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9736 : Builder.getInt32(DefaultVal));
9737
9738 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9739 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9740 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9741 if (Clause)
9742 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9743 /*isSigned=*/false);
9744 return Clause;
9745 };
9746 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9747 if (Clause)
9748 Result =
9749 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9750 Result, Clause)
9751 : Clause;
9752 };
9753
9754 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9755 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
9756 SmallVector<Value *, 3> NumThreadsC;
9757 Value *MaxThreadsClause =
9758 RuntimeAttrs.TeamsThreadLimit.size() == 1
9759 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9760 : nullptr;
9761
9762 for (auto [TeamsVal, TargetVal] : zip_equal(
9763 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9764 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9765 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9766
9767 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9768 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9769
9770 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9771 }
9772
9773 unsigned NumTargetItems = Info.NumberOfPtrs;
9774 uint32_t SrcLocStrSize;
9775 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9776 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9777 llvm::omp::IdentFlag(0), 0);
9778
9779 Value *TripCount = RuntimeAttrs.LoopTripCount
9780 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9781 Builder.getInt64Ty(),
9782 /*isSigned=*/false)
9783 : Builder.getInt64(0);
9784
9785 // Request zero groupprivate bytes by default.
9786 if (!DynCGroupMem)
9787 DynCGroupMem = Builder.getInt32(0);
9788
9790 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9791 HasNoWait, /*StrictBlocksAndThreads=*/false, DynCGroupMemFallback);
9792
9793 // Assume no error was returned because TaskBodyCB and
9794 // EmitTargetCallFallbackCB don't produce any.
9796 // The presence of certain clauses on the target directive requires the
9797 // explicit generation of the target task.
9798 if (RequiresOuterTargetTask)
9799 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9800 RTLoc, AllocaIP, Dependencies,
9801 KArgs.RTArgs, Info.HasNoWait);
9802
9803 return OMPBuilder.emitKernelLaunch(
9804 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9805 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9806 }());
9807
9808 Builder.restoreIP(AfterIP);
9809 return Error::success();
9810 };
9811
9812 // If we don't have an ID for the target region, it means an offload entry
9813 // wasn't created. In this case we just run the host fallback directly and
9814 // ignore any potential 'if' clauses.
9815 if (!OutlinedFnID) {
9816 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP(), DeallocBlocks));
9817 return;
9818 }
9819
9820 // If there's no 'if' clause, only generate the kernel launch code path.
9821 if (!IfCond) {
9822 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP(), DeallocBlocks));
9823 return;
9824 }
9825
9826 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9827 EmitTargetCallElse, AllocaIP));
9828}
9829
9831 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9832 InsertPointTy CodeGenIP, ArrayRef<BasicBlock *> DeallocBlocks,
9833 TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo,
9834 const TargetKernelDefaultAttrs &DefaultAttrs,
9835 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9836 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9839 CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies,
9840 bool HasNowait, Value *DynCGroupMem,
9841 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9842
9843 if (!updateToLocation(Loc))
9844 return InsertPointTy();
9845
9846 Builder.restoreIP(CodeGenIP);
9847
9848 Function *OutlinedFn;
9849 Constant *OutlinedFnID = nullptr;
9850 // The target region is outlined into its own function. The LLVM IR for
9851 // the target region itself is generated using the callbacks CBFunc
9852 // and ArgAccessorFuncCB
9854 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9855 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9856 return Err;
9857
9858 // If we are not on the target device, then we need to generate code
9859 // to make a remote call (offload) to the previously outlined function
9860 // that represents the target region. Do that now.
9861 if (!Config.isTargetDevice())
9862 emitTargetCall(*this, Builder, AllocaIP, DeallocBlocks, Info, DefaultAttrs,
9863 RuntimeAttrs, IfCond, OutlinedFn, OutlinedFnID, Inputs,
9864 GenMapInfoCB, CustomMapperCB, Dependencies, HasNowait,
9865 DynCGroupMem, DynCGroupMemFallback);
9866 return Builder.saveIP();
9867}
9868
9869std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9870 StringRef FirstSeparator,
9871 StringRef Separator) {
9872 SmallString<128> Buffer;
9873 llvm::raw_svector_ostream OS(Buffer);
9874 StringRef Sep = FirstSeparator;
9875 for (StringRef Part : Parts) {
9876 OS << Sep << Part;
9877 Sep = Separator;
9878 }
9879 return OS.str().str();
9880}
9881
9882std::string
9884 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9885 Config.separator());
9886}
9887
9889 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9890 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9891 if (Elem.second) {
9892 assert(Elem.second->getValueType() == Ty &&
9893 "OMP internal variable has different type than requested");
9894 } else {
9895 // TODO: investigate the appropriate linkage type used for the global
9896 // variable for possibly changing that to internal or private, or maybe
9897 // create different versions of the function for different OMP internal
9898 // variables.
9899 const DataLayout &DL = M.getDataLayout();
9900 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9901 // default global AS is 1.
9902 // See double-target-call-with-declare-target.f90 and
9903 // declare-target-vars-in-target-region.f90 libomptarget
9904 // tests.
9905 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9906 : M.getTargetTriple().isAMDGPU()
9907 ? 0
9908 : DL.getDefaultGlobalsAddressSpace();
9909 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9912 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9913 Constant::getNullValue(Ty), Elem.first(),
9914 /*InsertBefore=*/nullptr,
9915 GlobalValue::NotThreadLocal, AddressSpaceVal);
9916 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9917 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9918 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9919 Elem.second = GV;
9920 }
9921
9922 return Elem.second;
9923}
9924
9925Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9926 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9927 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9928 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9929}
9930
9932 LLVMContext &Ctx = Builder.getContext();
9933 Value *Null =
9934 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9935 Value *SizeGep =
9936 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9937 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9938 return SizePtrToInt;
9939}
9940
// NOTE(review): the first line(s) of this signature are absent from this
// listing; the body reads a collection named Mappings and the string VarName.
//
// Creates a private, constant global array initialized from Mappings, names it
// VarName, marks its address as globally insignificant (unnamed_addr) and
// returns the new global.
9943 std::string VarName) {
      // Turn the mapping values into a constant data array initializer.
9944 llvm::Constant *MaptypesArrayInit =
9945 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9946 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9947 M, MaptypesArrayInit->getType(),
9948 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9949 VarName);
9950 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9951 return MaptypesArrayGlobal;
9952}
9953
// NOTE(review): the first line of this signature (and listing line 9970) are
// absent from this listing.
//
// Creates the three stack arrays used for a mapper call and records them in
// the MapperAllocas out-parameter:
//   - ".offload_baseptrs": NumOperands elements of Int8Ptr
//   - ".offload_ptrs":     NumOperands elements of Int8Ptr
//   - ".offload_sizes":    NumOperands elements of Int64
// Nothing is emitted when updateToLocation(Loc) fails.
9955 InsertPointTy AllocaIP,
9956 unsigned NumOperands,
9957 struct MapperAllocas &MapperAllocas) {
9958 if (!updateToLocation(Loc))
9959 return;
9960
9961 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9962 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
      // The allocas are placed at the caller-provided alloca insertion point.
9963 Builder.restoreIP(AllocaIP);
9964 AllocaInst *ArgsBase = Builder.CreateAlloca(
9965 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9966 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9967 ".offload_ptrs");
9968 AllocaInst *ArgSizes = Builder.CreateAlloca(
9969 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
      // Hand the created allocas back to the caller.
9971 MapperAllocas.ArgsBase = ArgsBase;
9972 MapperAllocas.Args = Args;
9973 MapperAllocas.ArgSizes = ArgSizes;
9974}
9975
// NOTE(review): the first line of this signature and listing line 9979 (which
// presumably declares the MapperAllocas parameter used below) are absent from
// this listing.
//
// Emits a call to MapperFunc. The arguments passed are, in order: SrcLocInfo,
// DeviceID as an i64, NumOperands as an i32, pointers to the first element of
// the ArgsBase / Args / ArgSizes arrays held in MapperAllocas, MaptypesArg,
// MapnamesArg, and a null pointer.
// Nothing is emitted when updateToLocation(Loc) fails.
9977 Function *MapperFunc, Value *SrcLocInfo,
9978 Value *MaptypesArg, Value *MapnamesArg,
9980 int64_t DeviceID, unsigned NumOperands) {
9981 if (!updateToLocation(Loc))
9982 return;
9983
      // Array types must match the ones the allocas were created with; here
      // they are rebuilt from NumOperands.
9984 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9985 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
      // GEP {0, 0} yields the address of element 0 of each array.
9986 Value *ArgsBaseGEP =
9987 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9988 {Builder.getInt32(0), Builder.getInt32(0)});
9989 Value *ArgsGEP =
9990 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9991 {Builder.getInt32(0), Builder.getInt32(0)});
9992 Value *ArgSizesGEP =
9993 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9994 {Builder.getInt32(0), Builder.getInt32(0)});
      // The trailing call argument is always a null pointer.
9995 Value *NullPtr =
9996 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9997 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9998 Builder.getInt32(NumOperands),
9999 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
10000 MaptypesArg, MapnamesArg, NullPtr});
10001}
10002
// NOTE(review): the first line of this signature is absent from this listing.
//
// Fills RTArgs with the values to pass to the runtime, derived from the arrays
// recorded in Info.RTArgs:
//   - With no pointers (Info.NumberOfPtrs == 0) every RTArgs field is set to a
//     null pointer constant.
//   - Otherwise each array field becomes a GEP to element 0 of the matching
//     Info.RTArgs array.
// ForEndCall selects Info.RTArgs.MapTypesArrayEnd for the map types when that
// array exists; the assertion requires Info.separateBeginEndCalls() whenever
// ForEndCall is set.
10004 TargetDataRTArgs &RTArgs,
10005 TargetDataInfo &Info,
10006 bool ForEndCall) {
10007 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
10008 "expected region end call to runtime only when end call is separate");
       // All pointer flavours below are the same unqualified pointer type; the
       // separate names only document the intended pointee.
10009 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
10010 auto VoidPtrTy = UnqualPtrTy;
10011 auto VoidPtrPtrTy = UnqualPtrTy;
10012 auto Int64Ty = Type::getInt64Ty(M.getContext());
10013 auto Int64PtrTy = UnqualPtrTy;
10014
       // Nothing is mapped: pass null for every array.
10015 if (!Info.NumberOfPtrs) {
10016 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10017 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10018 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
10019 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
10020 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10021 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10022 return;
10023 }
10024
10025 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
10026 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
10027 Info.RTArgs.BasePointersArray,
10028 /*Idx0=*/0, /*Idx1=*/0);
10029 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
10030 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
10031 /*Idx0=*/0,
10032 /*Idx1=*/0);
10033 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
10034 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10035 /*Idx0=*/0, /*Idx1=*/0);
       // For an end call, prefer the dedicated end map-types array when one was
       // created; otherwise fall back to the regular map-types array.
10036 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
10037 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
10038 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
10039 : Info.RTArgs.MapTypesArray,
10040 /*Idx0=*/0,
10041 /*Idx1=*/0);
10042
10043 // Only emit the mapper information arrays if debug information is
10044 // requested.
10045 if (!Info.EmitDebug)
10046 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
10047 else
10048 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
10049 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
10050 /*Idx0=*/0,
10051 /*Idx1=*/0);
10052 // If there is no user-defined mapper, set the mapper array to nullptr to
10053 // avoid an unnecessary data privatization
10054 if (!Info.HasMapper)
10055 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
10056 else
10057 RTArgs.MappersArray =
10058 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
10059}
10060
// NOTE(review): the first line of this signature and listing lines 10065 and
// 10077 (presumably the declarations of NonContigInfo and DimTy, both used
// below) are absent from this listing.
//
// For every entry of NonContigInfo.Dims whose value is not 1, allocates an
// array of "struct.descriptor_dim" records at AllocaIP, fills each record's
// offset/count/stride at CodeGenIP, and stores the array's address into slot I
// of Info.RTArgs.PointersArray.
10062 InsertPointTy CodeGenIP,
10063 MapInfosTy &CombinedInfo,
10064 TargetDataInfo &Info) {
10066 CombinedInfo.NonContigInfo;
10067
10068 // Build an array of struct descriptor_dim and then assign it to
10069 // offload_args.
10070 //
10071 // struct descriptor_dim {
10072 // uint64_t offset;
10073 // uint64_t count;
10074 // uint64_t stride;
10075 // };
10076 Type *Int64Ty = Builder.getInt64Ty();
10078 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
10079 "struct.descriptor_dim");
10080
       // Field indices into descriptor_dim.
10081 enum { OffsetFD = 0, CountFD, StrideFD };
10082 // We need two index variable here since the size of "Dims" is the same as
10083 // the size of Components, however, the size of offset, count, and stride is
10084 // equal to the size of base declaration that is non-contiguous.
       // I walks Dims; L walks Offsets/Counts/Strides and only advances for
       // entries that are actually emitted.
10085 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
10086 // Skip emitting ir if dimension size is 1 since it cannot be
10087 // non-contiguous.
10088 if (NonContigInfo.Dims[I] == 1)
10089 continue;
10090 Builder.restoreIP(AllocaIP);
10091 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
10092 AllocaInst *DimsAddr =
10093 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
10094 Builder.restoreIP(CodeGenIP);
10095 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
       // Record II is filled from the source entry at the mirrored position,
       // i.e. the per-dimension data is written in reverse order.
10096 unsigned RevIdx = EE - II - 1;
10097 Value *DimsLVal = Builder.CreateInBoundsGEP(
10098 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
       // NOTE(review): the alignments for the three stores below are derived
       // from the type of the destination pointer, not from the stored value.
10099 // Offset
10100 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
10101 Builder.CreateAlignedStore(
10102 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
10103 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
10104 // Count
10105 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
10106 Builder.CreateAlignedStore(
10107 NonContigInfo.Counts[L][RevIdx], CountLVal,
10108 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10109 // Stride
       // NOTE(review): the alignment here is taken from CountLVal rather than
       // StrideLVal; this looks like a copy/paste slip.
10110 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
10111 Builder.CreateAlignedStore(
10112 NonContigInfo.Strides[L][RevIdx], StrideLVal,
10113 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
10114 }
10115 // args[I] = &dims
10116 Builder.restoreIP(CodeGenIP);
10117 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
10118 DimsAddr, Builder.getPtrTy());
10119 Value *P = Builder.CreateConstInBoundsGEP2_32(
10120 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
10121 Info.RTArgs.PointersArray, 0, I);
10122 Builder.CreateAlignedStore(
10123 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
10124 ++L;
10125 }
10126}
10127
// Emits, inside MapperFn, the guarded "array init" (IsInit == true) or "array
// delete" (IsInit == false) step of a user-defined mapper.
//
// A condition is computed from Size, Base/Begin and the OMP_MAP_DELETE bit of
// MapType. When it holds, control enters a new body block that pushes one
// component via __tgt_push_mapper_component covering Size * ElementSize bytes,
// with OMP_MAP_TO/OMP_MAP_FROM cleared and OMP_MAP_IMPLICIT set in the map
// type. When it does not hold, control branches to ExitBB. On return the
// builder is left inside the body block.
//
// NOTE(review): listing lines 10135 and 10184 (presumably the start of the
// BodyBB declaration and of the runtime call statement) are absent from this
// listing.
10128void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
10129 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
10130 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
10131 BasicBlock *ExitBB, bool IsInit) {
10132 StringRef Prefix = IsInit ? ".init" : ".del";
10133
10134 // Evaluate if this is an array section.
10136 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
       // More than one element means an array section. The value name says
       // "arrayinit" for both the init and the delete variant.
10137 Value *IsArray =
10138 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
10139 Value *DeleteBit = Builder.CreateAnd(
10140 MapType,
10141 Builder.getInt64(
10142 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10143 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
10144 Value *DeleteCond;
10145 Value *Cond;
10146 if (IsInit) {
       // Init: (is array || base != begin) && delete bit clear.
       // Note that BaseIsBegin actually holds the result of base != begin.
10147 // base != begin?
10148 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
10149 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
10150 DeleteCond = Builder.CreateIsNull(
10151 DeleteBit,
10152 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10153 } else {
       // Delete: is array && delete bit set.
10154 Cond = IsArray;
10155 DeleteCond = Builder.CreateIsNotNull(
10156 DeleteBit,
10157 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
10158 }
10159 Cond = Builder.CreateAnd(Cond, DeleteCond);
10160 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
10161
10162 emitBlock(BodyBB, MapperFn);
10163 // Get the array size by multiplying element size and element number (i.e., \p
10164 // Size).
10165 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
10166 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
10167 // memory allocation/deletion purpose only.
10168 Value *MapTypeArg = Builder.CreateAnd(
10169 MapType,
10170 Builder.getInt64(
10171 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10172 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10173 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
       // Additionally mark the component as implicit.
10174 MapTypeArg = Builder.CreateOr(
10175 MapTypeArg,
10176 Builder.getInt64(
10177 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10178 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
10179
10180 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10181 // data structure.
10182 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
10183 ArraySize, MapTypeArg, MapName};
10185 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10186 OffloadingArgs);
10187}
10188
// NOTE(review): the first lines of this signature and listing lines 10209 and
// 10396 (presumably the creation of MapperFn and the start of a runtime call
// statement) are absent from this listing.
//
// Generates the body of a user-defined mapper function and returns it.
//
// The generated function takes six arguments (handle, base pointer, begin
// pointer, size in bytes, map type, map name) and returns void. Its body:
//   1. converts the byte size into an element count for ElemTy,
//   2. emits the guarded array-init step,
//   3. loops over every element, asking GenMapInfoCB for the components of the
//      current element and forwarding each one either to the child mapper
//      returned by CustomMapperCB(I) or to __tgt_push_mapper_component,
//   4. emits the guarded array-delete step.
//
// An error from GenMapInfoCB or CustomMapperCB is returned to the caller.
// NOTE(review): those error returns happen before Builder.restoreIP(SavedIP),
// so on failure the builder's insertion point is left inside MapperFn.
10191 llvm::Value *BeginArg)>
10192 GenMapInfoCB,
10193 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB,
10194 bool PreserveMemberOfFlags) {
       // Parameter types: (ptr, ptr, ptr, i64, i64, ptr).
10195 SmallVector<Type *> Params;
10196 Params.emplace_back(Builder.getPtrTy());
10197 Params.emplace_back(Builder.getPtrTy());
10198 Params.emplace_back(Builder.getPtrTy());
10199 Params.emplace_back(Builder.getInt64Ty());
10200 Params.emplace_back(Builder.getInt64Ty());
10201 Params.emplace_back(Builder.getPtrTy());
10202
10203 auto *FnTy =
10204 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
10205
10206 SmallString<64> TyStr;
10207 raw_svector_ostream Out(TyStr);
10208 Function *MapperFn =
10210 MapperFn->addFnAttr(Attribute::NoInline);
10211 MapperFn->addFnAttr(Attribute::NoUnwind);
10212 MapperFn->addParamAttr(0, Attribute::NoUndef);
10213 MapperFn->addParamAttr(1, Attribute::NoUndef);
10214 MapperFn->addParamAttr(2, Attribute::NoUndef);
10215 MapperFn->addParamAttr(3, Attribute::NoUndef);
10216 MapperFn->addParamAttr(4, Attribute::NoUndef);
10217 MapperFn->addParamAttr(5, Attribute::NoUndef);
10218
10219 // Start the mapper function code generation.
10220 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
       // Remember where the caller was emitting; restored just before the
       // successful return.
10221 auto SavedIP = Builder.saveIP();
10222 Builder.SetInsertPoint(EntryBB);
10223
10224 Value *MapperHandle = MapperFn->getArg(0);
10225 Value *BaseIn = MapperFn->getArg(1);
10226 Value *BeginIn = MapperFn->getArg(2);
10227 Value *Size = MapperFn->getArg(3);
10228 Value *MapType = MapperFn->getArg(4);
10229 Value *MapName = MapperFn->getArg(5);
10230
10231 // Compute the starting and end addresses of array elements.
10232 // Prepare common arguments for array initiation and deletion.
10233 // Convert the size in bytes into the number of array elements.
10234 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
       // From here on Size is an element count, not a byte count.
10235 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
10236 Value *PtrBegin = BeginIn;
10237 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
10238
10239 // Emit array initiation if this is an array section and \p MapType indicates
10240 // that memory allocation is required.
10241 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
10242 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10243 MapType, MapName, ElementSize, HeadBB,
10244 /*IsInit=*/true);
10245
10246 // Emit a for loop to iterate through SizeArg of elements and map all of them.
10247
10248 // Emit the loop header block.
10249 emitBlock(HeadBB, MapperFn);
10250 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
10251 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
10252 // Evaluate whether the initial condition is satisfied.
10253 Value *IsEmpty =
10254 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
10255 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
10256
10257 // Emit the loop body block.
10258 emitBlock(BodyBB, MapperFn);
       // LastBB tracks the block the loop back-edge will come from; it moves
       // forward as the per-component blocks below are emitted.
10259 BasicBlock *LastBB = BodyBB;
10260 PHINode *PtrPHI =
10261 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
10262 PtrPHI->addIncoming(PtrBegin, HeadBB);
10263
10264 // Get map clause information. Fill up the arrays with all mapped variables.
10265 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
10266 if (!Info)
10267 return Info.takeError();
10268
10269 // Call the runtime API __tgt_mapper_num_components to get the number of
10270 // pre-existing components.
10271 Value *OffloadingArgs[] = {MapperHandle};
10272 Value *PreviousSize = createRuntimeFunctionCall(
10273 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
10274 OffloadingArgs);
       // Shift the component count into the position given by
       // getFlagMemberOffset() so it can be added to a map type below.
10275 Value *ShiftedPreviousSize =
10276 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
10277
10278 // Fill up the runtime mapper handle for all components.
10279 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
10280 Value *CurBaseArg = Info->BasePointers[I];
10281 Value *CurBeginArg = Info->Pointers[I];
10282 Value *CurSizeArg = Info->Sizes[I];
       // Names are optional; pass a null pointer when none were provided.
10283 Value *CurNameArg = Info->Names.size()
10284 ? Info->Names[I]
10285 : Constant::getNullValue(Builder.getPtrTy());
10286
10287 // Extract the MEMBER_OF field from the map type.
10288 Value *OriMapType = Builder.getInt64(
10289 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10290 Info->Types[I]));
10291 Value *MemberMapType;
       // With PreserveMemberOfFlags, only components that already carry
       // MEMBER_OF bits are offset by the previous component count; without
       // it, every component is offset.
10292 if (PreserveMemberOfFlags) {
10293 constexpr uint64_t MemberOfMask =
10294 static_cast<uint64_t>(OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
10295 uint64_t OrigFlags =
10296 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10297 Info->Types[I]);
10298 bool HasMemberOf = (OrigFlags & MemberOfMask) != 0;
10299 if (HasMemberOf)
10300 MemberMapType = Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
10301 else
10302 MemberMapType = OriMapType;
10303 } else {
10304 MemberMapType = Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
10305 }
10306
10307 // Combine the map type inherited from user-defined mapper with that
10308 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
10309 // bits of the \a MapType, which is the input argument of the mapper
10310 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
10311 // bits of MemberMapType.
10312 // [OpenMP 5.0], 1.2.6. map-type decay.
10313 // | alloc | to | from | tofrom | release | delete
10314 // ----------------------------------------------------------
10315 // alloc | alloc | alloc | alloc | alloc | release | delete
10316 // to | alloc | to | alloc | to | release | delete
10317 // from | alloc | alloc | from | from | release | delete
10318 // tofrom | alloc | to | from | tofrom | release | delete
10319 Value *LeftToFrom = Builder.CreateAnd(
10320 MapType,
10321 Builder.getInt64(
10322 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10323 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10324 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
       // Control flow emitted below:
       //   alloc? -> AllocBB -> EndBB
       //   else AllocElseBB: to? -> ToBB -> EndBB
       //   else ToElseBB: from? -> FromBB -> EndBB, otherwise straight to EndBB
10325 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
10326 BasicBlock *AllocElseBB =
10327 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
10328 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
10329 BasicBlock *ToElseBB =
10330 BasicBlock::Create(M.getContext(), "omp.type.to.else");
10331 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
10332 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
10333 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
10334 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
10335 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
10336 emitBlock(AllocBB, MapperFn);
10337 Value *AllocMapType = Builder.CreateAnd(
10338 MemberMapType,
10339 Builder.getInt64(
10340 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10341 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10342 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10343 Builder.CreateBr(EndBB);
10344 emitBlock(AllocElseBB, MapperFn);
10345 Value *IsTo = Builder.CreateICmpEQ(
10346 LeftToFrom,
10347 Builder.getInt64(
10348 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10349 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10350 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
10351 // In case of to, clear OMP_MAP_FROM.
10352 emitBlock(ToBB, MapperFn);
10353 Value *ToMapType = Builder.CreateAnd(
10354 MemberMapType,
10355 Builder.getInt64(
10356 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10357 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10358 Builder.CreateBr(EndBB);
10359 emitBlock(ToElseBB, MapperFn);
10360 Value *IsFrom = Builder.CreateICmpEQ(
10361 LeftToFrom,
10362 Builder.getInt64(
10363 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10364 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10365 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
10366 // In case of from, clear OMP_MAP_TO.
10367 emitBlock(FromBB, MapperFn);
10368 Value *FromMapType = Builder.CreateAnd(
10369 MemberMapType,
10370 Builder.getInt64(
10371 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10372 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10373 // In case of tofrom, do nothing.
       // FromBB has no explicit terminator yet; emitBlock adds the fall-through
       // branch into EndBB.
10374 emitBlock(EndBB, MapperFn);
10375 LastBB = EndBB;
       // Merge the four possible map types; the ToElseBB edge is the tofrom
       // case and keeps MemberMapType unchanged.
10376 PHINode *CurMapType =
10377 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
10378 CurMapType->addIncoming(AllocMapType, AllocBB);
10379 CurMapType->addIncoming(ToMapType, ToBB);
10380 CurMapType->addIncoming(FromMapType, FromBB);
10381 CurMapType->addIncoming(MemberMapType, ToElseBB);
10382
10383 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
10384 CurSizeArg, CurMapType, CurNameArg};
10385
       // Ask the caller whether component I has its own (child) mapper.
10386 auto ChildMapperFn = CustomMapperCB(I);
10387 if (!ChildMapperFn)
10388 return ChildMapperFn.takeError();
10389 if (*ChildMapperFn) {
10390 // Call the corresponding mapper function.
10391 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
10392 ->setDoesNotThrow();
10393 } else {
10394 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10395 // data structure.
10397 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10398 OffloadingArgs);
10399 }
10400 }
10401
10402 // Update the pointer to point to the next element that needs to be mapped,
10403 // and check whether we have mapped all elements.
10404 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
10405 "omp.arraymap.next");
10406 PtrPHI->addIncoming(PtrNext, LastBB);
10407 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
10408 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
10409 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
10410
10411 emitBlock(ExitBB, MapperFn);
10412 // Emit array deletion if this is an array section and \p MapType indicates
10413 // that deletion is required.
10414 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10415 MapType, MapName, ElementSize, DoneBB,
10416 /*IsInit=*/false);
10417
10418 // Emit the function exit block.
10419 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10420
10421 Builder.CreateRetVoid();
10422 Builder.restoreIP(SavedIP);
10423 return MapperFn;
10424}
10425
// NOTE(review): the first line of this signature and listing lines 10524 and
// 10541 (presumably the declaration of Mapping and the null value assigned to
// MapNamesArray) are absent from this listing.
//
// Materializes the offloading arrays described by CombinedInfo and records
// them in Info.RTArgs:
//   - base pointers, pointers and mappers: stack arrays created at AllocaIP
//     and filled at CodeGenIP,
//   - sizes: a constant global when every size is a plain constant, a stack
//     array when every size is a runtime value, or a stack array initialized
//     by memcpy from the constant global and then patched for the runtime
//     entries,
//   - map types (and, if needed, a separate end-of-region variant) and map
//     names: constant globals.
// DeviceAddrCB, when provided, is invoked for entries flagged as device
// pointer or device address. Returns an error if CustomMapperCB fails.
10427 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10428 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10429 bool IsNonContiguous,
10430 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10431
10432 // Reset the array information.
10433 Info.clearArrayInfo();
10434 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10435
       // Nothing to map: leave the cleared Info as is.
10436 if (Info.NumberOfPtrs == 0)
10437 return Error::success();
10438
10439 Builder.restoreIP(AllocaIP);
10440 // Detect if we have any capture size requiring runtime evaluation of the
10441 // size so that a constant array could be eventually used.
10442 ArrayType *PointerArrayType =
10443 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10444
10445 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10446 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10447
10448 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10449 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10450 AllocaInst *MappersArray = Builder.CreateAlloca(
10451 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10452 Info.RTArgs.MappersArray = MappersArray;
10453
10454 // If we don't have any VLA types or other types that require runtime
10455 // evaluation, we can use a constant array for the map sizes, otherwise we
10456 // need to fill up the arrays as we do for the pointers.
10457 Type *Int64Ty = Builder.getInt64Ty();
       // ConstSizes holds the compile-time size per entry (0 placeholder for
       // runtime entries); RuntimeSizes flags the entries needing a store.
10458 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10459 ConstantInt::get(Int64Ty, 0));
10460 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10461 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10462 bool IsNonContigEntry =
10463 IsNonContiguous &&
10464 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10465 CombinedInfo.Types[I] &
10466 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10467 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10468 // descriptor_dim records), not the byte size.
10469 if (IsNonContigEntry) {
10470 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10471 "Index must be in-bounds for NON_CONTIG Dims array");
10472 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10473 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10474 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10475 continue;
10476 }
       // Constant expressions and global values are treated as runtime sizes
       // even though they are Constants.
10477 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10478 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10479 ConstSizes[I] = CI;
10480 continue;
10481 }
10482 }
10483 RuntimeSizes.set(I);
10484 }
10485
10486 if (RuntimeSizes.all()) {
       // Every size is computed at run time: a plain stack array suffices.
10487 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10488 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10489 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10490 restoreIPandDebugLoc(Builder, CodeGenIP);
10491 } else {
       // At least one size is constant: emit the constant sizes as a global.
10492 auto *SizesArrayInit = ConstantArray::get(
10493 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10494 std::string Name = createPlatformSpecificName({"offload_sizes"});
10495 auto *SizesArrayGbl =
10496 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10497 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10498 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10499
10500 if (!RuntimeSizes.any()) {
10501 Info.RTArgs.SizesArray = SizesArrayGbl;
10502 } else {
       // Mixed case: copy the constant global into a stack buffer so the
       // runtime entries can be overwritten later.
10503 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10504 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10505 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10506 AllocaInst *Buffer = Builder.CreateAlloca(
10507 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10508 Buffer->setAlignment(OffloadSizeAlign);
10509 restoreIPandDebugLoc(Builder, CodeGenIP);
       // NOTE(review): the destination alignment passed here is derived from
       // Buffer's pointer type, while Buffer itself was aligned to
       // OffloadSizeAlign just above; confirm this is intended.
10510 Builder.CreateMemCpy(
10511 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10512 SizesArrayGbl, OffloadSizeAlign,
10513 Builder.getIntN(
10514 IndexSize,
10515 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10516
10517 Info.RTArgs.SizesArray = Buffer;
10518 }
10519 restoreIPandDebugLoc(Builder, CodeGenIP);
10520 }
10521
10522 // The map types are always constant so we don't need to generate code to
10523 // fill arrays. Instead, we create an array constant.
10525 for (auto mapFlag : CombinedInfo.Types)
10526 Mapping.push_back(
10527 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10528 mapFlag));
10529 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10530 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10531 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10532
10533 // The information types are only built if provided.
10534 if (!CombinedInfo.Names.empty()) {
10535 auto *MapNamesArrayGbl = createOffloadMapnames(
10536 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10537 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10538 Info.EmitDebug = true;
10539 } else {
10540 Info.RTArgs.MapNamesArray =
10542 Info.EmitDebug = false;
10543 }
10544
10545 // If there's a present map type modifier, it must not be applied to the end
10546 // of a region, so generate a separate map type array in that case.
10547 if (Info.separateBeginEndCalls()) {
10548 bool EndMapTypesDiffer = false;
       // Mapping is modified in place: OMP_MAP_PRESENT is stripped from every
       // entry that has it.
10549 for (uint64_t &Type : Mapping) {
10550 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10551 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10552 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10553 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10554 EndMapTypesDiffer = true;
10555 }
10556 }
10557 if (EndMapTypesDiffer) {
10558 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10559 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10560 }
10561 }
10562
       // Fill the per-entry slots of the base-pointer, pointer, size and
       // mapper arrays.
10563 PointerType *PtrTy = Builder.getPtrTy();
10564 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10565 Value *BPVal = CombinedInfo.BasePointers[I];
10566 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10567 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10568 0, I);
10569 Builder.CreateAlignedStore(BPVal, BP,
10570 M.getDataLayout().getPrefTypeAlign(PtrTy));
10571
10572 if (Info.requiresDevicePointerInfo()) {
10573 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
       // Device pointer: pair the slot with a fresh alloca created at
       // AllocaIP. CodeGenIP (a by-value parameter) is updated to the current
       // position here and reused afterwards.
10574 CodeGenIP = Builder.saveIP();
10575 Builder.restoreIP(AllocaIP);
10576 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10577 Builder.restoreIP(CodeGenIP);
10578 if (DeviceAddrCB)
10579 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10580 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
       // Device address: the slot itself serves as both map values.
10581 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10582 if (DeviceAddrCB)
10583 DeviceAddrCB(I, BP);
10584 }
10585 }
10586
10587 Value *PVal = CombinedInfo.Pointers[I];
10588 Value *P = Builder.CreateConstInBoundsGEP2_32(
10589 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10590 I);
10591 // TODO: Check alignment correct.
10592 Builder.CreateAlignedStore(PVal, P,
10593 M.getDataLayout().getPrefTypeAlign(PtrTy));
10594
10595 if (RuntimeSizes.test(I)) {
10596 Value *S = Builder.CreateConstInBoundsGEP2_32(
10597 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10598 /*Idx0=*/0,
10599 /*Idx1=*/I);
       // NOTE(review): the stored value is an Int64Ty, but the alignment is
       // computed from PtrTy; confirm this is intended.
10600 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10601 Int64Ty,
10602 /*isSigned=*/true),
10603 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10604 }
10605 // Fill up the mapper array.
10606 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
       // Default to a null mapper unless the callback supplies one.
10607 Value *MFunc = ConstantPointerNull::get(PtrTy);
10608
10609 auto CustomMFunc = CustomMapperCB(I);
10610 if (!CustomMFunc)
10611 return CustomMFunc.takeError();
10612 if (*CustomMFunc)
10613 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10614
10615 Value *MAddr = Builder.CreateInBoundsGEP(
10616 PointerArrayType, MappersArray,
10617 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10618 Builder.CreateAlignedStore(
10619 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10620 }
10621
       // The NumberOfPtrs == 0 test is redundant here: that case already
       // returned near the top of the function.
10622 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10623 Info.NumberOfPtrs == 0)
10624 return Error::success();
10625 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10626 return Error::success();
10627}
10628
// NOTE(review): the signature line of this definition is absent from this
// listing; the body branches to a block named Target.
//
// Ends the current block with an unconditional branch to Target, unless there
// is no current block or it already has a terminator. In every case the
// builder's insertion point is cleared afterwards.
10630 BasicBlock *CurBB = Builder.GetInsertBlock();
10631
10632 if (!CurBB || CurBB->hasTerminator()) {
10633 // If there is no insert point or the previous block is already
10634 // terminated, don't touch it.
10635 } else {
10636 // Otherwise, create a fall-through branch.
10637 Builder.CreateBr(Target);
10638 }
10639
10640 Builder.ClearInsertionPoint();
10641}
10642
// NOTE(review): the first line of this signature is absent from this listing;
// the body uses values named BB and CurFn in addition to IsFinished.
//
// Makes BB the current block: the previous block (if open) falls through to
// BB, BB is inserted into CurFn right after the previous block when that block
// belongs to a function, or at the end of CurFn otherwise, and the builder is
// positioned at BB.
// When IsFinished is set and BB has no uses after the fall-through branch was
// emitted, BB is erased instead and the insertion point stays cleared.
10644 bool IsFinished) {
       // Capture the current block before emitBranch clears the insert point.
10645 BasicBlock *CurBB = Builder.GetInsertBlock();
10646
10647 // Fall out of the current block (if necessary).
10648 emitBranch(BB);
10649
10650 if (IsFinished && BB->use_empty()) {
10651 BB->eraseFromParent();
10652 return;
10653 }
10654
10655 // Place the block after the current block, if possible, or else at
10656 // the end of the function.
10657 if (CurBB && CurBB->getParent())
10658 CurFn->insert(std::next(CurBB->getIterator()), BB);
10659 else
10660 CurFn->insert(CurFn->end(), BB);
10661 Builder.SetInsertPoint(BB);
10662}
10663
// NOTE(review): the first line of this signature is absent from this listing;
// the body uses values named Cond and ThenGen in addition to the parameters
// shown.
//
// Emits code for an if/else on Cond.
//   - When Cond is a ConstantInt, only the selected generator (ThenGen for a
//     non-zero value, ElseGen for zero) is run at the current position and its
//     result is returned; no branch is emitted.
//   - Otherwise a conditional branch to "omp_if.then"/"omp_if.else" is
//     emitted, each generator runs in its block, and both arms continue into
//     "omp_if.end".
// An error from either generator is returned immediately.
10665 BodyGenCallbackTy ElseGen,
10666 InsertPointTy AllocaIP,
10667 ArrayRef<BasicBlock *> DeallocBlocks) {
10668 // If the condition constant folds and can be elided, try to avoid emitting
10669 // the condition and the dead arm of the if/else.
10670 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10671 auto CondConstant = CI->getSExtValue();
10672 if (CondConstant)
10673 return ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10674
10675 return ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks);
10676 }
10677
       // The new blocks are added to the function containing the current
       // insertion block.
10678 Function *CurFn = Builder.GetInsertBlock()->getParent();
10679
10680 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10681 // emit the conditional branch.
10682 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10683 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10684 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10685 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10686 // Emit the 'then' code.
10687 emitBlock(ThenBlock, CurFn);
10688 if (Error Err = ThenGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10689 return Err;
10690 emitBranch(ContBlock);
10691 // Emit the 'else' code if present.
10692 // There is no need to emit line number for unconditional branch.
10693 emitBlock(ElseBlock, CurFn);
10694 if (Error Err = ElseGen(AllocaIP, Builder.saveIP(), DeallocBlocks))
10695 return Err;
10696 // There is no need to emit line number for unconditional branch.
10697 emitBranch(ContBlock);
10698 // Emit the continuation block for code after the if.
10699 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10700 return Error::success();
10701}
10702
// Decides, from the atomic kind AK and the ordering AO, whether a flush has to
// follow an atomic operation; emits it via emitFlush(Loc) when needed and
// returns whether a flush was emitted.
//
// NOTE(review): several lines of this definition are absent from this listing
// (listing numbers 10705-10706, 10710, 10714-10715, 10723-10724, 10731, 10735
// and 10739-10741), including the conditions and case labels that select each
// branch and the declaration of FlushAO. The comments below describe only what
// the visible lines show.
10703bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10704 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10707 "Unexpected Atomic Ordering.");
10708
10709 bool Flush = false;
10711
10712 switch (AK) {
10713 case Read:
       // Visible effect for Read: acquire flush (guarding condition not shown).
10716 FlushAO = AtomicOrdering::Acquire;
10717 Flush = true;
10718 }
10719 break;
10720 case Write:
10721 case Compare:
10722 case Update:
       // Visible effect for Write/Compare/Update: release flush (guarding
       // condition not shown).
10725 FlushAO = AtomicOrdering::Release;
10726 Flush = true;
10727 }
10728 break;
10729 case Capture:
       // Capture picks the flush ordering per AO value (case labels not shown).
10730 switch (AO) {
10732 FlushAO = AtomicOrdering::Acquire;
10733 Flush = true;
10734 break;
10736 FlushAO = AtomicOrdering::Release;
10737 Flush = true;
10738 break;
10742 Flush = true;
10743 break;
10744 default:
10745 // do nothing - leave silently.
10746 break;
10747 }
10748 }
10749
10750 if (Flush) {
10751 // Currently Flush RT call still doesn't take memory_ordering, so for when
10752 // that happens, this tries to do the resolution of which atomic ordering
10753 // to use with but issue the flush call
10754 // TODO: pass `FlushAO` after memory ordering support is added
       // FlushAO is computed above but intentionally unused for now.
10755 (void)FlushAO;
10756 emitFlush(Loc);
10757 }
10758
10759 // for AO == AtomicOrdering::Monotonic and all other case combinations
10760 // do nothing
10761 return Flush;
10762}
10763
10767 AtomicOrdering AO, InsertPointTy AllocaIP) {
      // Emits an atomic load of X with ordering AO and stores the loaded value
      // to V.Var, followed by a flush if checkAndEmitFlushAfterAtomic() decides
      // one is needed. Returns the builder's insertion point after the emitted
      // code, or Loc.IP when updateToLocation(Loc) fails.
      // NOTE(review): the beginning of this definition's signature is not
      // visible in this listing.
10768 if (!updateToLocation(Loc))
10769 return Loc.IP;
10770
10771 assert(X.Var->getType()->isPointerTy() &&
10772 "OMP Atomic expects a pointer to target memory");
10773 Type *XElemTy = X.ElemTy;
10774 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10775 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10776 "OMP atomic read expected a scalar type");
10777
10778 Value *XRead = nullptr;
10779
10780 if (XElemTy->isIntegerTy()) {
      // Integers are loaded directly with an atomic load.
10781 LoadInst *XLD =
10782 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10783 XLD->setAtomic(AO);
10784 XRead = cast<Value>(XLD);
10785 } else if (XElemTy->isStructTy()) {
10786 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10787 // target does not support `atomicrmw` of the size of the struct
      // OldVal is a temporary load: it only supplies the module (for the data
      // layout) and the alignment handed to AtomicInfo, and is erased once the
      // libcall-based load has been emitted.
10788 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10789 OldVal->setAtomic(AO);
10790 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10791 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10792 OpenMPIRBuilder::AtomicInfo atomicInfo(
10793 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10794 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10795 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10796 XRead = AtomicLoadRes.first;
10797 OldVal->eraseFromParent();
10798 } else {
10799 // We need to perform atomic op as integer
      // The value is loaded as an integer of XElemTy's scalar bit width and
      // then converted back: bitcast for floating point, inttoptr for pointers.
10800 IntegerType *IntCastTy =
10801 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10802 LoadInst *XLoad =
10803 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10804 XLoad->setAtomic(AO);
10805 if (XElemTy->isFloatingPointTy()) {
10806 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10807 } else {
10808 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10809 }
10810 }
      // The optional flush is emitted before the captured value is stored to V.
10811 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10812 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10813 return Builder.saveIP();
10814}
10815
10818 AtomicOpValue &X, Value *Expr,
10819 AtomicOrdering AO, InsertPointTy AllocaIP) {
      // Emits an atomic store of Expr to X with ordering AO, followed by a flush
      // if checkAndEmitFlushAfterAtomic() decides one is needed. Returns the
      // builder's insertion point after the emitted code, or Loc.IP when
      // updateToLocation(Loc) fails.
      // NOTE(review): the beginning of this definition's signature is not
      // visible in this listing.
10820 if (!updateToLocation(Loc))
10821 return Loc.IP;
10822
10823 assert(X.Var->getType()->isPointerTy() &&
10824 "OMP Atomic expects a pointer to target memory");
10825 Type *XElemTy = X.ElemTy;
10826 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10827 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10828 "OMP atomic write expected a scalar type");
10829
10830 if (XElemTy->isIntegerTy()) {
      // Integers are stored directly with an atomic store.
10831 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10832 XSt->setAtomic(AO);
10833 } else if (XElemTy->isStructTy()) {
      // OldVal is a temporary load: it only supplies the module (for the data
      // layout) and the alignment handed to AtomicInfo, and is erased once the
      // libcall-based store has been emitted.
10834 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10835 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10836 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10837 OpenMPIRBuilder::AtomicInfo atomicInfo(
10838 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10839 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10840 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10841 OldVal->eraseFromParent();
10842 } else {
10843 // We need to bitcast and perform atomic op as integers
      // NOTE(review): the assert above also admits pointer element types, and
      // they reach this bitcast too; the atomic read path converts pointers
      // with inttoptr instead. Confirm whether pointers need ptrtoint here.
10844 IntegerType *IntCastTy =
10845 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10846 Value *ExprCast =
10847 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10848 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10849 XSt->setAtomic(AO);
10850 }
10851
10852 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10853 return Builder.saveIP();
10854}
10855
10858 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10859 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10860 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
      // Emits an atomic update of X by delegating to emitAtomicUpdate(), then
      // emits a flush if checkAndEmitFlushAfterAtomic() decides one is needed.
      // Returns the builder's insertion point after the emitted code, Loc.IP
      // when updateToLocation(Loc) fails, or the error produced by
      // emitAtomicUpdate().
      // NOTE(review): the beginning of this definition's signature is not
      // visible in this listing.
10861 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10862 if (!updateToLocation(Loc))
10863 return Loc.IP;
10864
      // These sanity checks are wrapped in LLVM_DEBUG.
10865 LLVM_DEBUG({
10866 Type *XTy = X.Var->getType();
10867 assert(XTy->isPointerTy() &&
10868 "OMP Atomic expects a pointer to target memory");
10869 Type *XElemTy = X.ElemTy;
10870 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10871 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10872 "OMP atomic update expected a scalar or struct type");
10873 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10874 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10875 "OpenMP atomic does not support LT or GT operations");
10876 });
10877
10878 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10879 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10880 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10881 if (!AtomicResult)
10882 return AtomicResult.takeError();
10883 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10884 return Builder.saveIP();
10885}
10886
10887// FIXME: Duplicating AtomicExpand
10888Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10889 AtomicRMWInst::BinOp RMWOp) {
10890 switch (RMWOp) {
10891 case AtomicRMWInst::Add:
10892 return Builder.CreateAdd(Src1, Src2);
10893 case AtomicRMWInst::Sub:
10894 return Builder.CreateSub(Src1, Src2);
10895 case AtomicRMWInst::And:
10896 return Builder.CreateAnd(Src1, Src2);
10898 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10899 case AtomicRMWInst::Or:
10900 return Builder.CreateOr(Src1, Src2);
10901 case AtomicRMWInst::Xor:
10902 return Builder.CreateXor(Src1, Src2);
10907 case AtomicRMWInst::Max:
10908 case AtomicRMWInst::Min:
10921 llvm_unreachable("Unsupported atomic update operation");
10922 }
10923 llvm_unreachable("Unsupported atomic update operation");
10924}
10925
      // Maps the requested ordering AO to an ordering usable on the initial
      // load: AcquireRelease becomes Acquire, and every ordering not handled
      // below is returned unchanged.
      // NOTE(review): this definition's header and the statement executed for
      // AtomicOrdering::Release are not visible in this listing.
10927 // Loads cannot use Release or AcquireRelease ordering. This load is
10928 // just the initial value for the cmpxchg loop; the cmpxchg itself
10929 // retains the original ordering.
10930 AtomicOrdering LoadAO = AO;
10931
10932 if (AO == AtomicOrdering::Release) {
10934 } else if (AO == AtomicOrdering::AcquireRelease) {
10935 LoadAO = AtomicOrdering::Acquire;
10936 }
10937
10938 return LoadAO;
10939}
10940
/// Emit the code that atomically updates the memory at \p X.
///
/// One of three strategies is chosen:
///  - a single `atomicrmw` when \p RMWOp is one of the operations accepted by
///    the switch below and \p XElemTy is an integer type;
///  - for struct types, a load / compare-exchange loop built from the
///    AtomicInfo libcall helpers;
///  - otherwise a `cmpxchg` loop that operates on an integer of the same scalar
///    bit width as \p XElemTy.
///
/// In the two loop strategies the new value is obtained by calling
/// \p UpdateOp with the current value.
///
/// \returns a pair whose first element is the value of X before the update
///          and whose second element is the updated value, or the error
///          returned by \p UpdateOp.
10941Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10942 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10944 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10945 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10946 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2.
      // NOTE(review): some case labels of this switch, and the declarations of
      // LoadAO and Failure used further down, are not visible in this listing.
10947 bool emitRMWOp = false;
10948 switch (RMWOp) {
10949 case AtomicRMWInst::Add:
10950 case AtomicRMWInst::And:
10952 case AtomicRMWInst::Or:
10953 case AtomicRMWInst::Xor:
      // The pointer converts to true when XElemTy is non-null.
10955 emitRMWOp = XElemTy;
10956 break;
10957 case AtomicRMWInst::Sub:
      // Sub additionally requires IsXBinopExpr to be set.
10958 emitRMWOp = (IsXBinopExpr && XElemTy);
10959 break;
10960 default:
10961 emitRMWOp = false;
10962 }
      // A direct atomicrmw is only used for integer element types.
10963 emitRMWOp &= XElemTy->isIntegerTy();
10964
10965 std::pair<Value *, Value *> Res;
10966 if (emitRMWOp) {
10967 AtomicRMWInst *RMWInst =
10968 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
      // On AMDGPU the caller's memory hints are attached as metadata.
10969 if (T.isAMDGPU()) {
10970 if (IsIgnoreDenormalMode)
10971 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10972 llvm::MDNode::get(Builder.getContext(), {}));
10973 if (!IsFineGrainedMemory)
10974 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10975 llvm::MDNode::get(Builder.getContext(), {}));
10976 if (!IsRemoteMemory)
10977 RMWInst->setMetadata("amdgpu.no.remote.memory",
10978 llvm::MDNode::get(Builder.getContext(), {}));
10979 }
10980 Res.first = RMWInst;
10981 // not needed except in case of postfix captures. Generate anyway for
10982 // consistency with the else part. Will be removed with any DCE pass.
10983 // AtomicRMWInst::Xchg does not have a corresponding instruction.
10984 if (RMWOp == AtomicRMWInst::Xchg)
10985 Res.second = Res.first;
10986 else
10987 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10988 } else if (XElemTy->isStructTy()) {
      // OldVal is a temporary load: it supplies the module, alignment and type
      // used below and is erased at the end of this branch.
10989 LoadInst *OldVal =
10990 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10992 OldVal->setAtomic(LoadAO);
10993 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10994 unsigned LoadSize = LoadDL.getTypeStoreSize(XElemTy);
10995
10996 OpenMPIRBuilder::AtomicInfo atomicInfo(
10997 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10998 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10999 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
      // Split the current block into CurBB -> ContBB (the retry loop) -> ExitBB.
      // If CurBB has no terminator yet, a temporary unreachable is created so
      // that it can be split.
11000 BasicBlock *CurBB = Builder.GetInsertBlock();
11001 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11002 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11003 BasicBlock *ExitBB =
11004 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
11005 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
11006 X->getName() + ".atomic.cont");
11007 ContBB->getTerminator()->eraseFromParent();
      // The temporary holding the updated value is allocated at AllocaIP.
11008 Builder.restoreIP(AllocaIP);
11009 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
11010 NewAtomicAddr->setName(X->getName() + "x.new.val");
11011 Builder.SetInsertPoint(ContBB);
      // The PHI carries the current value of X: the initial load on entry, and
      // the value reloaded after the compare-exchange on each further iteration.
11012 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
11013 PHI->addIncoming(AtomicLoadRes.first, CurBB);
11014 Value *OldExprVal = PHI;
11015 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
11016 if (!CBResult)
11017 return CBResult.takeError();
11018 Value *Upd = *CBResult;
11019 Builder.CreateStore(Upd, NewAtomicAddr);
11022 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
11023 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
11024 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
11025 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
      // Leave the loop when the exchange succeeded, otherwise retry.
11026 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
11027 OldVal->eraseFromParent();
11028 Res.first = OldExprVal;
11029 Res.second = Upd;
11030
      // Set the insertion point in the exit block.
      // NOTE(review): the condition of this `if` is not visible in this listing.
11031 if (UnreachableInst *ExitTI =
11033 CurBBTI->eraseFromParent();
11034 Builder.SetInsertPoint(ExitBB);
11035 } else {
11036 Builder.SetInsertPoint(ExitTI);
11037 }
11038 } else {
      // The cmpxchg loop works on an integer of XElemTy's scalar bit width.
11039 IntegerType *IntCastTy =
11040 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
11041 LoadInst *OldVal =
11042 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
11044 OldVal->setAtomic(LoadAO);
11045 // CurBB
11046 // | /---\
11047 // ContBB |
11048 // | \---/
11049 // ExitBB
11050 BasicBlock *CurBB = Builder.GetInsertBlock();
11051 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11052 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11053 BasicBlock *ExitBB =
11054 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
11055 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
11056 X->getName() + ".atomic.cont");
11057 ContBB->getTerminator()->eraseFromParent();
11058 Builder.restoreIP(AllocaIP);
11059 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
11060 NewAtomicAddr->setName(X->getName() + "x.new.val");
11061 Builder.SetInsertPoint(ContBB);
11062 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
11063 PHI->addIncoming(OldVal, CurBB);
11064 bool IsIntTy = XElemTy->isIntegerTy();
11065 Value *OldExprVal = PHI;
      // UpdateOp receives the value in its original type, so the integer PHI is
      // converted back for floating-point and pointer element types.
11066 if (!IsIntTy) {
11067 if (XElemTy->isFloatingPointTy()) {
11068 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
11069 X->getName() + ".atomic.fltCast");
11070 } else {
11071 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
11072 X->getName() + ".atomic.ptrCast");
11073 }
11074 }
11075
11076 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
11077 if (!CBResult)
11078 return CBResult.takeError();
11079 Value *Upd = *CBResult;
      // The updated value is stored to the temporary and reloaded as an integer,
      // which reinterprets it for the cmpxchg without an explicit cast.
11080 Builder.CreateStore(Upd, NewAtomicAddr);
11081 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
11084 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
11085 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
11086 Result->setVolatile(VolatileX);
11087 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11088 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11089 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
11090 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
11091
11092 Res.first = OldExprVal;
11093 Res.second = Upd;
11094
11095 // set Insertion point in exit block
11096 if (UnreachableInst *ExitTI =
11098 CurBBTI->eraseFromParent();
11099 Builder.SetInsertPoint(ExitBB);
11100 } else {
11101 Builder.SetInsertPoint(ExitTI);
11102 }
11103 }
11104
11105 return Res;
11106}
11107
11110 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
11111 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
11112 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
11113 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
      // Emits an atomic update of X through emitAtomicUpdate() and stores the
      // captured value to V.Var: the value before the update when
      // IsPostfixUpdate is set, the updated value otherwise. A flush follows if
      // checkAndEmitFlushAfterAtomic() decides one is needed. Returns the
      // builder's insertion point after the emitted code, Loc.IP when
      // updateToLocation(Loc) fails, or the error from emitAtomicUpdate().
      // NOTE(review): the beginning of this definition's signature is not
      // visible in this listing.
11114 if (!updateToLocation(Loc))
11115 return Loc.IP;
11116
      // These sanity checks are wrapped in LLVM_DEBUG.
11117 LLVM_DEBUG({
11118 Type *XTy = X.Var->getType();
11119 assert(XTy->isPointerTy() &&
11120 "OMP Atomic expects a pointer to target memory");
11121 Type *XElemTy = X.ElemTy;
11122 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
11123 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
11124 "OMP atomic capture expected a scalar or struct type");
11125 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
11126 "OpenMP atomic does not support LT or GT operations");
11127 });
11128
11129 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
11130 // 'x' is simply atomically rewritten with 'expr'.
11131 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
11132 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
11133 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
11134 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
11135 if (!AtomicResult)
11136 return AtomicResult.takeError();
      // first is the value before the update, second the updated value.
11137 Value *CapturedVal =
11138 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
11139 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
11140
11141 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
11142 return Builder.saveIP();
11143}
11144
11148 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11149 bool IsFailOnly) {
      // Convenience overload: forwards every argument to the createAtomicCompare
      // overload that also takes an explicit failure ordering.
      // NOTE(review): the beginning of this signature and the line that
      // computes `Failure` are not visible in this listing.
11150
11152 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
11153 IsPostfixUpdate, IsFailOnly, Failure);
11154}
11155
11159 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
11160 bool IsFailOnly, AtomicOrdering Failure) {
      // Emits an OpenMP atomic compare on X.
      //  - Op == EQ: a cmpxchg that replaces X with D when X equals E. The old
      //    value can be captured into V.Var and the success flag stored to
      //    R.Var. When E is not an integer and HandleFPNegZero is set, NaN and
      //    signed-zero operands are handled in dedicated blocks.
      //  - Otherwise Op must be MAX or MIN: a single atomicrmw with E, with an
      //    optional capture into V.Var.
      // A flush follows if checkAndEmitFlushAfterAtomic() decides one is
      // needed. Returns the builder's insertion point after the emitted code,
      // or Loc.IP when updateToLocation(Loc) fails.
      // NOTE(review): the beginning of this signature and the declaration of
      // HandleFPNegZero are not visible in this listing.
11161
11162 if (!updateToLocation(Loc))
11163 return Loc.IP;
11164
11165 assert(X.Var->getType()->isPointerTy() &&
11166 "OMP atomic expects a pointer to target memory");
11167 // compare capture
11168 if (V.Var) {
11169 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
11170 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
11171 }
11172
11173 bool IsInteger = E->getType()->isIntegerTy();
11174
11175 if (Op == OMPAtomicCompareOp::EQ) {
11176 // OldValue and SuccessOrFail are set below and used in the shared V.Var /
11177 // R.Var handling.
11178 Value *OldValue = nullptr;
11179 Value *SuccessOrFail = nullptr;
11180
11181 if (!IsInteger && HandleFPNegZero) {
11182 // IEEE 754 special cases for cmpxchg (which is bitwise):
11183 // 1. -0.0 == +0.0 but they have different bit patterns.
11184 // 2. NaN != NaN but identical NaN bit patterns would match.
11185 //
11186 // CurBB:
11187 // %e_int = bitcast E to intN
11188 // %d_int = bitcast D to intN
11189 // %x_curr = load atomic intN, X
11190 // %x_fp = bitcast %x_curr to FP
11191 // %e_is_nan = fcmp uno E, E
11192 // %x_is_nan = fcmp uno %x_fp, %x_fp
11193 // %either_nan = or %e_is_nan, %x_is_nan
11194 // br %either_nan, NaNBB, NotNaNBB
11195 // NaNBB: ; NaN == anything is always false
11196 // br ExitBB
11197 // NotNaNBB:
11198 // %x_is_zero = fcmp oeq %x_fp, 0.0
11199 // %e_is_zero = fcmp oeq E, 0.0
11200 // %both_zero = and %x_is_zero, %e_is_zero
11201 // br %both_zero, ZeroBB, NormalBB
11202 // ZeroBB: ; both ±0.0 → x = d
11203 // cmpxchg X, %x_curr, %d_int
11204 // br ExitBB
11205 // NormalBB: ; original path
11206 // cmpxchg X, %e_int, %d_int
11207 // br ExitBB
11208 // ExitBB:
11209 // phi merge
11210 IntegerType *IntCastTy =
11211 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
11212 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
11213 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
11214
11215 // Load X atomically.
11216 LoadInst *XCurr = Builder.CreateLoad(IntCastTy, X.Var,
11217 X.Var->getName() + ".atomic.load");
11219 Value *XFP = Builder.CreateBitCast(XCurr, X.ElemTy);
11220
11221 // IEEE 754: NaN != NaN, but cmpxchg would succeed if E and X have
11222 // the same NaN bit pattern. Skip cmpxchg when either is NaN.
11223 Value *EIsNaN = Builder.CreateFCmpUNO(E, E, "atomic.e.isnan");
11224 Value *XIsNaN = Builder.CreateFCmpUNO(XFP, XFP, "atomic.x.isnan");
11225 Value *EitherNaN = Builder.CreateOr(EIsNaN, XIsNaN, "atomic.either.nan");
11226
      // Split off ExitBB at the current terminator (a temporary unreachable is
      // created if there is none) and create the four intermediate blocks
      // before it.
      // NOTE(review): the lines that begin the NaNBB and ZeroBB declarations
      // are not visible in this listing.
11227 BasicBlock *CurBB = Builder.GetInsertBlock();
11228 Function *F = CurBB->getParent();
11229 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11230 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11231 BasicBlock *ExitBB =
11232 CurBB->splitBasicBlock(CurBBTI, X.Var->getName() + ".atomic.exit");
11234 M.getContext(), X.Var->getName() + ".atomic.nan", F, ExitBB);
11235 BasicBlock *NotNaNBB = BasicBlock::Create(
11236 M.getContext(), X.Var->getName() + ".atomic.notnan", F, ExitBB);
11238 M.getContext(), X.Var->getName() + ".atomic.zero", F, ExitBB);
11239 BasicBlock *NormalBB = BasicBlock::Create(
11240 M.getContext(), X.Var->getName() + ".atomic.normal", F, ExitBB);
11241
11242 // If either E or X is NaN → NaNBB (always fails), else check for ±0.0.
11243 CurBB->getTerminator()->eraseFromParent();
11244 Builder.SetInsertPoint(CurBB);
11245 Builder.CreateCondBr(EitherNaN, NaNBB, NotNaNBB);
11246
11247 // NaNBB: NaN == anything is always false; skip cmpxchg.
11248 Builder.SetInsertPoint(NaNBB);
11249 Builder.CreateBr(ExitBB);
11250
11251 // NotNaNBB: check both X and E for ±0.0.
11252 Builder.SetInsertPoint(NotNaNBB);
11253 Value *XIsZero =
11254 Builder.CreateFCmpOEQ(XFP, ConstantFP::getZero(X.ElemTy),
11255 X.Var->getName() + ".atomic.xiszero");
11256 Value *EIsZero = Builder.CreateFCmpOEQ(E, ConstantFP::getZero(X.ElemTy),
11257 "atomic.e.iszero");
11258 Value *BothZero = Builder.CreateAnd(XIsZero, EIsZero, "atomic.both.zero");
11259 Builder.CreateCondBr(BothZero, ZeroBB, NormalBB);
11260
11261 // ZeroBB: cmpxchg with X's loaded bit-pattern.
11262 Builder.SetInsertPoint(ZeroBB);
11263 AtomicCmpXchgInst *ResZero = Builder.CreateAtomicCmpXchg(
11264 X.Var, XCurr, DBCast, MaybeAlign(), AO, Failure);
11265 Value *OldZero = Builder.CreateExtractValue(ResZero, /*Idxs=*/0);
11266 Value *OkZero = Builder.CreateExtractValue(ResZero, /*Idxs=*/1);
11267 Builder.CreateBr(ExitBB);
11268
11269 // NormalBB: original bitwise cmpxchg.
11270 Builder.SetInsertPoint(NormalBB);
11271 AtomicCmpXchgInst *ResNormal = Builder.CreateAtomicCmpXchg(
11272 X.Var, EBCast, DBCast, MaybeAlign(), AO, Failure);
11273 Value *OldNormal = Builder.CreateExtractValue(ResNormal, /*Idxs=*/0);
11274 Value *OkNormal = Builder.CreateExtractValue(ResNormal, /*Idxs=*/1);
11275 Builder.CreateBr(ExitBB);
11276
11277 // ExitBB: merge results from NaN, Zero, and Normal paths.
11278 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11279 PHINode *OldIntPHI =
11280 Builder.CreatePHI(IntCastTy, 3, X.Var->getName() + ".atomic.old");
11281 OldIntPHI->addIncoming(XCurr, NaNBB);
11282 OldIntPHI->addIncoming(OldZero, ZeroBB);
11283 OldIntPHI->addIncoming(OldNormal, NormalBB);
11284 PHINode *SuccessPHI = Builder.CreatePHI(Builder.getInt1Ty(), 3,
11285 X.Var->getName() + ".atomic.ok");
11286 SuccessPHI->addIncoming(Builder.getFalse(), NaNBB);
11287 SuccessPHI->addIncoming(OkZero, ZeroBB);
11288 SuccessPHI->addIncoming(OkNormal, NormalBB);
11289
      // If ExitBB ends in the temporary unreachable, drop it and append to the
      // block; otherwise continue right after the PHIs.
11290 if (isa<UnreachableInst>(ExitBB->getTerminator())) {
11291 CurBBTI->eraseFromParent();
11292 Builder.SetInsertPoint(ExitBB);
11293 } else {
11294 Builder.SetInsertPoint(&*ExitBB->getFirstNonPHIIt());
11295 }
11296
11297 OldValue = Builder.CreateBitCast(OldIntPHI, X.ElemTy,
11298 X.Var->getName() + ".atomic.old.fp");
11299 SuccessOrFail = SuccessPHI;
11300 } else {
      // Plain path: a single cmpxchg, on integer bit patterns when E is not an
      // integer.
11301 AtomicCmpXchgInst *Result = nullptr;
11302 if (!IsInteger) {
11303 IntegerType *IntCastTy =
11304 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
11305 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
11306 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
11307 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast,
11308 MaybeAlign(), AO, Failure);
11309 } else {
11310 Result =
11311 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
11312 }
11313
11314 if (V.Var) {
11315 OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
11316 if (!IsInteger)
11317 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
11318 assert(OldValue->getType() == V.ElemTy &&
11319 "OldValue and V must be of same type");
11320 if (IsPostfixUpdate) {
11321 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
11322 } else {
11323 SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
11324 if (IsFailOnly) {
      // Store the old value to V only when the exchange failed: branch to
      // ExitBB on success, or through ContBB (which does the store) on failure.
11325 BasicBlock *CurBB = Builder.GetInsertBlock();
11326 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11327 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11328 BasicBlock *ExitBB = CurBB->splitBasicBlock(
11329 CurBBTI, X.Var->getName() + ".atomic.exit");
11330 BasicBlock *ContBB = CurBB->splitBasicBlock(
11331 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
11332 ContBB->getTerminator()->eraseFromParent();
11333 CurBB->getTerminator()->eraseFromParent();
11334
11335 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
11336
11337 Builder.SetInsertPoint(ContBB);
11338 Builder.CreateStore(OldValue, V.Var);
11339 Builder.CreateBr(ExitBB);
11340
      // NOTE(review): the condition of this `if` is not visible in this listing.
11341 if (UnreachableInst *ExitTI =
11343 CurBBTI->eraseFromParent();
11344 Builder.SetInsertPoint(ExitBB);
11345 } else {
11346 Builder.SetInsertPoint(ExitTI);
11347 }
11348 } else {
      // Capture E on success, the old value otherwise.
11349 Value *CapturedValue =
11350 Builder.CreateSelect(SuccessOrFail, E, OldValue);
11351 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11352 }
11353 }
11354 }
11355 // The comparison result has to be stored.
11356 if (R.Var) {
11357 assert(R.Var->getType()->isPointerTy() &&
11358 "r.var must be of pointer type");
11359 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
11360
      // The i1 success flag is widened to R's type, sign- or zero-extended
      // according to R.IsSigned.
11361 Value *SuccessFailureVal =
11362 Builder.CreateExtractValue(Result, /*Idxs=*/1);
11363 Value *ResultCast =
11364 R.IsSigned ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
11365 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
11366 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
11367 }
11368 }
11369
11370 // For the HandleFPNegZero path, handle V.Var and R.Var using the
11371 // pre-computed OldValue and SuccessOrFail.
11372 if (HandleFPNegZero && !IsInteger) {
11373 if (V.Var) {
11374 assert(OldValue->getType() == V.ElemTy &&
11375 "OldValue and V must be of same type");
11376 if (IsPostfixUpdate) {
11377 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
11378 } else {
11379 if (IsFailOnly) {
      // Same fail-only capture as in the plain path above.
11380 BasicBlock *CurBB = Builder.GetInsertBlock();
11381 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
11382 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
11383 BasicBlock *ExitBB = CurBB->splitBasicBlock(
11384 CurBBTI, X.Var->getName() + ".atomic.exit");
11385 BasicBlock *ContBB = CurBB->splitBasicBlock(
11386 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
11387 ContBB->getTerminator()->eraseFromParent();
11388 CurBB->getTerminator()->eraseFromParent();
11389
11390 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
11391
11392 Builder.SetInsertPoint(ContBB);
11393 Builder.CreateStore(OldValue, V.Var);
11394 Builder.CreateBr(ExitBB);
11395
11396 if (UnreachableInst *ExitTI =
11398 CurBBTI->eraseFromParent();
11399 Builder.SetInsertPoint(ExitBB);
11400 } else {
11401 Builder.SetInsertPoint(ExitTI);
11402 }
11403 } else {
11404 Value *CapturedValue =
11405 Builder.CreateSelect(SuccessOrFail, E, OldValue);
11406 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11407 }
11408 }
11409 }
11410 // The comparison result has to be stored.
11411 if (R.Var) {
11412 assert(R.Var->getType()->isPointerTy() &&
11413 "r.var must be of pointer type");
11414 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
11415
11416 Value *ResultCast = R.IsSigned
11417 ? Builder.CreateSExt(SuccessOrFail, R.ElemTy)
11418 : Builder.CreateZExt(SuccessOrFail, R.ElemTy);
11419 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
11420 }
11421 }
11422 } else {
11423 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
11424 "Op should be either max or min at this point");
11425 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
11426
11427 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
11428 // Let's take max as example.
11429 // OpenMP form:
11430 // x = x > expr ? expr : x;
11431 // LLVM form:
11432 // *ptr = *ptr > val ? *ptr : val;
11433 // We need to transform to LLVM form.
11434 // x = x <= expr ? x : expr;
      // NOTE(review): the declaration of NewOp and the second alternative of
      // each conditional expression below are not visible in this listing.
11436 if (IsXBinopExpr) {
11437 if (IsInteger) {
11438 if (X.IsSigned)
11439 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
11441 else
11442 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
11444 } else {
11445 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
11447 }
11448 } else {
11449 if (IsInteger) {
11450 if (X.IsSigned)
11451 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
11453 else
11454 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
11456 } else {
11457 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
11459 }
11460 }
11461
      // The atomicrmw yields the value X held before the operation.
11462 AtomicRMWInst *OldValue =
11463 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
11464 if (V.Var) {
11465 Value *CapturedValue = nullptr;
11466 if (IsPostfixUpdate) {
11467 CapturedValue = OldValue;
11468 } else {
      // The captured value is recomputed from the old value and E with a
      // compare and select.
      // NOTE(review): several case labels of this switch are not visible in
      // this listing. For Max (SGT) the select below yields E when
      // OldValue > E, i.e. the smaller of the two, and for Min (SLT) the
      // larger; confirm this is the value intended to be captured.
11469 CmpInst::Predicate Pred;
11470 switch (NewOp) {
11471 case AtomicRMWInst::Max:
11472 Pred = CmpInst::ICMP_SGT;
11473 break;
11475 Pred = CmpInst::ICMP_UGT;
11476 break;
11478 Pred = CmpInst::FCMP_OGT;
11479 break;
11480 case AtomicRMWInst::Min:
11481 Pred = CmpInst::ICMP_SLT;
11482 break;
11484 Pred = CmpInst::ICMP_ULT;
11485 break;
11487 Pred = CmpInst::FCMP_OLT;
11488 break;
11489 default:
11490 llvm_unreachable("unexpected comparison op");
11491 }
11492 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
11493 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
11494 }
11495 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
11496 }
11497 }
11498
11499 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
11500
11501 return Builder.saveIP();
11502}
11503
11506 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
11507 Value *NumTeamsUpper, Value *ThreadLimit,
11508 Value *IfExpr) {
      // Builds the block structure for a teams region, lets BodyGenCB generate
      // the body, and registers the region for outlining. When not compiling
      // for a target device, a call to __kmpc_push_num_teams_51 is emitted if
      // any of the num_teams / thread_limit / if values is present, and after
      // outlining the stale call to the outlined function is replaced by a
      // runtime call using __kmpc_fork_teams. Returns an insertion point in the
      // exit block, an empty InsertPointTy when updateToLocation(Loc) fails, or
      // the error returned by BodyGenCB.
      // NOTE(review): the beginning of this definition's signature is not
      // visible in this listing.
11509 if (!updateToLocation(Loc))
11510 return InsertPointTy();
11511
11512 uint32_t SrcLocStrSize;
11513 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
11514 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
11515 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
11516
11517 // Outer allocation basicblock is the entry block of the current function.
      // If code is currently being inserted into that entry block, split it
      // first so the region does not start inside the allocation block.
11518 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
11519 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
11520 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
11521 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11522 }
11523
11524 // The current basic block is split into four basic blocks. After outlining,
11525 // they will be mapped as follows:
11526 // ```
11527 // def current_fn() {
11528 // current_basic_block:
11529 // br label %teams.exit
11530 // teams.exit:
11531 // ; instructions after teams
11532 // }
11533 //
11534 // def outlined_fn() {
11535 // teams.alloca:
11536 // br label %teams.body
11537 // teams.body:
11538 // ; instructions within teams body
11539 // }
11540 // ```
11541 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
11542 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
11543 BasicBlock *AllocaBB =
11544 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
11545
11546 bool SubClausesPresent =
11547 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
11548 // Push num_teams
11549 if (!Config.isTargetDevice() && SubClausesPresent) {
11550 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
11551 "if lowerbound is non-null, then upperbound must also be non-null "
11552 "for bounds on num_teams");
11553
      // A missing upper bound defaults to 0, and a missing lower bound to the
      // upper bound.
11554 if (NumTeamsUpper == nullptr)
11555 NumTeamsUpper = Builder.getInt32(0);
11556
11557 if (NumTeamsLower == nullptr)
11558 NumTeamsLower = NumTeamsUpper;
11559
11560 if (IfExpr) {
11561 assert(IfExpr->getType()->isIntegerTy() &&
11562 "argument to if clause must be an integer value");
11563
11564 // upper = ifexpr ? upper : 1
      // A non-i1 condition is first converted to i1 by comparing it against 0.
11565 if (IfExpr->getType() != Int1)
11566 IfExpr = Builder.CreateICmpNE(IfExpr,
11567 ConstantInt::get(IfExpr->getType(), 0));
11568 NumTeamsUpper = Builder.CreateSelect(
11569 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
11570
11571 // lower = ifexpr ? lower : 1
11572 NumTeamsLower = Builder.CreateSelect(
11573 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
11574 }
11575
11576 if (ThreadLimit == nullptr)
11577 ThreadLimit = Builder.getInt32(0);
11578
11579 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
11580 // truncate or sign extend the passed values to match the int32 parameters.
11581 Value *NumTeamsLowerInt32 =
11582 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
11583 Value *NumTeamsUpperInt32 =
11584 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
11585 Value *ThreadLimitInt32 =
11586 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
11587
11588 Value *ThreadNum = getOrCreateThreadID(Ident);
11589
      // NOTE(review): the line that begins this runtime call is not visible in
      // this listing.
11591 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
11592 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
11593 ThreadLimitInt32});
11594 }
11595 // Generate the body of teams.
11596 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11597 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11598 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11599 return Err;
11600
      // Describe the region to outline: it starts at AllocaBB and ends at
      // ExitBB.
11601 auto OI = std::make_unique<OutlineInfo>();
11602 OI->EntryBB = AllocaBB;
11603 OI->ExitBB = ExitBB;
11604 OI->OuterAllocBB = &OuterAllocaBB;
11605
11606 // Insert fake values for global tid and bound tid.
      // NOTE(review): the declaration of ToBeDeleted is not visible in this
      // listing.
11608 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11609 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11610 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11611 OI->ExcludeArgsFromAggregate.push_back(createFakeIntVal(
11612 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11613
11614 auto HostPostOutlineCB = [this, Ident,
11615 ToBeDeleted](Function &OutlinedFn) mutable {
11616 // The stale call instruction will be replaced with a new call instruction
11617 // for runtime call with the outlined function.
11618
11619 assert(OutlinedFn.hasOneUse() &&
11620 "there must be a single user for the outlined function");
11621 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11622 ToBeDeleted.push_back(StaleCI);
11623
11624 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11625 "Outlined function must have two or three arguments only");
11626
      // A third argument, when present, is named "data" and forwarded to the
      // runtime call below.
11627 bool HasShared = OutlinedFn.arg_size() == 3;
11628
11629 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11630 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11631 if (HasShared)
11632 OutlinedFn.getArg(2)->setName("data");
11633
11634 // Call to the runtime function for teams in the current function.
11635 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11636 "outlined function.");
11637 Builder.SetInsertPoint(StaleCI);
      // Arguments: the ident, the stale call's argument count minus the two
      // tid arguments, and the outlined function itself.
11638 SmallVector<Value *> Args = {
11639 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11640 if (HasShared)
11641 Args.push_back(StaleCI->getArgOperand(2));
11644 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11645 Args);
11646
      // Erase the collected instructions (including the stale call) in reverse
      // order of insertion.
11647 for (Instruction *I : llvm::reverse(ToBeDeleted))
11648 I->eraseFromParent();
11649 };
11650
      // The post-outline callback is only installed when not compiling for a
      // target device.
11651 if (!Config.isTargetDevice())
11652 OI->PostOutlineCB = HostPostOutlineCB;
11653
11654 addOutlineInfo(std::move(OI));
11655
11656 Builder.SetInsertPoint(ExitBB);
11657
11658 return Builder.saveIP();
11659}
11660
11662 const LocationDescription &Loc, InsertPointTy OuterAllocIP,
11663 ArrayRef<BasicBlock *> OuterDeallocBlocks, BodyGenCallbackTy BodyGenCB) {
11664 if (!updateToLocation(Loc))
11665 return InsertPointTy();
11666
11667 BasicBlock *OuterAllocaBB = OuterAllocIP.getBlock();
11668
11669 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11670 BasicBlock *BodyBB =
11671 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11672 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11673 }
11674 BasicBlock *ExitBB =
11675 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11676 BasicBlock *BodyBB =
11677 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11678 BasicBlock *AllocaBB =
11679 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11680
11681 // Generate the body of distribute clause
11682 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11683 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11684 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP, ExitBB))
11685 return Err;
11686
11687 // When using target we use different runtime functions which require a
11688 // callback.
11689 if (Config.isTargetDevice()) {
11690 auto OI = std::make_unique<OutlineInfo>();
11691 OI->OuterAllocBB = OuterAllocIP.getBlock();
11692 OI->EntryBB = AllocaBB;
11693 OI->ExitBB = ExitBB;
11694 OI->OuterDeallocBBs.reserve(OuterDeallocBlocks.size());
11695 copy(OuterDeallocBlocks, OI->OuterDeallocBBs.end());
11696
11697 addOutlineInfo(std::move(OI));
11698 }
11699 Builder.SetInsertPoint(ExitBB);
11700
11701 return Builder.saveIP();
11702}
11703
11706 std::string VarName) {
11707 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11709 Names.size()),
11710 Names);
11711 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11712 M, MapNamesArrayInit->getType(),
11713 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11714 VarName);
11715 return MapNamesArrayGlobal;
11716}
11717
11718// Create all simple and struct types exposed by the runtime and remember
11719// the llvm::PointerTypes of them for easy access later.
//
// The types themselves are not listed here: this function defines the
// OMP_TYPE / OMP_ARRAY_TYPE / OMP_FUNCTION_TYPE / OMP_STRUCT_TYPE macros as
// assignments and then includes OMPKinds.def, which is expected to invoke
// them once per runtime type.
11720void OpenMPIRBuilder::initializeTypes(Module &M) {
11721 LLVMContext &Ctx = M.getContext();
  // Scratch variable written by every OMP_STRUCT_TYPE expansion.
11722 StructType *T;
  // Address space used for the pointer types paired with array and struct
  // types.
11723 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
  // Address space used for the pointer types paired with function types.
11724 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
  // Simple type: assign the initializer expression to the named variable.
11725#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
  // Array type: set <VarName>Ty to the array type and <VarName>PtrTy to a
  // pointer type in DefaultTargetAS.
11726#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11727 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11728 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
  // Function type: set <VarName> to the function type and <VarName>Ptr to a
  // pointer type in ProgramAS.
11729#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11730 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11731 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
  // Struct type: reuse a struct of that name if the context already has one,
  // otherwise create it; set <VarName>Ptr to a pointer type in
  // DefaultTargetAS.
11732#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11733 T = StructType::getTypeByName(Ctx, StructName); \
11734 if (!T) \
11735 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11736 VarName = T; \
11737 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
11738#include "llvm/Frontend/OpenMP/OMPKinds.def"
11739}
11740
11743 SmallVectorImpl<BasicBlock *> &BlockVector) {
11745 BlockSet.insert(EntryBB);
11746 BlockSet.insert(ExitBB);
11747
11748 Worklist.push_back(EntryBB);
11749 while (!Worklist.empty()) {
11750 BasicBlock *BB = Worklist.pop_back_val();
11751 BlockVector.push_back(BB);
11752 for (BasicBlock *SuccBB : successors(BB))
11753 if (BlockSet.insert(SuccBB).second)
11754 Worklist.push_back(SuccBB);
11755 }
11756}
11757
11758std::unique_ptr<CodeExtractor>
11760 bool ArgsInZeroAddressSpace,
11761 Twine Suffix) {
11762 return std::make_unique<CodeExtractor>(
11763 Blocks, /* DominatorTree */ nullptr,
11764 /* AggregateArgs */ true,
11765 /* BlockFrequencyInfo */ nullptr,
11766 /* BranchProbabilityInfo */ nullptr,
11767 /* AssumptionCache */ nullptr,
11768 /* AllowVarArgs */ true,
11769 /* AllowAlloca */ true,
11770 /* AllocationBlock*/ OuterAllocBB,
11771 /* DeallocationBlocks */ ArrayRef<BasicBlock *>(),
11772 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11773}
11774
11775std::unique_ptr<CodeExtractor> DeviceSharedMemOutlineInfo::createCodeExtractor(
11776 ArrayRef<BasicBlock *> Blocks, bool ArgsInZeroAddressSpace, Twine Suffix) {
11777 return std::make_unique<DeviceSharedMemCodeExtractor>(
11778 OMPBuilder, Blocks, /* DominatorTree */ nullptr,
11779 /* AggregateArgs */ true,
11780 /* BlockFrequencyInfo */ nullptr,
11781 /* BranchProbabilityInfo */ nullptr,
11782 /* AssumptionCache */ nullptr,
11783 /* AllowVarArgs */ true,
11784 /* AllowAlloca */ true,
11785 /* AllocationBlock*/ OuterAllocBB,
11786 /* DeallocationBlocks */ OuterDeallocBBs.empty()
11788 : OuterDeallocBBs,
11789 /* Suffix */ Suffix.str(), ArgsInZeroAddressSpace);
11790}
11791
11793 uint64_t Size, int32_t Flags,
11795 StringRef Name) {
11796 if (!Config.isGPU()) {
11799 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11800 return;
11801 }
11802 // TODO: Add support for global variables on the device after declare target
11803 // support.
11804 Function *Fn = dyn_cast<Function>(Addr);
11805 if (!Fn)
11806 return;
11807
11808 // Add a function attribute for the kernel.
11809 Fn->addFnAttr("kernel");
11810 if (T.isAMDGCN())
11811 Fn->addFnAttr("uniform-work-group-size");
11812 Fn->addFnAttr(Attribute::MustProgress);
11813}
11814
11815// We only generate metadata for functions that contain target regions.
11818
11819 // If there are no entries, we don't need to do anything.
11820 if (OffloadInfoManager.empty())
11821 return;
11822
11823 LLVMContext &C = M.getContext();
11826 16>
11827 OrderedEntries(OffloadInfoManager.size());
11828
11829 // Auxiliary methods to create metadata values and strings.
11830 auto &&GetMDInt = [this](unsigned V) {
11831 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11832 };
11833
11834 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11835
11836 // Create the offloading info metadata node.
11837 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11838 auto &&TargetRegionMetadataEmitter =
11839 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11840 const TargetRegionEntryInfo &EntryInfo,
11842 // Generate metadata for target regions. Each entry of this metadata
11843 // contains:
11844 // - Entry 0 -> Kind of this type of metadata (0).
11845 // - Entry 1 -> Device ID of the file where the entry was identified.
11846 // - Entry 2 -> File ID of the file where the entry was identified.
11847 // - Entry 3 -> Mangled name of the function where the entry was
11848 // identified.
11849 // - Entry 4 -> Line in the file where the entry was identified.
11850 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11851 // - Entry 6 -> Order the entry was created.
11852 // The first element of the metadata node is the kind.
11853 Metadata *Ops[] = {
11854 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11855 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11856 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11857 GetMDInt(E.getOrder())};
11858
11859 // Save this entry in the right position of the ordered entries array.
11860 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11861
11862 // Add metadata to the named metadata node.
11863 MD->addOperand(MDNode::get(C, Ops));
11864 };
11865
11866 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11867
11868 // Create a function that emits metadata for each device global variable entry.
11869 auto &&DeviceGlobalVarMetadataEmitter =
11870 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11871 StringRef MangledName,
11873 // Generate metadata for global variables. Each entry of this metadata
11874 // contains:
11875 // - Entry 0 -> Kind of this type of metadata (1).
11876 // - Entry 1 -> Mangled name of the variable.
11877 // - Entry 2 -> Declare target kind.
11878 // - Entry 3 -> Order the entry was created.
11879 // The first element of the metadata node is the kind.
11880 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11881 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11882
11883 // Save this entry in the right position of the ordered entries array.
11884 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11885 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11886
11887 // Add metadata to the named metadata node.
11888 MD->addOperand(MDNode::get(C, Ops));
11889 };
11890
11891 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11892 DeviceGlobalVarMetadataEmitter);
11893
11894 for (const auto &E : OrderedEntries) {
11895 assert(E.first && "All ordered entries must exist!");
11896 if (const auto *CE =
11898 E.first)) {
11899 if (!CE->getID() || !CE->getAddress()) {
11900 // Do not blame the entry if the parent function is not emitted.
11901 TargetRegionEntryInfo EntryInfo = E.second;
11902 StringRef FnName = EntryInfo.ParentName;
11903 if (!M.getNamedValue(FnName))
11904 continue;
11905 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11906 continue;
11907 }
11908 createOffloadEntry(CE->getID(), CE->getAddress(),
11909 /*Size=*/0, CE->getFlags(),
11911 } else if (const auto *CE = dyn_cast<
11913 E.first)) {
11916 CE->getFlags());
11917 switch (Flags) {
11920 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11921 continue;
11922 if (!CE->getAddress()) {
11923 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11924 continue;
11925 }
11926 // The variable has no definition - no need to add the entry.
11927 if (CE->getVarSize() == 0)
11928 continue;
11929 break;
11931 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11932 (!Config.isTargetDevice() && CE->getAddress())) &&
11933 "Declaret target link address is set.");
11934 if (Config.isTargetDevice())
11935 continue;
11936 if (!CE->getAddress()) {
11938 continue;
11939 }
11940 break;
11943 if (!CE->getAddress()) {
11944 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11945 continue;
11946 }
11947 break;
11948 default:
11949 break;
11950 }
11951
11952 // Hidden or internal symbols on the device are not externally visible.
11953 // We should not attempt to register them by creating an offloading
11954 // entry. Indirect variables are handled separately on the device.
11955 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11956 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11957 (Flags !=
11959 Flags != OffloadEntriesInfoManager::
11960 OMPTargetGlobalVarEntryIndirectVTable))
11961 continue;
11962
11963 // Indirect globals need to use a special name that doesn't match the name
11964 // of the associated host global.
11966 Flags ==
11968 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11969 Flags, CE->getLinkage(), CE->getVarName());
11970 else
11971 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11972 Flags, CE->getLinkage());
11973
11974 } else {
11975 llvm_unreachable("Unsupported entry kind.");
11976 }
11977 }
11978
11979 // Emit requires directive globals to a special entry so the runtime can
11980 // register them when the device image is loaded.
11981 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11982 // entries should be redesigned to better suit this use-case.
11983 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11987 ".requires", /*Size=*/0,
11989 Config.getRequiresFlags());
11990}
11991
11994 unsigned FileID, unsigned Line, unsigned Count) {
11995 raw_svector_ostream OS(Name);
11996 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11997 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11998 if (Count)
11999 OS << "_" << Count;
12000}
12001
12003 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
12004 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
12006 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
12007 EntryInfo.Line, NewCount);
12008}
12009
12012 vfs::FileSystem &VFS,
12013 StringRef ParentName) {
12014 sys::fs::UniqueID ID(0xdeadf17e, 0);
12015 auto FileIDInfo = CallBack();
12016 uint64_t FileID = 0;
12017 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
12018 ID = Status->getUniqueID();
12019 FileID = Status->getUniqueID().getFile();
12020 } else {
12021 // If the inode ID could not be determined, create a hash value of
12022 // the current file name and use that as an ID.
12023 FileID = hash_value(std::get<0>(FileIDInfo));
12024 }
12025
12026 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
12027 std::get<1>(FileIDInfo));
12028}
12029
12031 unsigned Offset = 0;
12032 for (uint64_t Remain =
12033 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12035 !(Remain & 1); Remain = Remain >> 1)
12036 Offset++;
12037 return Offset;
12038}
12039
12042 // Shift left by getFlagMemberOffset() bits.
12043 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
12044 << getFlagMemberOffset());
12045}
12046
12049 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
12050 // If the entry is PTR_AND_OBJ but has not been marked with the special
12051 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
12052 // marked as MEMBER_OF.
12053 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12055 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12058 return;
12059
12060 // Entries with ATTACH are not members-of anything. They are handled
12061 // separately by the runtime after other maps have been handled.
12062 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
12064 return;
12065
12066 // Reset the placeholder value to prepare the flag for the assignment of the
12067 // proper MEMBER_OF value.
12068 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
12069 Flags |= MemberOfFlag;
12070}
12071
12075 bool IsDeclaration, bool IsExternallyVisible,
12076 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
12077 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
12078 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
12079 std::function<Constant *()> GlobalInitializer,
12080 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
12081 // TODO: convert this to utilise the IRBuilder Config rather than
12082 // a passed down argument.
12083 if (OpenMPSIMD)
12084 return nullptr;
12085
12088 CaptureClause ==
12090 Config.hasRequiresUnifiedSharedMemory())) {
12091 SmallString<64> PtrName;
12092 {
12093 raw_svector_ostream OS(PtrName);
12094 OS << MangledName;
12095 if (!IsExternallyVisible)
12096 OS << format("_%x", EntryInfo.FileID);
12097 OS << "_decl_tgt_ref_ptr";
12098 }
12099
12100 Value *Ptr = M.getNamedValue(PtrName);
12101
12102 if (!Ptr) {
12103 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
12104 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
12105
12106 auto *GV = cast<GlobalVariable>(Ptr);
12107 GV->setLinkage(GlobalValue::WeakAnyLinkage);
12108
12109 if (!Config.isTargetDevice()) {
12110 if (GlobalInitializer)
12111 GV->setInitializer(GlobalInitializer());
12112 else
12113 GV->setInitializer(GlobalValue);
12114 }
12115
12117 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
12118 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
12119 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
12120 }
12121
12122 return cast<Constant>(Ptr);
12123 }
12124
12125 return nullptr;
12126}
12127
12131 bool IsDeclaration, bool IsExternallyVisible,
12132 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
12133 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
12134 std::vector<Triple> TargetTriple,
12135 std::function<Constant *()> GlobalInitializer,
12136 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
12137 Constant *Addr) {
12139 (TargetTriple.empty() && !Config.isTargetDevice()))
12140 return;
12141
12143 StringRef VarName;
12144 int64_t VarSize;
12146
12148 CaptureClause ==
12150 !Config.hasRequiresUnifiedSharedMemory()) {
12152 VarName = MangledName;
12153 GlobalValue *LlvmVal = M.getNamedValue(VarName);
12154
12155 if (!IsDeclaration)
12156 VarSize = divideCeil(
12157 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
12158 else
12159 VarSize = 0;
12160 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
12161
12162 // This is a workaround carried over from Clang which prevents undesired
12163 // optimisation of internal variables.
12164 if (Config.isTargetDevice() &&
12165 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
12166 // Do not create a "ref-variable" if the original is not also available
12167 // on the host.
12168 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
12169 return;
12170
12171 std::string RefName = createPlatformSpecificName({VarName, "ref"});
12172
12173 if (!M.getNamedValue(RefName)) {
12174 Constant *AddrRef =
12175 getOrCreateInternalVariable(Addr->getType(), RefName);
12176 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
12177 GvAddrRef->setConstant(true);
12178 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
12179 GvAddrRef->setInitializer(Addr);
12180 GeneratedRefs.push_back(GvAddrRef);
12181 }
12182 }
12183 } else {
12186 else
12188
12189 if (Config.isTargetDevice()) {
12190 VarName = (Addr) ? Addr->getName() : "";
12191 Addr = nullptr;
12192 } else {
12194 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
12195 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
12196 LlvmPtrTy, GlobalInitializer, VariableLinkage);
12197 VarName = (Addr) ? Addr->getName() : "";
12198 }
12199 VarSize = M.getDataLayout().getPointerSize();
12201 }
12202
12203 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
12204 Flags, Linkage);
12205}
12206
12207/// Loads all the offload entries information from the host IR
12208/// metadata.
12210 // If we are in target mode, load the metadata from the host IR. This code has
12211 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
12212
12213 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
12214 if (!MD)
12215 return;
12216
12217 for (MDNode *MN : MD->operands()) {
12218 auto &&GetMDInt = [MN](unsigned Idx) {
12219 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
12220 return cast<ConstantInt>(V->getValue())->getZExtValue();
12221 };
12222
12223 auto &&GetMDString = [MN](unsigned Idx) {
12224 auto *V = cast<MDString>(MN->getOperand(Idx));
12225 return V->getString();
12226 };
12227
12228 switch (GetMDInt(0)) {
12229 default:
12230 llvm_unreachable("Unexpected metadata!");
12231 break;
12232 case OffloadEntriesInfoManager::OffloadEntryInfo::
12233 OffloadingEntryInfoTargetRegion: {
12234 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
12235 /*DeviceID=*/GetMDInt(1),
12236 /*FileID=*/GetMDInt(2),
12237 /*Line=*/GetMDInt(4),
12238 /*Count=*/GetMDInt(5));
12239 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
12240 /*Order=*/GetMDInt(6));
12241 break;
12242 }
12243 case OffloadEntriesInfoManager::OffloadEntryInfo::
12244 OffloadingEntryInfoDeviceGlobalVar:
12245 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
12246 /*MangledName=*/GetMDString(1),
12248 /*Flags=*/GetMDInt(2)),
12249 /*Order=*/GetMDInt(3));
12250 break;
12251 }
12252 }
12253}
12254
12256 StringRef HostFilePath) {
12257 if (HostFilePath.empty())
12258 return;
12259
12260 auto Buf = VFS.getBufferForFile(HostFilePath);
12261 if (std::error_code Err = Buf.getError()) {
12262 report_fatal_error(("error opening host file from host file path inside of "
12263 "OpenMPIRBuilder: " +
12264 Err.message())
12265 .c_str());
12266 }
12267
12268 LLVMContext Ctx;
12270 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
12271 if (std::error_code Err = M.getError()) {
12273 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
12274 .c_str());
12275 }
12276
12277 loadOffloadInfoMetadata(*M.get());
12278}
12279
12282 llvm::StringRef Name) {
12283 Builder.restoreIP(Loc.IP);
12284
12285 BasicBlock *CurBB = Builder.GetInsertBlock();
12286 assert(CurBB &&
12287 "expected a valid insertion block for creating an iterator loop");
12288 Function *F = CurBB->getParent();
12289
12290 InsertPointTy SplitIP = Builder.saveIP();
12291 if (SplitIP.getPoint() == CurBB->end())
12292 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
12293 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
12294
12295 BasicBlock *ContBB =
12296 splitBB(SplitIP, /*CreateBranch=*/false,
12297 Builder.getCurrentDebugLocation(), "omp.it.cont");
12298
12299 CanonicalLoopInfo *CLI =
12300 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
12301 /*PreInsertBefore=*/ContBB,
12302 /*PostInsertBefore=*/ContBB, Name);
12303
12304 // Enter loop from original block.
12305 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
12306
12307 // Remove the unconditional branch inserted by createLoopSkeleton in the body
12308 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
12309 T->eraseFromParent();
12310
12311 InsertPointTy BodyIP = CLI->getBodyIP();
12312 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
12313 return Err;
12314
12315 // Body must either fallthrough to the latch or branch directly to it.
12316 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
12317 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
12318 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
12320 "iterator bodygen must terminate the canonical body with an "
12321 "unconditional branch to the loop latch",
12323 }
12324 } else {
12325 // Ensure we end the loop body by jumping to the latch.
12326 Builder.SetInsertPoint(CLI->getBody());
12327 Builder.CreateBr(CLI->getLatch());
12328 }
12329
12330 // Link After -> ContBB
12331 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
12332 if (!CLI->getAfter()->hasTerminator())
12333 Builder.CreateBr(ContBB);
12334
12335 return InsertPointTy{ContBB, ContBB->begin()};
12336}
12337
12338/// Mangle the parameter part of the vector function name according to
12339/// their OpenMP classification. The mangling function is defined in
12340/// section 4.5 of the AAVFABI(2021Q1).
12341static std::string mangleVectorParameters(
12343 SmallString<256> Buffer;
12344 llvm::raw_svector_ostream Out(Buffer);
12345 for (const auto &ParamAttr : ParamAttrs) {
12346 switch (ParamAttr.Kind) {
12348 Out << 'l';
12349 break;
12351 Out << 'R';
12352 break;
12354 Out << 'U';
12355 break;
12357 Out << 'L';
12358 break;
12360 Out << 'u';
12361 break;
12363 Out << 'v';
12364 break;
12365 }
12366 if (ParamAttr.HasVarStride)
12367 Out << "s" << ParamAttr.StrideOrArg;
12368 else if (ParamAttr.Kind ==
12370 ParamAttr.Kind ==
12372 ParamAttr.Kind ==
12374 ParamAttr.Kind ==
12376 // Don't print the step value if it is not present or if it is
12377 // equal to 1.
12378 if (ParamAttr.StrideOrArg < 0)
12379 Out << 'n' << -ParamAttr.StrideOrArg;
12380 else if (ParamAttr.StrideOrArg != 1)
12381 Out << ParamAttr.StrideOrArg;
12382 }
12383
12384 if (!!ParamAttr.Alignment)
12385 Out << 'a' << ParamAttr.Alignment;
12386 }
12387
12388 return std::string(Out.str());
12389}
12390
12392 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
12394 struct ISADataTy {
12395 char ISA;
12396 unsigned VecRegSize;
12397 };
12398 ISADataTy ISAData[] = {
12399 {'b', 128}, // SSE
12400 {'c', 256}, // AVX
12401 {'d', 256}, // AVX2
12402 {'e', 512}, // AVX512
12403 };
12405 switch (Branch) {
12407 Masked.push_back('N');
12408 Masked.push_back('M');
12409 break;
12411 Masked.push_back('N');
12412 break;
12414 Masked.push_back('M');
12415 break;
12416 }
12417 for (char Mask : Masked) {
12418 for (const ISADataTy &Data : ISAData) {
12420 llvm::raw_svector_ostream Out(Buffer);
12421 Out << "_ZGV" << Data.ISA << Mask;
12422 if (!VLENVal) {
12423 assert(NumElts && "Non-zero simdlen/cdtsize expected");
12424 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
12425 } else {
12426 Out << VLENVal;
12427 }
12428 Out << mangleVectorParameters(ParamAttrs);
12429 Out << '_' << Fn->getName();
12430 Fn->addFnAttr(Out.str());
12431 }
12432 }
12433}
12434
12435// Function used to add the attribute. The parameter `VLEN` is templated to
12436// allow the use of `x` when targeting scalable functions for SVE.
12437template <typename T>
12438static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
12439 char ISA, StringRef ParSeq,
12440 StringRef MangledName, bool OutputBecomesInput,
12441 llvm::Function *Fn) {
12442 SmallString<256> Buffer;
12443 llvm::raw_svector_ostream Out(Buffer);
12444 Out << Prefix << ISA << LMask << VLEN;
12445 if (OutputBecomesInput)
12446 Out << 'v';
12447 Out << ParSeq << '_' << MangledName;
12448 Fn->addFnAttr(Out.str());
12449}
12450
12451// Helper function to generate the Advanced SIMD names depending on the value
12452// of the NDS when simdlen is not present.
12453static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
12454 StringRef Prefix, char ISA,
12455 StringRef ParSeq, StringRef MangledName,
12456 bool OutputBecomesInput,
12457 llvm::Function *Fn) {
12458 switch (NDS) {
12459 case 8:
12460 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12461 OutputBecomesInput, Fn);
12462 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
12463 OutputBecomesInput, Fn);
12464 break;
12465 case 16:
12466 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12467 OutputBecomesInput, Fn);
12468 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
12469 OutputBecomesInput, Fn);
12470 break;
12471 case 32:
12472 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12473 OutputBecomesInput, Fn);
12474 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
12475 OutputBecomesInput, Fn);
12476 break;
12477 case 64:
12478 case 128:
12479 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
12480 OutputBecomesInput, Fn);
12481 break;
12482 default:
12483 llvm_unreachable("Scalar type is too wide.");
12484 }
12485}
12486
12487/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
12489 llvm::Function *Fn, unsigned UserVLEN,
12491 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
12492 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
12493
12494 // Sort out parameter sequence.
12495 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
12496 StringRef Prefix = "_ZGV";
12497 StringRef MangledName = Fn->getName();
12498
12499 // Generate simdlen from user input (if any).
12500 if (UserVLEN) {
12501 if (ISA == 's') {
12502 // SVE generates only a masked function.
12503 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12504 OutputBecomesInput, Fn);
12505 return;
12506 }
12507
12508 switch (Branch) {
12510 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12511 OutputBecomesInput, Fn);
12512 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12513 OutputBecomesInput, Fn);
12514 break;
12516 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
12517 OutputBecomesInput, Fn);
12518 break;
12520 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
12521 OutputBecomesInput, Fn);
12522 break;
12523 }
12524 return;
12525 }
12526
12527 if (ISA == 's') {
12528 // SVE, section 3.4.1, item 1.
12529 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
12530 OutputBecomesInput, Fn);
12531 return;
12532 }
12533
12534 switch (Branch) {
12536 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12537 MangledName, OutputBecomesInput, Fn);
12538 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12539 MangledName, OutputBecomesInput, Fn);
12540 break;
12542 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
12543 MangledName, OutputBecomesInput, Fn);
12544 break;
12546 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
12547 MangledName, OutputBecomesInput, Fn);
12548 break;
12549 }
12550}
12551
12552//===----------------------------------------------------------------------===//
12553// OffloadEntriesInfoManager
12554//===----------------------------------------------------------------------===//
12555
12557 return OffloadEntriesTargetRegion.empty() &&
12558 OffloadEntriesDeviceGlobalVar.empty();
12559}
12560
12561unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
12562 const TargetRegionEntryInfo &EntryInfo) const {
12563 auto It = OffloadEntriesTargetRegionCount.find(
12564 getTargetRegionEntryCountKey(EntryInfo));
12565 if (It == OffloadEntriesTargetRegionCount.end())
12566 return 0;
12567 return It->second;
12568}
12569
12570void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
12571 const TargetRegionEntryInfo &EntryInfo) {
12572 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
12573 EntryInfo.Count + 1;
12574}
12575
12576/// Initialize target region entry.
12578 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
12579 OffloadEntriesTargetRegion[EntryInfo] =
12580 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
12582 ++OffloadingEntriesNum;
12583}
12584
12586 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
12588 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
12589
12590 // Update the EntryInfo with the next available count for this location.
12591 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12592
12593 // If we are emitting code for a target, the entry is already initialized,
12594 // only has to be registered.
12595 if (OMPBuilder->Config.isTargetDevice()) {
12596 // This could happen if the device compilation is invoked standalone.
12597 if (!hasTargetRegionEntryInfo(EntryInfo)) {
12598 return;
12599 }
12600 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
12601 Entry.setAddress(Addr);
12602 Entry.setID(ID);
12603 Entry.setFlags(Flags);
12604 } else {
12606 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
12607 return;
12608 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
12609 "Target region entry already registered!");
12610 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
12611 OffloadEntriesTargetRegion[EntryInfo] = Entry;
12612 ++OffloadingEntriesNum;
12613 }
12614 incrementTargetRegionEntryInfoCount(EntryInfo);
12615}
12616
12618 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
12619
12620 // Update the EntryInfo with the next available count for this location.
12621 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12622
12623 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
12624 if (It == OffloadEntriesTargetRegion.end()) {
12625 return false;
12626 }
12627 // Fail if this entry is already registered.
12628 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
12629 return false;
12630 return true;
12631}
12632
12634 const OffloadTargetRegionEntryInfoActTy &Action) {
12635 // Scan all target region entries and perform the provided action.
12636 for (const auto &It : OffloadEntriesTargetRegion) {
12637 Action(It.first, It.second);
12638 }
12639}
12640
12642 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
12643 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12644 ++OffloadingEntriesNum;
12645}
12646
12648 StringRef VarName, Constant *Addr, int64_t VarSize,
12650 if (OMPBuilder->Config.isTargetDevice()) {
12651 // This could happen if the device compilation is invoked standalone.
12652 if (!hasDeviceGlobalVarEntryInfo(VarName))
12653 return;
12654 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12655 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12656 if (Entry.getVarSize() == 0) {
12657 Entry.setVarSize(VarSize);
12658 Entry.setLinkage(Linkage);
12659 }
12660 return;
12661 }
12662 Entry.setVarSize(VarSize);
12663 Entry.setLinkage(Linkage);
12664 Entry.setAddress(Addr);
12665 } else {
12666 if (hasDeviceGlobalVarEntryInfo(VarName)) {
12667 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12668 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12669 "Entry not initialized!");
12670 if (Entry.getVarSize() == 0) {
12671 Entry.setVarSize(VarSize);
12672 Entry.setLinkage(Linkage);
12673 }
12674 return;
12675 }
12677 Flags ==
12679 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12680 Addr, VarSize, Flags, Linkage,
12681 VarName.str());
12682 else
12683 OffloadEntriesDeviceGlobalVar.try_emplace(
12684 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12685 ++OffloadingEntriesNum;
12686 }
12687}
12688
12691 // Scan all device global variable entries and perform the provided action.
12692 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12693 Action(E.getKey(), E.getValue());
12694}
12695
12696//===----------------------------------------------------------------------===//
12697// CanonicalLoopInfo
12698//===----------------------------------------------------------------------===//
12699
12700void CanonicalLoopInfo::collectControlBlocks(
12702 // We only count those BBs as control block for which we do not need to
12703 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12704 // flow. For consistency, this also means we do not add the Body block, which
12705 // is just the entry to the body code.
12706 BBs.reserve(BBs.size() + 6);
12707 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12708}
12709
12711 assert(isValid() && "Requires a valid canonical loop");
12712 for (BasicBlock *Pred : predecessors(Header)) {
12713 if (Pred != Latch)
12714 return Pred;
12715 }
12716 llvm_unreachable("Missing preheader");
12717}
12718
12719void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12720 assert(isValid() && "Requires a valid canonical loop");
12721
12722 Instruction *CmpI = &getCond()->front();
12723 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12724 CmpI->setOperand(1, TripCount);
12725
12726#ifndef NDEBUG
12727 assertOK();
12728#endif
12729}
12730
12731void CanonicalLoopInfo::mapIndVar(
12732 llvm::function_ref<Value *(Instruction *)> Updater) {
12733 assert(isValid() && "Requires a valid canonical loop");
12734
12735 Instruction *OldIV = getIndVar();
12736
12737 // Record all uses excluding those introduced by the updater. Uses by the
12738 // CanonicalLoopInfo itself to keep track of the number of iterations are
12739 // excluded.
12740 SmallVector<Use *> ReplacableUses;
12741 for (Use &U : OldIV->uses()) {
12742 auto *User = dyn_cast<Instruction>(U.getUser());
12743 if (!User)
12744 continue;
12745 if (User->getParent() == getCond())
12746 continue;
12747 if (User->getParent() == getLatch())
12748 continue;
12749 ReplacableUses.push_back(&U);
12750 }
12751
12752 // Run the updater that may introduce new uses
12753 Value *NewIV = Updater(OldIV);
12754
12755 // Replace the old uses with the value returned by the updater.
12756 for (Use *U : ReplacableUses)
12757 U->set(NewIV);
12758
12759#ifndef NDEBUG
12760 assertOK();
12761#endif
12762}
12763
12765#ifndef NDEBUG
12766 // No constraints if this object currently does not describe a loop.
12767 if (!isValid())
12768 return;
12769
12770 BasicBlock *Preheader = getPreheader();
12771 BasicBlock *Body = getBody();
12772 BasicBlock *After = getAfter();
12773
12774 // Verify standard control-flow we use for OpenMP loops.
12775 assert(Preheader);
12776 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12777 "Preheader must terminate with unconditional branch");
12778 assert(Preheader->getSingleSuccessor() == Header &&
12779 "Preheader must jump to header");
12780
12781 assert(Header);
12782 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12783 "Header must terminate with unconditional branch");
12784 assert(Header->getSingleSuccessor() == Cond &&
12785 "Header must jump to exiting block");
12786
12787 assert(Cond);
12788 assert(Cond->getSinglePredecessor() == Header &&
12789 "Exiting block only reachable from header");
12790
12791 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12792 "Exiting block must terminate with conditional branch");
12793 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12794 "Exiting block's first successor jump to the body");
12795 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12796 "Exiting block's second successor must exit the loop");
12797
12798 assert(Body);
12799 assert(Body->getSinglePredecessor() == Cond &&
12800 "Body only reachable from exiting block");
12801 assert(!isa<PHINode>(Body->front()));
12802
12803 assert(Latch);
12804 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12805 "Latch must terminate with unconditional branch");
12806 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12807 // TODO: To support simple redirecting of the end of the body code that has
12808 // multiple; introduce another auxiliary basic block like preheader and after.
12809 assert(Latch->getSinglePredecessor() != nullptr);
12810 assert(!isa<PHINode>(Latch->front()));
12811
12812 assert(Exit);
12813 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12814 "Exit block must terminate with unconditional branch");
12815 assert(Exit->getSingleSuccessor() == After &&
12816 "Exit block must jump to after block");
12817
12818 assert(After);
12819 assert(After->getSinglePredecessor() == Exit &&
12820 "After block only reachable from exit block");
12821 assert(After->empty() || !isa<PHINode>(After->front()));
12822
12823 Instruction *IndVar = getIndVar();
12824 assert(IndVar && "Canonical induction variable not found?");
12825 assert(isa<IntegerType>(IndVar->getType()) &&
12826 "Induction variable must be an integer");
12827 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12828 "Induction variable must be a PHI in the loop header");
12829 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12830 assert(
12831 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12832 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12833
12834 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12835 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12836 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12837 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12838 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12839 ->isOne());
12840
12841 Value *TripCount = getTripCount();
12842 assert(TripCount && "Loop trip count not found?");
12843 assert(IndVar->getType() == TripCount->getType() &&
12844 "Trip count and induction variable must have the same type");
12845
12846 auto *CmpI = cast<CmpInst>(&Cond->front());
12847 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12848 "Exit condition must be a signed less-than comparison");
12849 assert(CmpI->getOperand(0) == IndVar &&
12850 "Exit condition must compare the induction variable");
12851 assert(CmpI->getOperand(1) == TripCount &&
12852 "Exit condition must compare with the trip count");
12853#endif
12854}
12855
12857 Header = nullptr;
12858 Cond = nullptr;
12859 Latch = nullptr;
12860 Exit = nullptr;
12861}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:853
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static Function * createTargetParallelWrapper(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn)
Create wrapper function used to gather the outlined function's argument structure from a shared buffe...
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static bool isGenericKernel(Function &Fn)
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static std::optional< omp::OMPTgtExecModeFlags > getTargetKernelExecMode(Function &Kernel)
Given a function, if it represents the entry point of a target kernel, this returns the execution mod...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static cl::opt< bool > UseDefaultMaxThreads("openmp-ir-builder-use-default-max-threads", cl::Hidden, cl::desc("Use a default max threads if none is provided."), cl::init(true))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static AtomicOrdering TransformReleaseAcquireRelease(AtomicOrdering AO)
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file defines less commonly used SmallVector utilities.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:109
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:150
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:765
@ ICMP_NE
not equal
Definition InstrTypes.h:762
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:766
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:872
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
InsertPoint - A saved insertion point.
Definition IRBuilder.h:298
BasicBlock * getBlock() const
Definition IRBuilder.h:313
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:311
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:314
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:318
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:330
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:350
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:587
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:996
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
size_type size() const
Definition MapVector.h:58
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:68
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:288
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:281
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions. NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target to.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
LLVM_ABI CallInst * createOMPAllocShared(const LocationDescription &Loc, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_alloc_shared.
ReductionGenCBKind
Enum class for the ReductionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI void initialize()
Initialize the internal state, this will put structure types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
LLVM_ABI InsertPointOrErrorTy createScope(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait)
Generator for 'omp scope'.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies={}, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> TargetBodyGenCallbackTy
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
bool HandleFPNegZero
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB, bool PreserveMemberOfFlags=false)
Emit the user-defined mapper function.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, const DependenciesInfo &Dependencies={}, const AffinityData &Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={}, ArrayRef< BasicBlock * > DeallocBlocks={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy. Here is the logic: if (Cond) { Th...
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
void addOutlineInfo(std::unique_ptr< OutlineInfo > &&OI)
Add a new region that will be outlined later.
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< BasicBlock * > DeallocBlocks, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from CallBack.
LLVM_ABI void emitTaskDependency(IRBuilderBase &Builder, Value *Entry, const DependData &Dep)
Store one kmp_depend_info entry at the given Entry pointer.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI CallInst * createOMPAlignedAlloc(const LocationDescription &Loc, Value *Align, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_align_alloc.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPFreeShared(const LocationDescription &Loc, Value *Addr, Value *Size, const Twine &Name=Twine(""))
Create a runtime call for kmpc_free_shared.
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be freed.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
SmallVector< std::unique_ptr< OutlineInfo >, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< InsertPointTy > DeallocIPs)> TargetGenArgAccessorsCallbackTy
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< BasicBlock * > DeallocBlocks)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::string str() const
Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:479
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:685
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1051
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1111
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1125
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:276
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll(OptimizationRemarkEmitter *ORE=nullptr, const Loop *L=nullptr) const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:173
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:963
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return the unique user of this value that cannot be dropped, if there is exactly one (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr)
Definition Utility.cpp:105
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
@ OMP_TGT_EXEC_MODE_SPMD_NO_LOOP
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:558
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2553
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:377
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
LLVM_ABI TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make an Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
OutputIt copy(R &&Range, OutputIt Out)
Definition STLExtras.h:1884
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
auto filter_to_vector(ContainerTy &&C, PredicateFn &&Pred)
Filter a range to a SmallVector with the element types deduced.
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:26
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
A struct to pack relevant information while generating atomic Ops.
A struct to pack the relevant information for an OpenMP depend clause.
omp::RTLDependenceKindTy DepKind
A struct to pack static and dynamic dependency information for a task.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
virtual LLVM_ABI std::unique_ptr< CodeExtractor > createCodeExtractor(ArrayRef< BasicBlock * > Blocks, bool ArgsInZeroAddressSpace, Twine Suffix=Twine(""))
Create a CodeExtractor instance based on the information stored in this structure,...
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointers passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool StrictBlocksAndThreads
True if the kernel strictly requires the number of blocks and threads above to run.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static LLVM_ABI const Target * lookupTarget(const Triple &TheTriple, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...