//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements the OpenMPIRBuilder class, which is used as a
/// convenient way to create LLVM instructions for OpenMP directives.
///
//===----------------------------------------------------------------------===//

#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"

#include <cstdint>
#include <optional>

#define DEBUG_TYPE "openmp-ir-builder"

using namespace llvm;
using namespace omp;

static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

static cl::opt<double> UnrollThresholdFactor(
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));

#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is
/// because an InsertPoint stores the instruction before which something is
/// inserted. For instance, if both point to the same instruction, two
/// IRBuilders alternately creating instructions will cause the instructions
/// to be interleaved.
static bool isConflictIP(IRBuilder<>::InsertPoint IP1,
                         IRBuilder<>::InsertPoint IP2) {
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}
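
// A minimal sketch of the hazard this predicate guards against (illustrative,
// with hypothetical values X and Y): two builders whose insert points alias
// will interleave the instructions they create, since each inserts "before
// the same instruction".
//
//   IRBuilder<> B1(BB, It), B2(BB, It); // same block, same iterator
//   B1.CreateAdd(X, Y);                 // add.1
//   B2.CreateMul(X, Y);                 // mul.1 lands after add.1
//   B1.CreateSub(X, Y);                 // sub.1 lands after mul.1
//
// isConflictIP(B1.saveIP(), B2.saveIP()) returns true for this situation.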

static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
  case OMPScheduleType::OrderedDistributeChunked:
  case OMPScheduleType::OrderedDistribute:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif

/// This is a wrapper over IRBuilderBase::restoreIP that also restores the
/// current debug location to the last instruction in the specified basic
/// block if the insert point points to the end of the block.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder,
                                 llvm::IRBuilderBase::InsertPoint IP) {
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}

static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
      return omp::getAMDGPUGridValues<64>();
    return omp::getAMDGPUGridValues<32>();
  }
  if (T.isNVPTX())
    return omp::NVPTXGridValues;
  if (T.isSPIRV())
    return omp::SPIRVGridValues;
  llvm_unreachable("No grid value available for this architecture!");
}

/// Determine which scheduling algorithm to use, determined from schedule
/// clause arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
    return OMPScheduleType::BaseAuto;
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  case OMP_SCHEDULE_Distribute:
    return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
                                 : OMPScheduleType::BaseDistribute;
  }
  llvm_unreachable("unhandled schedule clause argument");
}

/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations.
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}

/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // Monotonic is the default in the OpenMP runtime library, so there is
      // no need to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}

/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause,
                          bool HasDistScheduleChunks) {
  OMPScheduleType BaseSchedule = getOpenMPBaseScheduleType(
      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
  OMPScheduleType Result = getOpenMPMonotonicityScheduleType(
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  assert(isValidWorkshareLoopScheduleType(Result) &&
         "Unexpected work-sharing loop schedule type");
  return Result;
}
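
// Worked example (a sketch; the values follow directly from the three steps
// above): `schedule(dynamic, 4)` without an ordered clause selects the
// chunked dynamic base, receives ModifierUnordered, and, because the base is
// neither static nor ordered, ModifierNonmonotonic:
//
//   OMPScheduleType Ty = computeOpenMPScheduleType(
//       OMP_SCHEDULE_Dynamic, /*HasChunks=*/true, /*HasSimdModifier=*/false,
//       /*HasMonotonicModifier=*/false, /*HasNonmonotonicModifier=*/false,
//       /*HasOrderedClause=*/false, /*HasDistScheduleChunks=*/false);
//   // Ty == OMPScheduleType::UnorderedDynamicChunked |
//   //       OMPScheduleType::ModifierNonmonotonic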

/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL) {
  if (Instruction *Term = Source->getTerminator()) {
    auto *Br = cast<BranchInst>(Term);
    assert(!Br->isConditional() &&
           "BB's terminator must be an unconditional branch (or degenerate)");
    BasicBlock *Succ = Br->getSuccessor(0);
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(0, Target);
    return;
  }

  auto *NewBr = BranchInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}

void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which would be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We don't want that, for two
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale for moving those records to
  //    `New` (in `spliceDebugInfoEmptyBlock`) does not apply here. That
  //    function assumes that `Old` is optimized out and is going away. This
  //    is not the case here: the `Old` block is still being used, e.g. a
  //    branch instruction is added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = BranchInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}

void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *Old = Builder.GetInsertBlock();

  spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
  if (CreateBranch)
    Builder.SetInsertPoint(Old->getTerminator());
  else
    Builder.SetInsertPoint(Old);

  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
}

BasicBlock *llvm::splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
                          DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  BasicBlock *New = BasicBlock::Create(
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
                          llvm::Twine Name) {
  DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
  BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
  if (CreateBranch)
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  else
    Builder.SetInsertPoint(Builder.GetInsertBlock());
  // SetInsertPoint also updates the Builder's debug location, but we want to
  // keep the one the Builder was configured to use.
  Builder.SetCurrentDebugLocation(DebugLoc);
  return New;
}

BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
                                    llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
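
// Usage sketch for the splitBB family (block names are illustrative):
// starting from a builder positioned inside %entry, this peels everything
// after the insert point into a new block while the builder keeps emitting
// into %entry, before the newly created branch:
//
//   // %entry: ...insts... <builder> ...rest...
//   BasicBlock *Cont =
//       splitBBWithSuffix(Builder, /*CreateBranch=*/true, ".split");
//   // %entry:       ...insts... <builder> br label %entry.split
//   // %entry.split: ...rest...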

// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
static Value *createFakeIntVal(IRBuilderBase &Builder,
                               OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                               llvm::SmallVectorImpl<Instruction *> &ToBeDeleted,
                               OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                               const Twine &Name = "", bool AsPtr = true,
                               bool Is64Bit = false) {
  Builder.restoreIP(OuterAllocaIP);
  IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value.
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
  } else {
    UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
        FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
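
// Usage sketch (insertion points and the name are hypothetical): model an
// extra integer argument so that CodeExtractor materializes a parameter for
// it, then erase the scaffolding once the post-outlining callback has wired
// up the real value:
//
//   SmallVector<Instruction *, 4> ToBeDeleted;
//   Value *FakeTID = createFakeIntVal(Builder, OuterAllocaIP, ToBeDeleted,
//                                     InnerAllocaIP, "gid");
//   // ... outline the region; FakeTID becomes an argument ...
//   for (Instruction *I : ToBeDeleted)
//     I->eraseFromParent();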

//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig()
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

OpenMPIRBuilderConfig::OpenMPIRBuilderConfig(
    bool IsTargetDevice, bool IsGPU, bool OpenMPOffloadMandatory,
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
    : IsTargetDevice(IsTargetDevice), IsGPU(IsGPU),
      OpenMPOffloadMandatory(OpenMPOffloadMandatory),
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}

bool OpenMPIRBuilderConfig::hasRequiresReverseOffload() const {
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedAddress() const {
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

bool OpenMPIRBuilderConfig::hasRequiresUnifiedSharedMemory() const {
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

bool OpenMPIRBuilderConfig::hasRequiresDynamicAllocators() const {
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

int64_t OpenMPIRBuilderConfig::getRequiresFlags() const {
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}

void OpenMPIRBuilderConfig::setHasRequiresReverseOffload(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedAddress(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

void OpenMPIRBuilderConfig::setHasRequiresUnifiedSharedMemory(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

void OpenMPIRBuilderConfig::setHasRequiresDynamicAllocators(bool Value) {
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
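
// Round-trip sketch of the flag encoding above: each setter toggles exactly
// one bit, and getRequiresFlags() reports OMP_REQ_NONE only while no bit has
// ever been set.
//
//   OpenMPIRBuilderConfig Cfg;
//   Cfg.setHasRequiresUnifiedSharedMemory(true);
//   assert(Cfg.hasRequiresUnifiedSharedMemory());
//   assert(Cfg.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY);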

//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
                                          IRBuilderBase &Builder,
                                          SmallVector<Value *> &ArgsVector) {
  Value *Version = Builder.getInt32(OMP_KERNEL_ARG_VERSION);
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}
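
// For reference, the resulting vector is consumed as the argument struct of
// __tgt_target_kernel; a sketch of the field order as assembled above (not a
// normative runtime definition):
//   { Version, NumArgs, ArgBasePtrs, ArgPtrs, ArgSizes, ArgTypes, ArgNames,
//     ArgMappers, Tripcount, Flags, NumTeams[3], NumThreads[3],
//     DynCGroupMem }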

void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  SmallVector<AttributeSet, 4> ArgAttrs;
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)               \
  case Enum:                                                                  \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                          \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                        \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)               \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                        \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}

FunctionCallee
OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declaration in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                         \
  case Enum:                                                                  \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},       \
                             IsVarArg);                                       \
    Fn = M.getFunction(Str);                                                  \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                               \
  case Enum:                                                                  \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);        \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }
    Fn->setCallingConv(Config.getRuntimeCC());
    // Add information if the runtime function takes a callback function.
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        //  - The callback callee is argument number 2 (microtask).
        //  - The first two arguments of the callback callee are unknown (-1).
        //  - All variadic arguments to the runtime function are passed to the
        //    callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                  2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
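
// Typical use (illustrative): lazily declare a runtime function and emit a
// call through it. The declaration is created at most once per module and
// receives its attributes from OMPKinds.def.
//
//   OMPBuilder.createRuntimeFunctionCall(
//       OMPBuilder.getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_barrier),
//       {Ident, ThreadID});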

Expected<BasicBlock *>
OpenMPIRBuilder::FinalizationInfo::getFiniBB(IRBuilderBase &Builder) {
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    IRBuilderBase::InsertPointGuard IPG(Builder);
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}

Error OpenMPIRBuilder::FinalizationInfo::mergeFiniBB(IRBuilderBase &Builder,
                                                     BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  auto EndIt = FiniBB->end();
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}

Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
  FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}

CallInst *OpenMPIRBuilder::createRuntimeFunctionCall(FunctionCallee Callee,
                                                     ArrayRef<Value *> Args,
                                                     StringRef Name) {
  CallInst *Call = Builder.CreateCall(Callee, Args, Name);
  Call->setCallingConv(Config.getRuntimeCC());
  return Call;
}

void OpenMPIRBuilder::initialize() { initializeTypes(M); }

static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder,
                                                     Function *Function) {
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      if (auto *AllocaInst = dyn_cast<llvm::AllocaInst>(Inst)) {
        Inst++;
        if (!isa<ConstantData>(AllocaInst->getArraySize()))
          continue;
        AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

static void fixUpNonEntryAllocas(llvm::BasicBlock &Block) {
  llvm::SmallVector<llvm::Instruction *> AllocasToMove;

  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
    // TODO: For now, we support simple static allocations; we might need to
    // move non-static ones as well. However, this will need further analysis
    // to move the length arguments as well.
    return AllocaInst.isStaticAlloca();
  };

  for (llvm::Instruction &Inst : Block)
    if (auto *AllocaInst = llvm::dyn_cast<llvm::AllocaInst>(&Inst))
      if (ShouldHoistAlloca(*AllocaInst))
        AllocasToMove.push_back(AllocaInst);

  auto InsertPoint =
      Block.getParent()->getEntryBlock().getTerminator()->getIterator();

  for (llvm::Instruction *AllocaInst : AllocasToMove)
    AllocaInst->moveBeforePreserving(InsertPoint);
}

void OpenMPIRBuilder::finalize(Function *Fn) {
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate the
    // struct for aggregate params in the device default alloca address space.
    // The OpenMP runtime requires that the params of the extracted functions
    // are passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions which are
    // used by the OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock */ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn =
        Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compatibility with the clang CG we move the outlined function after
    // the one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away; we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined
      // region and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that the terminator may have.
          if (OI.EntryBB->getTerminator())
            OI.EntryBB->getTerminator()->adoptDbgRecords(
                &ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);

    if (OI.FixUpNonEntryAllocas) {
      PostDominatorTree PostDomTree(*OutlinedFn);
      for (llvm::BasicBlock &BB : *OutlinedFn)
        if (PostDomTree.properlyDominates(&BB, &OutlinedFn->getEntryBlock()))
          fixUpNonEntryAllocas(BB);
    }
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embed user-written code into the target region,
  // which may inject allocas that need to be moved to the entry block of our
  // target or risk malformed optimisations by later passes; this is only
  // relevant for the device pass, which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on that here,
  // it's up to the inserter to the list to do so). This notably has to occur
  // after the OutlinedInfo candidates have been extracted, so we have an end
  // product that will not be implicitly adversely affected by any raises
  // unless intentionally appended to the list.
  // NOTE: This only does so for ConstantData; it could be extended to
  // ConstantExprs with further effort, however, they should largely be folded
  // when they get here. Extending it to runtime defined/read+writeable
  // allocation sizes would be non-trivial (we would need to factor in movement
  // of any stores to variables the allocation size depends on, as well as the
  // usual loads; otherwise it'll yield the wrong result after movement) and
  // would likely be more suitable as an LLVM optimisation pass.
  for (Function *F : ConstantAllocaRaiseCandidates)
    raiseUserConstantDataAllocasToEntryBlock(Builder, F);

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
    createOffloadEntriesAndInfoMetadata(ErrorReportFn);

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}

bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }

OpenMPIRBuilder::~OpenMPIRBuilder() {
  assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
}

GlobalValue *OpenMPIRBuilder::createGlobalFlag(unsigned Value, StringRef Name) {
  IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
  auto *GV =
      new GlobalVariable(M, I32Ty,
                         /* isConstant = */ true, GlobalValue::WeakODRLinkage,
                         ConstantInt::get(I32Ty, Value), Name);
  GV->setVisibility(GlobalValue::HiddenVisibility);

  return GV;
}

void OpenMPIRBuilder::emitUsed(StringRef Name,
                               std::vector<WeakTrackingVH> &List) {
  if (List.empty())
    return;

  // Convert List to what ConstantArray needs.
  SmallVector<Constant *, 8> UsedArray;
  UsedArray.resize(List.size());
  for (unsigned I = 0, E = List.size(); I != E; ++I)
    UsedArray[I] = ConstantExpr::getPointerBitCastOrAddrSpaceCast(
        cast<Constant>(&*List[I]), Builder.getPtrTy());

  if (UsedArray.empty())
    return;
  ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());

  auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
                                ConstantArray::get(ATy, UsedArray), Name);

  GV->setSection("llvm.metadata");
}

GlobalVariable *
OpenMPIRBuilder::emitKernelExecutionMode(StringRef KernelName,
                                         OMPTgtExecModeFlags Mode) {
  auto *Int8Ty = Builder.getInt8Ty();
  auto *GVMode = new GlobalVariable(
      M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
      ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
  GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
  return GVMode;
}

Constant *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
                                            uint32_t SrcLocStrSize,
                                            IdentFlag LocFlags,
                                            unsigned Reserve2Flags) {
  // Enable "C-mode".
  LocFlags |= OMP_IDENT_FLAG_KMPC;

  Constant *&Ident =
      IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
  if (!Ident) {
    Constant *I32Null = ConstantInt::getNullValue(Int32);
    Constant *IdentData[] = {I32Null,
                             ConstantInt::get(Int32, uint32_t(LocFlags)),
                             ConstantInt::get(Int32, Reserve2Flags),
                             ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};

    size_t SrcLocStrArgIdx = 4;
    if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
            ->getPointerAddressSpace() !=
        IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
      IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
          SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
    Constant *Initializer =
        ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);

    // Look for existing encoding of the location + flags, not needed but
    // minimizes the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
        if (GV.getInitializer() == Initializer)
          Ident = &GV;

    if (!Ident) {
      auto *GV = new GlobalVariable(
          M, OpenMPIRBuilder::Ident,
          /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
          nullptr, GlobalValue::NotThreadLocal,
          M.getDataLayout().getDefaultGlobalsAddressSpace());
      GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
      GV->setAlignment(Align(8));
      Ident = GV;
    }
  }

  return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef LocStr,
                                                uint32_t &SrcLocStrSize) {
  SrcLocStrSize = LocStr.size();
  Constant *&SrcLocStr = SrcLocStrMap[LocStr];
  if (!SrcLocStr) {
    Constant *Initializer =
        ConstantDataArray::getString(M.getContext(), LocStr);

    // Look for existing encoding of the location, not needed but minimizes
    // the difference to the existing solution while we transition.
    for (GlobalVariable &GV : M.globals())
      if (GV.isConstant() && GV.hasInitializer() &&
          GV.getInitializer() == Initializer)
        return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);

    SrcLocStr = Builder.CreateGlobalString(
        LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
        &M);
  }
  return SrcLocStr;
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(StringRef FunctionName,
                                                StringRef FileName,
                                                unsigned Line, unsigned Column,
                                                uint32_t &SrcLocStrSize) {
  SmallString<128> Buffer;
  Buffer.push_back(';');
  Buffer.append(FileName);
  Buffer.push_back(';');
  Buffer.append(FunctionName);
  Buffer.push_back(';');
  Buffer.append(std::to_string(Line));
  Buffer.push_back(';');
  Buffer.append(std::to_string(Column));
  Buffer.push_back(';');
  Buffer.push_back(';');
  return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
}
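
// Example of the encoding this produces: for function "foo" in file "bar.c"
// at line 3, column 5, the buffer holds
//
//   ";bar.c;foo;3;5;;"
//
// i.e. the ";file;function;line;column;;" layout expected by the OpenMP
// runtime.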

Constant *
OpenMPIRBuilder::getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize) {
  StringRef UnknownLoc = ";unknown;unknown;0;0;;";
  return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(DebugLoc DL,
                                                uint32_t &SrcLocStrSize,
                                                Function *F) {
  DILocation *DIL = DL.get();
  if (!DIL)
    return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  StringRef FileName = M.getName();
  if (DIFile *DIF = DIL->getFile())
    if (std::optional<StringRef> Source = DIF->getSource())
      FileName = *Source;
  StringRef Function = DIL->getScope()->getSubprogram()->getName();
  if (Function.empty() && F)
    Function = F->getName();
  return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
                              DIL->getColumn(), SrcLocStrSize);
}

Constant *OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc,
                                                uint32_t &SrcLocStrSize) {
  return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
                              Loc.IP.getBlock()->getParent());
}

Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
  return createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
      "omp_global_thread_num");
}

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createBarrier(const LocationDescription &Loc, Directive Kind,
                               bool ForceSimpleCall, bool CheckCancelFlag) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // Build call __kmpc_cancel_barrier(loc, thread_id) or
  //            __kmpc_barrier(loc, thread_id);

  IdentFlag BarrierLocFlags;
  switch (Kind) {
  case OMPD_for:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
    break;
  case OMPD_sections:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
    break;
  case OMPD_single:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
    break;
  case OMPD_barrier:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
    break;
  default:
    BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
    break;
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Args[] = {
      getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};

  // If we are in a cancellable parallel region, barriers are cancellation
  // points.
  // TODO: Check why we would force simple calls or to ignore the cancel flag.
  bool UseCancelBarrier =
      !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);

  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(UseCancelBarrier
                                        ? OMPRTL___kmpc_cancel_barrier
                                        : OMPRTL___kmpc_barrier),
      Args);

  if (UseCancelBarrier && CheckCancelFlag)
    if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
      return Err;

  return Builder.saveIP();
}
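
// IR sketch of a cancellable barrier (names illustrative):
//
//   %r = call i32 @__kmpc_cancel_barrier(ptr @loc, i32 %tid)
//   %c = icmp eq i32 %r, 0
//   br i1 %c, label %cont, label %cncl  ; %cncl performs finalization
//
// The non-cancellable form is a plain call to @__kmpc_barrier with no flag
// check.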

OpenMPIRBuilder::InsertPointOrErrorTy
OpenMPIRBuilder::createCancel(const LocationDescription &Loc,
                              Value *IfCondition,
                              omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();

  Instruction *ThenTI = UI, *ElseTI = nullptr;
  if (IfCondition) {
    SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);

    // Even if the if condition evaluates to false, this should count as a
    // cancellation point.
    Builder.SetInsertPoint(ElseTI);
    auto ElseIP = Builder.saveIP();

    InsertPointOrErrorTy IPOrErr = createCancellationPoint(
        LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
    if (!IPOrErr)
      return IPOrErr;
  }

  Builder.SetInsertPoint(ThenTI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCancellationPoint(
    const LocationDescription &Loc, omp::Directive CanceledDirective) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  // LLVM utilities like blocks with terminators.
  auto *UI = Builder.CreateUnreachable();
  Builder.SetInsertPoint(UI);

  Value *CancelKind = nullptr;
  switch (CanceledDirective) {
#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value)                      \
  case DirectiveEnum:                                                         \
    CancelKind = Builder.getInt32(Value);                                     \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    llvm_unreachable("Unknown cancel kind!");
  }

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
  Value *Result = createRuntimeFunctionCall(
      getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);

  // The actual cancel logic is shared with others, e.g., cancel_barriers.
  if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
    return Err;

  // Update the insertion point and remove the terminator we introduced.
  Builder.SetInsertPoint(UI->getParent());
  UI->eraseFromParent();

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitTargetKernel(
    const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
    Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
    Value *HostPtr, ArrayRef<Value *> KernelArgs) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Builder.restoreIP(AllocaIP);
  auto *KernelArgsPtr =
      Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
  updateToLocation(Loc);

  for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
    llvm::Value *Arg =
        Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
    Builder.CreateAlignedStore(
        KernelArgs[I], Arg,
        M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
  }

  SmallVector<Value *> OffloadingArgs{Ident,      DeviceID, NumTeams,
                                      NumThreads, HostPtr,  KernelArgsPtr};

  Return = createRuntimeFunctionCall(
      getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
      OffloadingArgs);

  return Builder.saveIP();
}

OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitKernelLaunch(
    const LocationDescription &Loc, Value *OutlinedFnID,
    EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
    Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {

  if (!updateToLocation(Loc))
    return Loc.IP;

  // On top of the arrays that were filled up, the target offloading call
  // takes as arguments the device id as well as the host pointer. The host
  // pointer is used by the runtime library to identify the current target
  // region, so it only has to be unique and not necessarily point to
  // anything. It could be the pointer to the outlined function that
  // implements the target region, but we aren't using that so that the
  // compiler doesn't need to keep that, and could therefore inline the host
  // function if proven worthwhile during optimization.

  // From this point on, we need to have an ID of the target region defined.
  assert(OutlinedFnID && "Invalid outlined function ID!");
  (void)OutlinedFnID;

  // Return value of the runtime offloading call.
  Value *Return = nullptr;

  // Arguments for the target kernel.
  SmallVector<Value *> ArgsVector;
  getKernelArgsVector(Args, Builder, ArgsVector);

  // The target region is an outlined function launched by the runtime
  // via calls to __tgt_target_kernel().
  //
  // Note that on the host and CPU targets, the runtime implementation of
  // these calls simply call the outlined function without forking threads.
  // The outlined functions themselves have runtime calls to
  // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
  // the compiler in emitTeamsCall() and emitParallelCall().
  //
  // In contrast, on the NVPTX target, the implementation of
  // __tgt_target_teams() launches a GPU kernel with the requested number
  // of teams and threads so no additional calls to the runtime are required.
  // Check the error code and execute the host version if required.
  Builder.restoreIP(emitTargetKernel(
      Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
      Args.NumThreads.front(), OutlinedFnID, ArgsVector));

  BasicBlock *OffloadFailedBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
  BasicBlock *OffloadContBlock =
      BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
  Value *Failed = Builder.CreateIsNotNull(Return);
  Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);

  auto CurFn = Builder.GetInsertBlock()->getParent();
  emitBlock(OffloadFailedBlock, CurFn);
  InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  emitBranch(OffloadContBlock);
  emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
  return Builder.saveIP();
}
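
// Control-flow sketch of the launch sequence emitted above (IR names are
// illustrative):
//
//   %ret = call i32 @__tgt_target_kernel(ptr %ident, i64 %device_id, ...)
//   %failed = icmp ne i32 %ret, 0
//   br i1 %failed, label %omp_offload.failed, label %omp_offload.cont
//
// %omp_offload.failed runs the host fallback produced by
// EmitTargetCallFallbackCB and then branches to %omp_offload.cont.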

Error OpenMPIRBuilder::emitCancelationCheckImpl(
    Value *CancelFlag, omp::Directive CanceledDirective) {
  assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
         "Unexpected cancellation!");

  // For a cancel barrier we create two new blocks.
  BasicBlock *BB = Builder.GetInsertBlock();
  BasicBlock *NonCancellationBlock;
  if (Builder.GetInsertPoint() == BB->end()) {
    // TODO: This branch will not be needed once we've moved to the
    // OpenMPIRBuilder codegen completely.
    NonCancellationBlock = BasicBlock::Create(
        BB->getContext(), BB->getName() + ".cont", BB->getParent());
  } else {
    NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
    BB->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(BB);
  }
  BasicBlock *CancellationBlock = BasicBlock::Create(
      BB->getContext(), BB->getName() + ".cncl", BB->getParent());

  // Jump to them based on the return value.
  Value *Cmp = Builder.CreateIsNull(CancelFlag);
  Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
                       /* TODO weight */ nullptr, nullptr);

  // From the cancellation block we finalize all variables and go to the
  // post finalization block that is known to the FiniCB callback.
  auto &FI = FinalizationStack.back();
  Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
  if (!FiniBBOrErr)
    return FiniBBOrErr.takeError();
  Builder.SetInsertPoint(CancellationBlock);
  Builder.CreateBr(*FiniBBOrErr);

  // The continuation block is where code generation continues.
  Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
  return Error::success();
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the device.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP DeviceRTL runtime function
// (kmpc_parallel_60).
static void targetParallelCallback(
    OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
    BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
    Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
    Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
  // Add some known attributes.
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addParamAttr(0, Attribute::NoUndef);
  OutlinedFn.addParamAttr(1, Attribute::NoUndef);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  assert(CI && "Expected call instruction to outlined function");
  CI->getParent()->setName("omp_parallel");

  Builder.SetInsertPoint(CI);
  Type *PtrTy = OMPIRBuilder->VoidPtr;
  Value *NullPtrValue = Constant::getNullValue(PtrTy);

  // Add alloca for kernel args.
  OpenMPIRBuilder::InsertPointTy CurrentIP = Builder.saveIP();
  Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
  AllocaInst *ArgsAlloca =
      Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
  Value *Args = ArgsAlloca;
  // Add an address space cast if the array for storing arguments is not
  // allocated in address space 0.
  if (ArgsAlloca->getAddressSpace())
    Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
  Builder.restoreIP(CurrentIP);

  // Store captured vars which are used by kmpc_parallel_60.
  for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
    Value *V = *(CI->arg_begin() + 2 + Idx);
    Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
        ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
    Builder.CreateStore(V, StoreAddress);
  }

  Value *Cond =
      IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
                  : Builder.getInt32(1);

  // Build the kmpc_parallel_60 call.
  Value *Parallel60CallArgs[] = {
      /* identifier */ Ident,
      /* global thread num */ ThreadID,
      /* if expression */ Cond,
      /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
      /* proc bind */ Builder.getInt32(-1),
      /* outlined function */ &OutlinedFn,
      /* wrapper function */ NullPtrValue,
      /* arguments of the outlined function */ Args,
      /* number of arguments */ Builder.getInt64(NumCapturedVars),
      /* strict for number of threads */ Builder.getInt32(0)};

  FunctionCallee RTLFn =
      OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);

  LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}

// Callback used to create OpenMP runtime calls to support
// omp parallel clause for the host.
// We need to use this callback to replace the call to the OutlinedFn in
// OuterFn by the call to the OpenMP host runtime function
// (__kmpc_fork_call[_if]).
static void
hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,
                     Function *OuterFn, Value *Ident, Value *IfCondition,
                     Instruction *PrivTID, AllocaInst *PrivTIDAddr,
                     const SmallVector<Instruction *, 4> &ToBeDeleted) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  FunctionCallee RTLFn;
  if (IfCondition) {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
  } else {
    RTLFn =
        OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
  }
  if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
    if (!F->hasMetadata(LLVMContext::MD_callback)) {
      LLVMContext &Ctx = F->getContext();
      MDBuilder MDB(Ctx);
      // Annotate the callback behavior of the __kmpc_fork_call:
      //  - The callback callee is argument number 2 (microtask).
      //  - The first two arguments of the callback callee are unknown (-1).
      //  - All variadic arguments to the __kmpc_fork_call are passed to the
      //    callback callee.
      F->addMetadata(LLVMContext::MD_callback,
                     *MDNode::get(Ctx, {MDB.createCallbackEncoding(
                                           2, {-1, -1},
                                           /* VarArgsArePassed */ true)}));
    }
  }
  // Add some known attributes.
  OutlinedFn.addParamAttr(0, Attribute::NoAlias);
  OutlinedFn.addParamAttr(1, Attribute::NoAlias);
  OutlinedFn.addFnAttr(Attribute::NoUnwind);

  assert(OutlinedFn.arg_size() >= 2 &&
         "Expected at least tid and bounded tid as arguments");
  unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;

  CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
  CI->getParent()->setName("omp_parallel");
  Builder.SetInsertPoint(CI);

  // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
  Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
                           &OutlinedFn};

  SmallVector<Value *, 16> RealArgs;
  RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
  if (IfCondition) {
    Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
    RealArgs.push_back(Cond);
  }
  RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());

  // __kmpc_fork_call_if always expects a void ptr as the last argument.
  // If there are no arguments, pass a null pointer.
  auto PtrTy = OMPIRBuilder->VoidPtr;
  if (IfCondition && NumCapturedVars == 0) {
    Value *NullPtrValue = Constant::getNullValue(PtrTy);
    RealArgs.push_back(NullPtrValue);
  }

  OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);

  LLVM_DEBUG(dbgs() << "With fork_call placed: "
                    << *Builder.GetInsertBlock()->getParent() << "\n");

  // Initialize the local TID stack location with the argument value.
  Builder.SetInsertPoint(PrivTID);
  Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
  Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
                      PrivTIDAddr);

  // Remove the redundant call to the outlined function.
  CI->eraseFromParent();

  for (Instruction *I : ToBeDeleted) {
    I->eraseFromParent();
  }
}
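
// Resulting host-side IR (a sketch, assuming two captured pointers %a, %b):
//
//   call void (ptr, i32, ptr, ...) @__kmpc_fork_call(
//       ptr @loc, i32 2, ptr @foo..omp_par, ptr %a, ptr %b)
//
// The runtime invokes @foo..omp_par once per thread in the team, supplying
// the global and bound thread IDs as the two leading arguments.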
1573
1575 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1576 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1577 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1578 omp::ProcBindKind ProcBind, bool IsCancellable) {
1579 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1580
1581 if (!updateToLocation(Loc))
1582 return Loc.IP;
1583
1584 uint32_t SrcLocStrSize;
1585 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1586 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1587 Value *ThreadID = getOrCreateThreadID(Ident);
1588 // If we generate code for the target device, we need to allocate
1589 // struct for aggregate params in the device default alloca address space.
1590 // OpenMP runtime requires that the params of the extracted functions are
1591 // passed as zero address space pointers. This flag ensures that extracted
1592 // function arguments are declared in zero address space
1593 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1594
1595 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1596 // only if we compile for host side.
1597 if (NumThreads && !Config.isTargetDevice()) {
1598 Value *Args[] = {
1599 Ident, ThreadID,
1600 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1602 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1603 }
1604
1605 if (ProcBind != OMP_PROC_BIND_default) {
1606 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1607 Value *Args[] = {
1608 Ident, ThreadID,
1609 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1611 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1612 }
1613
1614 BasicBlock *InsertBB = Builder.GetInsertBlock();
1615 Function *OuterFn = InsertBB->getParent();
1616
1617 // Save the outer alloca block because the insertion iterator may get
1618 // invalidated and we still need this later.
1619 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1620
1621 // Vector to remember instructions we used only during the modeling but which
1622 // we want to delete at the end.
1624
1625 // Change the location to the outer alloca insertion point to create and
1626 // initialize the allocas we pass into the parallel region.
1627 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1628 Builder.restoreIP(NewOuter);
1629 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1630 AllocaInst *ZeroAddrAlloca =
1631 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1632 Instruction *TIDAddr = TIDAddrAlloca;
1633 Instruction *ZeroAddr = ZeroAddrAlloca;
1634 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1635 // Add additional casts to enforce pointers in zero address space
1636 TIDAddr = new AddrSpaceCastInst(
1637 TIDAddrAlloca, PointerType::get(M.getContext(), 0), "tid.addr.ascast");
1638 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1639 ToBeDeleted.push_back(TIDAddr);
1640 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1641 PointerType::get(M.getContext(), 0),
1642 "zero.addr.ascast");
1643 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1644 ToBeDeleted.push_back(ZeroAddr);
1645 }
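// For instance, on targets whose allocas live in a non-zero address space
// (e.g. AS(5) on amdgcn), the cast above appears as (sketch):
//   %tid.addr.ascast = addrspacecast ptr addrspace(5) %tid.addr to ptr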
1646
1647 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1648 // associated arguments in the outlined function, so we delete them later.
1649 ToBeDeleted.push_back(TIDAddrAlloca);
1650 ToBeDeleted.push_back(ZeroAddrAlloca);
1651
1652 // Create an artificial insertion point that will also ensure the blocks we
1653 // are about to split are not degenerated.
1654 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1655
1656 BasicBlock *EntryBB = UI->getParent();
1657 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1658 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1659 BasicBlock *PRegPreFiniBB =
1660 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1661 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1662
1663 auto FiniCBWrapper = [&](InsertPointTy IP) {
1664 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1665 // target to the region exit block.
1666 if (IP.getBlock()->end() == IP.getPoint()) {
1667 IRBuilder<>::InsertPointGuard IPG(Builder);
1668 Builder.restoreIP(IP);
1669 Instruction *I = Builder.CreateBr(PRegExitBB);
1670 IP = InsertPointTy(I->getParent(), I->getIterator());
1671 }
1672 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1673 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1674 "Unexpected insertion point for finalization call!");
1675 return FiniCB(IP);
1676 };
1677
1678 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1679
1680 // Generate the privatization allocas in the block that will become the entry
1681 // of the outlined function.
1682 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1683 InsertPointTy InnerAllocaIP = Builder.saveIP();
1684
1685 AllocaInst *PrivTIDAddr =
1686 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1687 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1688
1689 // Add some fake uses for OpenMP provided arguments.
1690 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1691 Instruction *ZeroAddrUse =
1692 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1693 ToBeDeleted.push_back(ZeroAddrUse);
1694
1695 // EntryBB
1696 // |
1697 // V
1698 // PRegionEntryBB <- Privatization allocas are placed here.
1699 // |
1700 // V
1701 // PRegionBodyBB <- BodyGen is invoked here.
1702 // |
1703 // V
1704 // PRegPreFiniBB <- The block we will start finalization from.
1705 // |
1706 // V
1707 // PRegionExitBB <- A common exit to simplify block collection.
1708 //
1709
1710 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1711
1712 // Let the caller create the body.
1713 assert(BodyGenCB && "Expected body generation callback!");
1714 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1715 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1716 return Err;
1717
1718 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1719
1720 OutlineInfo OI;
1721 if (Config.isTargetDevice()) {
1722 // Generate OpenMP target specific runtime call
1723 OI.PostOutlineCB = [=, ToBeDeletedVec =
1724 std::move(ToBeDeleted)](Function &OutlinedFn) {
1725 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1726 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1727 ThreadID, ToBeDeletedVec);
1728 };
1729 OI.FixUpNonEntryAllocas = true;
1730 } else {
1731 // Generate OpenMP host runtime call
1732 OI.PostOutlineCB = [=, ToBeDeletedVec =
1733 std::move(ToBeDeleted)](Function &OutlinedFn) {
1734 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1735 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1736 };
1737 OI.FixUpNonEntryAllocas = true;
1738 }
1739
1740 OI.OuterAllocaBB = OuterAllocaBlock;
1741 OI.EntryBB = PRegEntryBB;
1742 OI.ExitBB = PRegExitBB;
1743
1744 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1745 SmallVector<BasicBlock *, 32> Blocks;
1746 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1747
1748 CodeExtractorAnalysisCache CEAC(*OuterFn);
1749 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1750 /* AggregateArgs */ false,
1751 /* BlockFrequencyInfo */ nullptr,
1752 /* BranchProbabilityInfo */ nullptr,
1753 /* AssumptionCache */ nullptr,
1754 /* AllowVarArgs */ true,
1755 /* AllowAlloca */ true,
1756 /* AllocationBlock */ OuterAllocaBlock,
1757 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1758
1759 // Find inputs to, outputs from the code region.
1760 BasicBlock *CommonExit = nullptr;
1761 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1762 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1763
1764 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1765 /*CollectGlobalInputs=*/true);
1766
1767 Inputs.remove_if([&](Value *I) {
1768 if (auto *GV = dyn_cast_if_present<GlobalVariable>(I))
1769 return GV->getValueType() == OpenMPIRBuilder::Ident;
1770
1771 return false;
1772 });
1773
1774 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1775
1776 FunctionCallee TIDRTLFn =
1777 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1778
1779 auto PrivHelper = [&](Value &V) -> Error {
1780 if (&V == TIDAddr || &V == ZeroAddr) {
1781 OI.ExcludeArgsFromAggregate.push_back(&V);
1782 return Error::success();
1783 }
1784
1785 SetVector<Use *> Uses;
1786 for (Use &U : V.uses())
1787 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1788 if (ParallelRegionBlockSet.count(UserI->getParent()))
1789 Uses.insert(&U);
1790
1791 // __kmpc_fork_call expects extra arguments as pointers. If the input
1792 // already has a pointer type, everything is fine. Otherwise, store the
1793 // value onto stack and load it back inside the to-be-outlined region. This
1794 // will ensure only the pointer will be passed to the function.
1795 // FIXME: if there are more than 15 trailing arguments, they must be
1796 // additionally packed in a struct.
1797 Value *Inner = &V;
1798 if (!V.getType()->isPointerTy()) {
1800 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1801
1802 Builder.restoreIP(OuterAllocaIP);
1803 Value *Ptr =
1804 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1805
1806 // Store to stack at end of the block that currently branches to the entry
1807 // block of the to-be-outlined region.
1808 Builder.SetInsertPoint(InsertBB,
1809 InsertBB->getTerminator()->getIterator());
1810 Builder.CreateStore(&V, Ptr);
1811
1812 // Load back next to allocations in the to-be-outlined region.
1813 Builder.restoreIP(InnerAllocaIP);
1814 Inner = Builder.CreateLoad(V.getType(), Ptr);
1815 }
1816
1817 Value *ReplacementValue = nullptr;
1818 CallInst *CI = dyn_cast<CallInst>(&V);
1819 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1820 ReplacementValue = PrivTID;
1821 } else {
1822 InsertPointOrErrorTy AfterIP =
1823 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1824 if (!AfterIP)
1825 return AfterIP.takeError();
1826 Builder.restoreIP(*AfterIP);
1827 InnerAllocaIP = {
1828 InnerAllocaIP.getBlock(),
1829 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1830
1831 assert(ReplacementValue &&
1832 "Expected copy/create callback to set replacement value!");
1833 if (ReplacementValue == &V)
1834 return Error::success();
1835 }
1836
1837 for (Use *UPtr : Uses)
1838 UPtr->set(ReplacementValue);
1839
1840 return Error::success();
1841 };
1842
1843 // Reset the inner alloca insertion as it will be used for loading the values
1844 // wrapped into pointers before passing them into the to-be-outlined region.
1845 // Configure it to insert immediately after the fake use of zero address so
1846 // that they are available in the generated body and so that the
1847 // OpenMP-related values (thread ID and zero address pointers) remain leading
1848 // in the argument list.
1849 InnerAllocaIP = IRBuilder<>::InsertPoint(
1850 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1851
1852 // Reset the outer alloca insertion point to the entry of the relevant block
1853 // in case it was invalidated.
1854 OuterAllocaIP = IRBuilder<>::InsertPoint(
1855 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1856
1857 for (Value *Input : Inputs) {
1858 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1859 if (Error Err = PrivHelper(*Input))
1860 return Err;
1861 }
1862 LLVM_DEBUG({
1863 for (Value *Output : Outputs)
1864 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1865 });
1866 assert(Outputs.empty() &&
1867 "OpenMP outlining should not produce live-out values!");
1868
1869 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1870 LLVM_DEBUG({
1871 for (auto *BB : Blocks)
1872 dbgs() << " PBR: " << BB->getName() << "\n";
1873 });
1874
1875 // Adjust the finalization stack, verify the adjustment, and call the
1876 // finalize function one last time to finalize values between the pre-fini
1877 // block and the exit block if we left the parallel region "the normal way".
1878 auto FiniInfo = FinalizationStack.pop_back_val();
1879 (void)FiniInfo;
1880 assert(FiniInfo.DK == OMPD_parallel &&
1881 "Unexpected finalization stack state!");
1882
1883 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1884
1885 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1886 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1887 if (!FiniBBOrErr)
1888 return FiniBBOrErr.takeError();
1889 {
1890 IRBuilderBase::InsertPointGuard IPG(Builder);
1891 Builder.restoreIP(PreFiniIP);
1892 Builder.CreateBr(*FiniBBOrErr);
1893 // There's currently a branch to omp.par.exit. Delete it. We will get there
1894 // via the fini block
1895 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1896 Term->eraseFromParent();
1897 }
1898
1899 // Register the outlined info.
1900 addOutlineInfo(std::move(OI));
1901
1902 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1903 UI->eraseFromParent();
1904
1905 return AfterIP;
1906}
1907
1908 void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
1909 // Build call void __kmpc_flush(ident_t *loc)
1910 uint32_t SrcLocStrSize;
1911 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1912 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1913
1914 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush),
1915 Args);
1916}
1917
1918 void OpenMPIRBuilder::createFlush(const LocationDescription &Loc) {
1919 if (!updateToLocation(Loc))
1920 return;
1921 emitFlush(Loc);
1922}
1923
1924 void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
1925 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1926 // global_tid);
1927 uint32_t SrcLocStrSize;
1928 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1929 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1930 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1931
1932 // Ignore return result until untied tasks are supported.
1933 createRuntimeFunctionCall(
1934 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1935}
1936
1937 void OpenMPIRBuilder::createTaskwait(const LocationDescription &Loc) {
1938 if (!updateToLocation(Loc))
1939 return;
1940 emitTaskwaitImpl(Loc);
1941 }
1942
1943 void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
1944 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1945 uint32_t SrcLocStrSize;
1946 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1947 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1948 Constant *I32Null = ConstantInt::getNullValue(Int32);
1949 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1950
1951 createRuntimeFunctionCall(
1952 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1953}
1954
1955 void OpenMPIRBuilder::createTaskyield(const LocationDescription &Loc) {
1956 if (!updateToLocation(Loc))
1957 return;
1958 emitTaskyieldImpl(Loc);
1959 }
1960
1961// Processes the dependencies in Dependencies and does the following
1962// - Allocates space on the stack of an array of DependInfo objects
1963// - Populates each DependInfo object with relevant information of
1964// the corresponding dependence.
1965// - All code is inserted in the entry block of the current function.
1967 OpenMPIRBuilder &OMPBuilder,
1968 const SmallVectorImpl<OpenMPIRBuilder::DependData> &Dependencies) {
1969 // Early return if we have no dependencies to process
1970 if (Dependencies.empty())
1971 return nullptr;
1972
1973 // Given a vector of DependData objects, in this function we create an
1974 // array on the stack that holds kmp_depend_info objects corresponding
1975 // to each dependency. This is then passed to the OpenMP runtime.
1976 // For example, if there are 'n' dependencies then the following pseudo
1977 // code is generated. Assume the first dependence is on a variable 'a'.
1978 //
1979 // \code{c}
1980 // DepArray = alloc(n x sizeof(kmp_depend_info));
1981 // idx = 0;
1982 // DepArray[idx].base_addr = ptrtoint(&a);
1983 // DepArray[idx].len = 8;
1984 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
1985 // ++idx;
1986 // DepArray[idx].base_addr = ...;
1987 // \endcode
1988
1989 IRBuilderBase &Builder = OMPBuilder.Builder;
1990 Type *DependInfo = OMPBuilder.DependInfo;
1991 Module &M = OMPBuilder.M;
1992
1993 Value *DepArray = nullptr;
1994 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
1995 Builder.SetInsertPoint(
1996 OldIP.getBlock()->getParent()->getEntryBlock().getTerminator());
1997
1998 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
1999 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2000
2001 Builder.restoreIP(OldIP);
2002
2003 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2004 Value *Base =
2005 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2006 // Store the pointer to the variable
2007 Value *Addr = Builder.CreateStructGEP(
2008 DependInfo, Base,
2009 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2010 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2011 Builder.CreateStore(DepValPtr, Addr);
2012 // Store the size of the variable
2013 Value *Size = Builder.CreateStructGEP(
2014 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
2015 Builder.CreateStore(
2016 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2017 Size);
2018 // Store the dependency kind
2019 Value *Flags = Builder.CreateStructGEP(
2020 DependInfo, Base,
2021 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2022 Builder.CreateStore(
2023 ConstantInt::get(Builder.getInt8Ty(),
2024 static_cast<unsigned int>(Dep.DepKind)),
2025 Flags);
2026 }
2027 return DepArray;
2028}
2029
2030/// Create the task duplication function passed to kmpc_taskloop.
2031Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2032 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2033 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2034 if (!DupCB)
2035 return ConstantPointerNull::get(
2036 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2037
2038 // From the OpenMP runtime's p_task_dup_t:
2039 // Routine optionally generated by the compiler for setting the lastprivate
2040 // flag and calling needed constructors for private/firstprivate objects
2041 // (used to form taskloop tasks from the pattern task). Parameters: dest
2042 // task, src task, lastprivate flag.
2043 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2044
2045 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2046
2047 FunctionType *DupFuncTy = FunctionType::get(
2048 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2049 /*isVarArg=*/false);
2050
2051 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2052 "omp_taskloop_dup", M);
2053 Value *DestTaskArg = DupFunction->getArg(0);
2054 Value *SrcTaskArg = DupFunction->getArg(1);
2055 Value *LastprivateFlagArg = DupFunction->getArg(2);
2056 DestTaskArg->setName("dest_task");
2057 SrcTaskArg->setName("src_task");
2058 LastprivateFlagArg->setName("lastprivate_flag");
2059
2060 IRBuilderBase::InsertPointGuard Guard(Builder);
2061 Builder.SetInsertPoint(
2062 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2063
2064 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2065 Type *TaskWithPrivatesTy =
2066 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2067 Value *TaskPrivates = Builder.CreateGEP(
2068 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2069 Value *ContextPtr = Builder.CreateGEP(
2070 PrivatesTy, TaskPrivates,
2071 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2072 return ContextPtr;
2073 };
2074
2075 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2076 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2077
2078 DestTaskContextPtr->setName("destPtr");
2079 SrcTaskContextPtr->setName("srcPtr");
2080
2081 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2082 DupFunction->getEntryBlock().begin());
2083 InsertPointTy CodeGenIP = Builder.saveIP();
2084 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2085 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2086 if (!AfterIPOrError)
2087 return AfterIPOrError.takeError();
2088 Builder.restoreIP(*AfterIPOrError);
2089
2090 Builder.CreateRetVoid();
2091
2092 return DupFunction;
2093}
2094
2095OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2096 const LocationDescription &Loc, InsertPointTy AllocaIP,
2097 BodyGenCallbackTy BodyGenCB,
2098 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2099 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2100 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2101 Value *Priority, TaskDupCallbackTy DupCB, Value *TaskContextStructPtrVal) {
2102
2103 if (!updateToLocation(Loc))
2104 return InsertPointTy();
2105
2106 uint32_t SrcLocStrSize;
2107 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2108 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2109
2110 BasicBlock *TaskloopExitBB =
2111 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2112 BasicBlock *TaskloopBodyBB =
2113 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2114 BasicBlock *TaskloopAllocaBB =
2115 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2116
2117 InsertPointTy TaskloopAllocaIP =
2118 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2119 InsertPointTy TaskloopBodyIP =
2120 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2121
2122 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
2123 return Err;
2124
2125 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2126 if (!result) {
2127 return result.takeError();
2128 }
2129
2130 llvm::CanonicalLoopInfo *CLI = result.get();
2131 OutlineInfo OI;
2132 OI.EntryBB = TaskloopAllocaBB;
2133 OI.OuterAllocaBB = AllocaIP.getBlock();
2134 OI.ExitBB = TaskloopExitBB;
2135
2136 // Add the thread ID argument.
2137 SmallVector<Instruction *> ToBeDeleted;
2138 // dummy instruction to be used as a fake argument
2139 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2140 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
2141 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2142 TaskloopAllocaIP, "lb", false, true);
2143 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2144 TaskloopAllocaIP, "ub", false, true);
2145 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2146 TaskloopAllocaIP, "step", false, true);
2147 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2148 // aggregate struct
2149 OI.Inputs.insert(FakeLB);
2150 OI.Inputs.insert(FakeUB);
2151 OI.Inputs.insert(FakeStep);
2152 if (TaskContextStructPtrVal)
2153 OI.Inputs.insert(TaskContextStructPtrVal);
2154 assert(
2155 ((TaskContextStructPtrVal && DupCB) ||
2156 (!TaskContextStructPtrVal && !DupCB)) &&
2157 "Task context struct ptr and duplication callback must be both set "
2158 "or both null");
2159
2160 // It isn't safe to run the duplication bodygen callback inside the post
2161 // outlining callback so this has to be run now before we know the real task
2162 // shareds structure type.
2163 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2164 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2165 Type *FakeSharedsTy = StructType::get(
2166 Builder.getContext(),
2167 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2168 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2169 FakeSharedsTy,
2170 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2171 if (!TaskDupFnOrErr) {
2172 return TaskDupFnOrErr.takeError();
2173 }
2174 Value *TaskDupFn = *TaskDupFnOrErr;
2175
2176 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2177 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2178 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2179 FakeStep, Final, Mergeable,
2180 Priority](Function &OutlinedFn) mutable {
2181 // Replace the stale CI with the appropriate RTL function call.
2182 assert(OutlinedFn.hasOneUse() &&
2183 "there must be a single user for the outlined function");
2184 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2185
2186 /* Create the casts for the bounds values that are used, when outlining,
2187 * to replace the uses of the fake values with real ones. */
2188 BasicBlock *CodeReplBB = StaleCI->getParent();
2189 IRBuilderBase::InsertPoint CurrentIp = Builder.saveIP();
2190 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2191 Value *CastedLBVal =
2192 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2193 Value *CastedUBVal =
2194 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2195 Value *CastedStepVal =
2196 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2197 Builder.restoreIP(CurrentIp);
2198
2199 Builder.SetInsertPoint(StaleCI);
2200
2201 // Gather the arguments for emitting the runtime call for
2202 // @__kmpc_omp_task_alloc
2203 Function *TaskAllocFn =
2204 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2205
2206 Value *ThreadID = getOrCreateThreadID(Ident);
2207
2208 if (!NoGroup) {
2209 // Emit runtime call for @__kmpc_taskgroup
2210 Function *TaskgroupFn =
2211 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2212 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2213 }
2214
2215 // `flags` Argument Configuration
2216 // Task is tied if (Flags & 1) == 1.
2217 // Task is untied if (Flags & 1) == 0.
2218 // Task is final if (Flags & 2) == 2.
2219 // Task is not final if (Flags & 2) == 0.
2220 // Task is mergeable if (Flags & 4) == 4.
2221 // Task is not mergeable if (Flags & 4) == 0.
2222 // Task is priority if (Flags & 32) == 32.
2223 // Task is not priority if (Flags & 32) == 0.
2224 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2225 if (Final)
2226 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2227 if (Mergeable)
2228 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2229 if (Priority)
2230 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
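// Worked example of this encoding: a tied, final, mergeable taskloop task
// gets Flags == 1 | 2 | 4 == 7; adding a priority clause also sets bit 5,
// giving 7 | 32 == 39.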
2231
2232 Value *TaskSize = Builder.getInt64(
2233 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2234
2235 AllocaInst *ArgStructAlloca =
2236 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2237 assert(ArgStructAlloca &&
2238 "Unable to find the alloca instruction corresponding to arguments "
2239 "for extracted function");
2240 StructType *ArgStructType =
2241 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2242 assert(ArgStructType && "Unable to find struct type corresponding to "
2243 "arguments for extracted function");
2244 Value *SharedsSize =
2245 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2246
2247 // Emit the @__kmpc_omp_task_alloc runtime call
2248 // The runtime call returns a pointer to an area where the task captured
2249 // variables must be copied before the task is run (TaskData)
2250 CallInst *TaskData = Builder.CreateCall(
2251 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2252 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2253 /*task_func=*/&OutlinedFn});
2254
2255 Value *Shareds = StaleCI->getArgOperand(1);
2256 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2257 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2258 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2259 SharedsSize);
2260 // Get the pointers to the loop lb, ub, and step from the task ptr
2261 // and set up the lower bound, upper bound, and step values.
2262 llvm::Value *Lb = Builder.CreateGEP(
2263 ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2264
2265 llvm::Value *Ub = Builder.CreateGEP(
2266 ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2267
2268 llvm::Value *Step = Builder.CreateGEP(
2269 ArgStructType, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2270 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2271
2272 // Set up the arguments for emitting the kmpc_taskloop runtime call,
2273 // setting values for ifval, nogroup, sched, grainsize, and task_dup.
2274 Value *IfCondVal =
2275 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2276 : Builder.getInt32(1);
2277 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2278 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2279 Value *NoGroupVal = Builder.getInt32(1);
2280 Value *SchedVal = Builder.getInt32(Sched);
2281 Value *GrainSizeVal =
2282 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2283 : Builder.getInt64(0);
2284 Value *TaskDup = TaskDupFn;
2285
2286 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2287 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2288
2289 // taskloop runtime call
2290 Function *TaskloopFn =
2291 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2292 Builder.CreateCall(TaskloopFn, Args);
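// The emitted call therefore has the shape (a sketch mirroring Args above):
//   call void @__kmpc_taskloop(ptr %ident, i32 %tid, ptr %task, i32 %if_val,
//                              ptr %lb, ptr %ub, i64 %st, i32 1 /*nogroup*/,
//                              i32 %sched, i64 %grainsize, ptr %task_dup)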
2293
2294 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2295 // nogroup is not defined
2296 if (!NoGroup) {
2297 Function *EndTaskgroupFn =
2298 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2299 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2300 }
2301
2302 StaleCI->eraseFromParent();
2303
2304 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2305
2306 LoadInst *SharedsOutlined =
2307 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2308 OutlinedFn.getArg(1)->replaceUsesWithIf(
2309 SharedsOutlined,
2310 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2311
2312 Value *IV = CLI->getIndVar();
2313 Type *IVTy = IV->getType();
2314 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2315
2316 // When outlining, CodeExtractor will create GEPs to the LowerBound and
2317 // UpperBound. These GEPs can be reused for loading the task's respective
2318 // bounds.
2319 Value *TaskLB = nullptr;
2320 Value *TaskUB = nullptr;
2321 Value *LoadTaskLB = nullptr;
2322 Value *LoadTaskUB = nullptr;
2323 for (Instruction &I : *TaskloopAllocaBB) {
2324 if (I.getOpcode() == Instruction::GetElementPtr) {
2325 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2326 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2327 switch (CI->getZExtValue()) {
2328 case 0:
2329 TaskLB = &I;
2330 break;
2331 case 1:
2332 TaskUB = &I;
2333 break;
2334 }
2335 }
2336 } else if (I.getOpcode() == Instruction::Load) {
2337 LoadInst &Load = cast<LoadInst>(I);
2338 if (Load.getPointerOperand() == TaskLB) {
2339 assert(TaskLB != nullptr && "Expected value for TaskLB");
2340 LoadTaskLB = &I;
2341 } else if (Load.getPointerOperand() == TaskUB) {
2342 assert(TaskUB != nullptr && "Expected value for TaskUB");
2343 LoadTaskUB = &I;
2344 }
2345 }
2346 }
2347
2348 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2349
2350 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2351 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2352 Value *TripCountMinusOne =
2353 Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
2354 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
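// E.g. for lb = 0, ub = 9, step = 1 this computes
// trip_cnt = (9 - 0) / 1 + 1 = 10 iterations.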
2355 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2356 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2357 // set the trip count in the CLI
2358 CLI->setTripCount(CastedTripCount);
2359
2360 Builder.SetInsertPoint(CLI->getBody(),
2361 CLI->getBody()->getFirstInsertionPt());
2362
2363 // The canonical loop is generated with a fixed lower bound. We need to
2364 // update the index calculation code to use the task's lower bound. The
2365 // generated code looks like this:
2366 // %omp_loop.iv = phi ...
2367 // ...
2368 // %tmp = mul [type] %omp_loop.iv, step
2369 // %user_index = add [type] tmp, lb
2370 // OpenMPIRBuilder constructs canonical loops to have exactly three uses of
2371 // the normalised induction variable:
2372 // 1. This one: converting the normalised IV to the user IV
2373 // 2. The increment (add)
2374 // 3. The comparison against the trip count (icmp)
2375 // (1) is the only use that is a mul followed by an add so this cannot match
2376 // other IR.
2377 assert(CLI->getIndVar()->getNumUses() == 3 &&
2378 "Canonical loop should have exactly three uses of the ind var");
2379 for (User *IVUser : CLI->getIndVar()->users()) {
2380 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2381 if (Mul->getOpcode() == Instruction::Mul) {
2382 for (User *MulUser : Mul->users()) {
2383 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2384 if (Add->getOpcode() == Instruction::Add) {
2385 Add->setOperand(1, CastedTaskLB);
2386 }
2387 }
2388 }
2389 }
2390 }
2391 }
2392
2393 FakeLB->replaceAllUsesWith(CastedLBVal);
2394 FakeUB->replaceAllUsesWith(CastedUBVal);
2395 FakeStep->replaceAllUsesWith(CastedStepVal);
2396 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2397 I->eraseFromParent();
2398 }
2399 };
2400
2401 addOutlineInfo(std::move(OI));
2402 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2403 return Builder.saveIP();
2404}
2405
2406 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTask(
2407 const LocationDescription &Loc, InsertPointTy AllocaIP,
2408 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2409 SmallVector<DependData> Dependencies, bool Mergeable, Value *EventHandle,
2410 Value *Priority) {
2411
2412 if (!updateToLocation(Loc))
2413 return InsertPointTy();
2414
2415 uint32_t SrcLocStrSize;
2416 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2417 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2418 // The current basic block is split into four basic blocks. After outlining,
2419 // they will be mapped as follows:
2420 // ```
2421 // def current_fn() {
2422 // current_basic_block:
2423 // br label %task.exit
2424 // task.exit:
2425 // ; instructions after task
2426 // }
2427 // def outlined_fn() {
2428 // task.alloca:
2429 // br label %task.body
2430 // task.body:
2431 // ret void
2432 // }
2433 // ```
2434 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2435 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2436 BasicBlock *TaskAllocaBB =
2437 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2438
2439 InsertPointTy TaskAllocaIP =
2440 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2441 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2442 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2443 return Err;
2444
2445 OutlineInfo OI;
2446 OI.EntryBB = TaskAllocaBB;
2447 OI.OuterAllocaBB = AllocaIP.getBlock();
2448 OI.ExitBB = TaskExitBB;
2449
2450 // Add the thread ID argument.
2451 SmallVector<Instruction *, 4> ToBeDeleted;
2452 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2453 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2454
2455 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2456 Mergeable, Priority, EventHandle, TaskAllocaBB,
2457 ToBeDeleted](Function &OutlinedFn) mutable {
2458 // Replace the stale CI with the appropriate RTL function call.
2459 assert(OutlinedFn.hasOneUse() &&
2460 "there must be a single user for the outlined function");
2461 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2462
2463 // HasShareds is true if any variables are captured in the outlined region,
2464 // false otherwise.
2465 bool HasShareds = StaleCI->arg_size() > 1;
2466 Builder.SetInsertPoint(StaleCI);
2467
2468 // Gather the arguments for emitting the runtime call for
2469 // @__kmpc_omp_task_alloc
2470 Function *TaskAllocFn =
2471 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2472
2473 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the runtime
2474 // call.
2475 Value *ThreadID = getOrCreateThreadID(Ident);
2476
2477 // Argument - `flags`
2478 // Task is tied iff (Flags & 1) == 1.
2479 // Task is untied iff (Flags & 1) == 0.
2480 // Task is final iff (Flags & 2) == 2.
2481 // Task is not final iff (Flags & 2) == 0.
2482 // Task is mergeable iff (Flags & 4) == 4.
2483 // Task is not mergeable iff (Flags & 4) == 0.
2484 // Task is priority iff (Flags & 32) == 32.
2485 // Task is not priority iff (Flags & 32) == 0.
2486 // TODO: Handle the other flags.
2487 Value *Flags = Builder.getInt32(Tied);
2488 if (Final) {
2489 Value *FinalFlag =
2490 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2491 Flags = Builder.CreateOr(FinalFlag, Flags);
2492 }
2493
2494 if (Mergeable)
2495 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2496 if (Priority)
2497 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
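// E.g. an untied task with final(%c) and mergeable produces
// Flags = (select %c, i32 2, i32 0) | 4, with the tied bit (1) left clear.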
2498
2499 // Argument - `sizeof_kmp_task_t` (TaskSize)
2500 // TaskSize refers to the size in bytes of the kmp_task_t data structure
2501 // including private vars accessed in the task.
2502 // TODO: add kmp_task_t_with_privates (privates)
2503 Value *TaskSize = Builder.getInt64(
2504 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2505
2506 // Argument - `sizeof_shareds` (SharedsSize)
2507 // SharedsSize refers to the shareds array size in the kmp_task_t data
2508 // structure.
2509 Value *SharedsSize = Builder.getInt64(0);
2510 if (HasShareds) {
2511 AllocaInst *ArgStructAlloca =
2512 dyn_cast<AllocaInst>(StaleCI->getArgOperand(1));
2513 assert(ArgStructAlloca &&
2514 "Unable to find the alloca instruction corresponding to arguments "
2515 "for extracted function");
2516 StructType *ArgStructType =
2517 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
2518 assert(ArgStructType && "Unable to find struct type corresponding to "
2519 "arguments for extracted function");
2520 SharedsSize =
2521 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
2522 }
2523 // Emit the @__kmpc_omp_task_alloc runtime call
2524 // The runtime call returns a pointer to an area where the task captured
2525 // variables must be copied before the task is run (TaskData)
2526 CallInst *TaskData = createRuntimeFunctionCall(
2527 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2528 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2529 /*task_func=*/&OutlinedFn});
2530
2531 // Emit detach clause initialization.
2532 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2533 // task_descriptor);
2534 if (EventHandle) {
2535 Function *TaskDetachFn = getOrCreateRuntimeFunctionPtr(
2536 OMPRTL___kmpc_task_allow_completion_event);
2537 llvm::Value *EventVal =
2538 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2539 llvm::Value *EventHandleAddr =
2540 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2541 Builder.getPtrTy(0));
2542 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2543 Builder.CreateStore(EventVal, EventHandleAddr);
2544 }
2545 // Copy the arguments for the outlined function.
2546 if (HasShareds) {
2547 Value *Shareds = StaleCI->getArgOperand(1);
2548 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2549 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2550 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2551 SharedsSize);
2552 }
2553
2554 if (Priority) {
2555 //
2556 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2557 // we populate the priority information into the "kmp_task_t" here
2558 //
2559 // The struct "kmp_task_t" definition is available in kmp.h
2560 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2561 // data2 is used for priority
2562 //
2563 Type *Int32Ty = Builder.getInt32Ty();
2564 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2565 // kmp_task_t* => { ptr }
2566 Type *TaskPtr = StructType::get(VoidPtr);
2567 Value *TaskGEP =
2568 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2569 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2570 Type *TaskStructType = StructType::get(
2571 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2572 Value *PriorityData = Builder.CreateInBoundsGEP(
2573 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2574 // kmp_cmplrdata_t => { ptr, ptr }
2575 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2576 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2577 PriorityData, {Zero, Zero});
2578 Builder.CreateStore(Priority, CmplrData);
2579 }
2580
2581 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2582
2583 // In the presence of the `if` clause, the following IR is generated:
2584 // ...
2585 // %data = call @__kmpc_omp_task_alloc(...)
2586 // br i1 %if_condition, label %then, label %else
2587 // then:
2588 // call @__kmpc_omp_task(...)
2589 // br label %exit
2590 // else:
2591 // ;; Wait for resolution of dependencies, if any, before
2592 // ;; beginning the task
2593 // call @__kmpc_omp_wait_deps(...)
2594 // call @__kmpc_omp_task_begin_if0(...)
2595 // call @outlined_fn(...)
2596 // call @__kmpc_omp_task_complete_if0(...)
2597 // br label %exit
2598 // exit:
2599 // ...
2600 if (IfCondition) {
2601 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2602 // terminator.
2603 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2604 Instruction *IfTerminator =
2605 Builder.GetInsertPoint()->getParent()->getTerminator();
2606 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2607 Builder.SetInsertPoint(IfTerminator);
2608 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2609 &ElseTI);
2610 Builder.SetInsertPoint(ElseTI);
2611
2612 if (Dependencies.size()) {
2613 Function *TaskWaitFn =
2614 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2615 createRuntimeFunctionCall(
2616 TaskWaitFn,
2617 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2618 ConstantInt::get(Builder.getInt32Ty(), 0),
2619 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2620 }
2621 Function *TaskBeginFn =
2622 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2623 Function *TaskCompleteFn =
2624 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2625 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2626 CallInst *CI = nullptr;
2627 if (HasShareds)
2628 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2629 else
2630 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2631 CI->setDebugLoc(StaleCI->getDebugLoc());
2632 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2633 Builder.SetInsertPoint(ThenTI);
2634 }
2635
2636 if (Dependencies.size()) {
2637 Function *TaskFn =
2638 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2639 createRuntimeFunctionCall(
2640 TaskFn,
2641 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2642 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2643 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
2644
2645 } else {
2646 // Emit the @__kmpc_omp_task runtime call to spawn the task
2647 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2648 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2649 }
2650
2651 StaleCI->eraseFromParent();
2652
2653 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2654 if (HasShareds) {
2655 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2656 OutlinedFn.getArg(1)->replaceUsesWithIf(
2657 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2658 }
2659
2660 for (Instruction *I : llvm::reverse(ToBeDeleted))
2661 I->eraseFromParent();
2662 };
2663
2664 addOutlineInfo(std::move(OI));
2665 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2666
2667 return Builder.saveIP();
2668}
2669
2670 OpenMPIRBuilder::InsertPointOrErrorTy
2671 OpenMPIRBuilder::createTaskgroup(const LocationDescription &Loc,
2672 InsertPointTy AllocaIP,
2673 BodyGenCallbackTy BodyGenCB) {
2674 if (!updateToLocation(Loc))
2675 return InsertPointTy();
2676
2677 uint32_t SrcLocStrSize;
2678 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2679 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2680 Value *ThreadID = getOrCreateThreadID(Ident);
2681
2682 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2683 Function *TaskgroupFn =
2684 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2685 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2686
2687 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2688 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2689 return Err;
2690
2691 Builder.SetInsertPoint(TaskgroupExitBB);
2692 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2693 Function *EndTaskgroupFn =
2694 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2695 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2696
2697 return Builder.saveIP();
2698}
2699
2700 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSections(
2701 const LocationDescription &Loc, InsertPointTy AllocaIP,
2702 ArrayRef<StorableBodyGenCallbackTy> SectionCBs, PrivatizeCallbackTy PrivCB,
2703 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2704 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2705
2706 if (!updateToLocation(Loc))
2707 return Loc.IP;
2708
2709 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2710
2711 // Each section is emitted as a switch case.
2712 // Each finalization callback is handled from clang's EmitOMPSectionDirective()
2713 // -> OMP.createSection(), which generates the IR for each section.
2714 // Iterate through all sections and emit a switch construct:
2715 // switch (IV) {
2716 // case 0:
2717 // <SectionStmt[0]>;
2718 // break;
2719 // ...
2720 // case <NumSection> - 1:
2721 // <SectionStmt[<NumSection> - 1]>;
2722 // break;
2723 // }
2724 // ...
2725 // section_loop.after:
2726 // <FiniCB>;
2727 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2728 Builder.restoreIP(CodeGenIP);
2729 BasicBlock *Continue =
2730 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2731 Function *CurFn = Continue->getParent();
2732 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2733
2734 unsigned CaseNumber = 0;
2735 for (auto SectionCB : SectionCBs) {
2736 BasicBlock *CaseBB = BasicBlock::Create(
2737 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2738 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2739 Builder.SetInsertPoint(CaseBB);
2740 BranchInst *CaseEndBr = Builder.CreateBr(Continue);
2741 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2742 CaseEndBr->getIterator()}))
2743 return Err;
2744 CaseNumber++;
2745 }
2746 // Remove the existing terminator from the body BB since there can be no
2747 // terminators after a switch/case.
2748 return Error::success();
2749 };
2750 // Loop body ends here.
2751 // LowerBound, UpperBound, and Stride for createCanonicalLoop.
2752 Type *I32Ty = Type::getInt32Ty(M.getContext());
2753 Value *LB = ConstantInt::get(I32Ty, 0);
2754 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2755 Value *ST = ConstantInt::get(I32Ty, 1);
2756 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
2757 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2758 if (!LoopInfo)
2759 return LoopInfo.takeError();
2760
2761 InsertPointOrErrorTy WsloopIP =
2762 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2763 WorksharingLoopType::ForStaticLoop, !IsNowait);
2764 if (!WsloopIP)
2765 return WsloopIP.takeError();
2766 InsertPointTy AfterIP = *WsloopIP;
2767
2768 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2769 assert(LoopFini && "Bad structure of static workshare loop finalization");
2770
2771 // Apply the finalization callback in LoopAfterBB
2772 auto FiniInfo = FinalizationStack.pop_back_val();
2773 assert(FiniInfo.DK == OMPD_sections &&
2774 "Unexpected finalization stack state!");
2775 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2776 return Err;
2777
2778 return AfterIP;
2779}
2780
2781 OpenMPIRBuilder::InsertPointOrErrorTy
2782 OpenMPIRBuilder::createSection(const LocationDescription &Loc,
2783 BodyGenCallbackTy BodyGenCB,
2784 FinalizeCallbackTy FiniCB) {
2785 if (!updateToLocation(Loc))
2786 return Loc.IP;
2787
2788 auto FiniCBWrapper = [&](InsertPointTy IP) {
2789 if (IP.getBlock()->end() != IP.getPoint())
2790 return FiniCB(IP);
2791 // This must be done; otherwise, any nested constructs using FinalizeOMPRegion
2792 // will fail because that function requires the finalization basic block to
2793 // have a terminator, which is already removed by EmitOMPRegionBody.
2794 // IP is currently at the cancellation block.
2795 // We need to backtrack to the condition block to fetch
2796 // the exit block and create a branch from the cancellation
2797 // block to the exit block.
2798 IRBuilder<>::InsertPointGuard IPG(Builder);
2799 Builder.restoreIP(IP);
2800 auto *CaseBB = Loc.IP.getBlock();
2801 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2802 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2803 Instruction *I = Builder.CreateBr(ExitBB);
2804 IP = InsertPointTy(I->getParent(), I->getIterator());
2805 return FiniCB(IP);
2806 };
2807
2808 Directive OMPD = Directive::OMPD_sections;
2809 // Since we are using Finalization Callback here, HasFinalize
2810 // and IsCancellable have to be true
2811 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2812 /*Conditional*/ false, /*hasFinalize*/ true,
2813 /*IsCancellable*/ true);
2814}
2815
2821
2822Value *OpenMPIRBuilder::getGPUThreadID() {
2823 return createRuntimeFunctionCall(
2824 getOrCreateRuntimeFunction(M,
2825 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2826 {});
2827}
2828
2829Value *OpenMPIRBuilder::getGPUWarpSize() {
2830 return createRuntimeFunctionCall(
2831 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2832}
2833
2834Value *OpenMPIRBuilder::getNVPTXWarpID() {
2835 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2836 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2837}
2838
2839Value *OpenMPIRBuilder::getNVPTXLaneID() {
2840 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2841 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2842 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
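// E.g. with the common warp size of 32, LaneIDBits == 5 and
// LaneIDMask == ~0u >> 27 == 0x1f, selecting the low five bits.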
2843 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2844 "nvptx_lane_id");
2845}
2846
2847Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2848 Type *ToType) {
2849 Type *FromType = From->getType();
2850 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2851 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2852 assert(FromSize > 0 && "From size must be greater than zero");
2853 assert(ToSize > 0 && "To size must be greater than zero");
2854 if (FromType == ToType)
2855 return From;
2856 if (FromSize == ToSize)
2857 return Builder.CreateBitCast(From, ToType);
2858 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2859 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2860 InsertPointTy SaveIP = Builder.saveIP();
2861 Builder.restoreIP(AllocaIP);
2862 Value *CastItem = Builder.CreateAlloca(ToType);
2863 Builder.restoreIP(SaveIP);
2864
2865 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2866 CastItem, Builder.getPtrTy(0));
2867 Builder.CreateStore(From, ValCastItem);
2868 return Builder.CreateLoad(ToType, CastItem);
2869}
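// A sketch of the three paths above: a double becomes an i64 via bitcast
// (equal store sizes), an i16 widens to an i64 via a signed integer cast,
// and any other size mismatch round-trips through a stack slot.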
2870
2871Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2872 Value *Element,
2873 Type *ElementType,
2874 Value *Offset) {
2875 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2876 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2877
2878 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2879 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2880 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2881 Value *WarpSize =
2882 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2883 Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(
2884 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2885 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2886 Value *WarpSizeCast =
2887 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2888 Value *ShuffleCall =
2889 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2890 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2891}
2892
2893void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2894 Value *DstAddr, Type *ElemType,
2895 Value *Offset, Type *ReductionArrayTy,
2896 bool IsByRefElem) {
2897 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2898 // Create the loop over the big sized data.
2899 // ptr = (void*)Elem;
2900 // ptrEnd = (void*) Elem + 1;
2901 // Step = 8;
2902 // while (ptr + Step < ptrEnd)
2903 // shuffle((int64_t)*ptr);
2904 // Step = 4;
2905 // while (ptr + Step < ptrEnd)
2906 // shuffle((int32_t)*ptr);
2907 // ...
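// For example, a 12-byte element is transferred as one 64-bit shuffle
// followed by one 32-bit shuffle (Size: 12 % 8 == 4, then 4 % 4 == 0).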
2908 Type *IndexTy = Builder.getIndexTy(
2909 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2910 Value *ElemPtr = DstAddr;
2911 Value *Ptr = SrcAddr;
2912 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2913 if (Size < IntSize)
2914 continue;
2915 Type *IntType = Builder.getIntNTy(IntSize * 8);
2916 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2917 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2918 Value *SrcAddrGEP =
2919 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2920 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2921 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2922
2923 Function *CurFunc = Builder.GetInsertBlock()->getParent();
2924 if ((Size / IntSize) > 1) {
2925 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2926 SrcAddrGEP, Builder.getPtrTy());
2927 BasicBlock *PreCondBB =
2928 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2929 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2930 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2931 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2932 emitBlock(PreCondBB, CurFunc);
2933 PHINode *PhiSrc =
2934 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2935 PhiSrc->addIncoming(Ptr, CurrentBB);
2936 PHINode *PhiDest =
2937 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
2938 PhiDest->addIncoming(ElemPtr, CurrentBB);
2939 Ptr = PhiSrc;
2940 ElemPtr = PhiDest;
2941 Value *PtrDiff = Builder.CreatePtrDiff(
2942 Builder.getInt8Ty(), PtrEnd,
2943 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
2944 Builder.CreateCondBr(
2945 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
2946 ExitBB);
2947 emitBlock(ThenBB, CurFunc);
2948 Value *Res = createRuntimeShuffleFunction(
2949 AllocaIP,
2950 Builder.CreateAlignedLoad(
2951 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
2952 IntType, Offset);
2953 Builder.CreateAlignedStore(Res, ElemPtr,
2954 M.getDataLayout().getPrefTypeAlign(ElemType));
2955 Value *LocalPtr =
2956 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2957 Value *LocalElemPtr =
2958 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2959 PhiSrc->addIncoming(LocalPtr, ThenBB);
2960 PhiDest->addIncoming(LocalElemPtr, ThenBB);
2961 emitBranch(PreCondBB);
2962 emitBlock(ExitBB, CurFunc);
2963 } else {
2964 Value *Res = createRuntimeShuffleFunction(
2965 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
2966 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
2967 Res->getType()->getScalarSizeInBits())
2968 Res = Builder.CreateTrunc(Res, ElemType);
2969 Builder.CreateStore(Res, ElemPtr);
2970 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
2971 ElemPtr =
2972 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
2973 }
2974 Size = Size % IntSize;
2975 }
2976}
2977
2978Error OpenMPIRBuilder::emitReductionListCopy(
2979 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
2980 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
2981 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
2982 Type *IndexTy = Builder.getIndexTy(
2983 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2984 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2985
2986 // Iterate, element-by-element, through the source Reduce list and
2987 // make a copy.
2988 for (auto En : enumerate(ReductionInfos)) {
2989 const ReductionInfo &RI = En.value();
2990 Value *SrcElementAddr = nullptr;
2991 AllocaInst *DestAlloca = nullptr;
2992 Value *DestElementAddr = nullptr;
2993 Value *DestElementPtrAddr = nullptr;
2994 // Should we shuffle in an element from a remote lane?
2995 bool ShuffleInElement = false;
2996 // Set to true to update the pointer in the dest Reduce list to a
2997 // newly created element.
2998 bool UpdateDestListPtr = false;
2999
3000 // Step 1.1: Get the address for the src element in the Reduce list.
3001 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3002 ReductionArrayTy, SrcBase,
3003 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3004 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3005
3006 // Step 1.2: Create a temporary to store the element in the destination
3007 // Reduce list.
3008 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3009 ReductionArrayTy, DestBase,
3010 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3011 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3012 switch (Action) {
3013 case CopyAction::RemoteLaneToThread: {
3014 InsertPointTy CurIP = Builder.saveIP();
3015 Builder.restoreIP(AllocaIP);
3016
3017 Type *DestAllocaType =
3018 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3019 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3020 ".omp.reduction.element");
3021 DestAlloca->setAlignment(
3022 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3023 DestElementAddr = DestAlloca;
3024 DestElementAddr =
3025 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3026 DestElementAddr->getName() + ".ascast");
3027 Builder.restoreIP(CurIP);
3028 ShuffleInElement = true;
3029 UpdateDestListPtr = true;
3030 break;
3031 }
3032 case CopyAction::ThreadCopy: {
3033 DestElementAddr =
3034 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3035 break;
3036 }
3037 }
3038
3039 // Now that all active lanes have read the element in the
3040 // Reduce list, shuffle over the value from the remote lane.
3041 if (ShuffleInElement) {
3042 Type *ShuffleType = RI.ElementType;
3043 Value *ShuffleSrcAddr = SrcElementAddr;
3044 Value *ShuffleDestAddr = DestElementAddr;
3045 AllocaInst *LocalStorage = nullptr;
3046
3047 if (IsByRefElem) {
3048 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3049 assert(RI.ByRefAllocatedType &&
3050 "Expected by-ref allocated type to be set");
3051 // For by-ref reductions, we need to copy from the remote lane the
3052 // actual value of the partial reduction computed by that remote lane;
3053 // rather than, for example, a pointer to that data or, even worse, a
3054 // pointer to the descriptor of the by-ref reduction element.
3055 ShuffleType = RI.ByRefElementType;
3056
3057 InsertPointOrErrorTy GenResult =
3058 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3059
3060 if (!GenResult)
3061 return GenResult.takeError();
3062
3063 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3064
3065 {
3066 InsertPointTy OldIP = Builder.saveIP();
3067 Builder.restoreIP(AllocaIP);
3068
3069 LocalStorage = Builder.CreateAlloca(ShuffleType);
3070 Builder.restoreIP(OldIP);
3071 ShuffleDestAddr = LocalStorage;
3072 }
3073 }
3074
3075 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3076 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3077
3078 if (IsByRefElem) {
3079 Value *GEP;
3080 InsertPointOrErrorTy GenResult =
3081 RI.DataPtrPtrGen(Builder.saveIP(),
3082 Builder.CreatePointerBitCastOrAddrSpaceCast(
3083 DestAlloca, Builder.getPtrTy(), ".ascast"),
3084 GEP);
3085
3086 if (!GenResult)
3087 return GenResult.takeError();
3088
3089 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3090 LocalStorage, Builder.getPtrTy(), ".ascast"),
3091 GEP);
3092 }
3093 } else {
3094 switch (RI.EvaluationKind) {
3095 case EvalKind::Scalar: {
3096 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3097 // Store the source element value to the dest element address.
3098 Builder.CreateStore(Elem, DestElementAddr);
3099 break;
3100 }
3101 case EvalKind::Complex: {
3102 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3103 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3104 Value *SrcReal = Builder.CreateLoad(
3105 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3106 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3107 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3108 Value *SrcImg = Builder.CreateLoad(
3109 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3110
3111 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3112 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3113 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3114 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3115 Builder.CreateStore(SrcReal, DestRealPtr);
3116 Builder.CreateStore(SrcImg, DestImgPtr);
3117 break;
3118 }
3119 case EvalKind::Aggregate: {
3120 Value *SizeVal = Builder.getInt64(
3121 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3122 Builder.CreateMemCpy(
3123 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3124 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3125 SizeVal, false);
3126 break;
3127 }
3128 };
3129 }
3130
3131 // Step 3.1: Modify reference in dest Reduce list as needed.
3132 // Modify the reference in the Reduce list to point to the newly
3133 // created element. The element is live in the current function
3134 // scope and that of functions it invokes (i.e., reduce_function).
3135 // RemoteReduceData[i] = (void*)&RemoteElem
3136 if (UpdateDestListPtr) {
3137 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3138 DestElementAddr, Builder.getPtrTy(),
3139 DestElementAddr->getName() + ".ascast");
3140 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3141 }
3142 }
3143
3144 return Error::success();
3145}
3146
3147Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3148 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3149 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3150 InsertPointTy SavedIP = Builder.saveIP();
3151 LLVMContext &Ctx = M.getContext();
3152 FunctionType *FuncTy = FunctionType::get(
3153 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3154 /* IsVarArg */ false);
3155 Function *WcFunc =
3156 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3157 "_omp_reduction_inter_warp_copy_func", &M);
3158 WcFunc->setAttributes(FuncAttrs);
3159 WcFunc->addParamAttr(0, Attribute::NoUndef);
3160 WcFunc->addParamAttr(1, Attribute::NoUndef);
3161 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3162 Builder.SetInsertPoint(EntryBB);
3163
3164 // ReduceList: thread local Reduce list.
3165 // At the stage of the computation when this function is called, partially
3166 // aggregated values reside in the first lane of every active warp.
3167 Argument *ReduceListArg = WcFunc->getArg(0);
3168 // NumWarps: number of warps active in the parallel region. This could
3169 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3170 Argument *NumWarpsArg = WcFunc->getArg(1);
3171
3172 // This array is used as a medium to transfer, one reduce element at a time,
3173 // the data from the first lane of every warp to lanes in the first warp
3174 // in order to perform the final step of a reduction in a parallel region
3175 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3176 // for reduced latency, as well as to have a distinct copy for concurrently
3177 // executing target regions. The array is declared with weak linkage so
3178 // as to be shared across compilation units.
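// As a rough sketch (identifiers illustrative, not the emitted symbols),
// each chunk's round trip below behaves like this CUDA-style pseudocode:
//
//   __shared__ uint32_t medium[warp_size];
//   barrier();
//   if (lane_id == 0)               // warp master publishes its chunk
//     medium[warp_id] = *elem_ptr;
//   barrier();
//   if (thread_id < num_warps)      // first warp gathers every chunk
//     *elem_ptr = medium[thread_id];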
3179 StringRef TransferMediumName =
3180 "__openmp_nvptx_data_transfer_temporary_storage";
3181 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3182 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3183 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3184 if (!TransferMedium) {
3185 TransferMedium = new GlobalVariable(
3186 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3187 UndefValue::get(ArrayTy), TransferMediumName,
3188 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3189 /*AddressSpace=*/3);
3190 }
3191
3192 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3193 Value *GPUThreadID = getGPUThreadID();
3194 // nvptx_lane_id = nvptx_id % warpsize
3195 Value *LaneID = getNVPTXLaneID();
3196 // nvptx_warp_id = nvptx_id / warpsize
3197 Value *WarpID = getNVPTXWarpID();
3198
3199 InsertPointTy AllocaIP =
3200 InsertPointTy(Builder.GetInsertBlock(),
3201 Builder.GetInsertBlock()->getFirstInsertionPt());
3202 Type *Arg0Type = ReduceListArg->getType();
3203 Type *Arg1Type = NumWarpsArg->getType();
3204 Builder.restoreIP(AllocaIP);
3205 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3206 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3207 AllocaInst *NumWarpsAlloca =
3208 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3209 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3210 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3211 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3212 NumWarpsAlloca, Builder.getPtrTy(0),
3213 NumWarpsAlloca->getName() + ".ascast");
3214 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3215 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3216 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3217 InsertPointTy CodeGenIP =
3218 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3219 Builder.restoreIP(CodeGenIP);
3220
3221 Value *ReduceList =
3222 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3223
3224 for (auto En : enumerate(ReductionInfos)) {
3225 //
3226 // Warp master copies reduce element to transfer medium in __shared__
3227 // memory.
3228 //
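// The medium is an i32 array, so each element travels in power-of-two
// chunks of at most 4 bytes (the TySize loop below). For example, a 7-byte
// payload moves as one 4-byte chunk, then one 2-byte chunk, then one 1-byte
// chunk, with RealTySize shrinking after each pass.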
3229 const ReductionInfo &RI = En.value();
3230 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3231 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3232 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3233 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3234 Type *CType = Builder.getIntNTy(TySize * 8);
3235
3236 unsigned NumIters = RealTySize / TySize;
3237 if (NumIters == 0)
3238 continue;
3239 Value *Cnt = nullptr;
3240 Value *CntAddr = nullptr;
3241 BasicBlock *PrecondBB = nullptr;
3242 BasicBlock *ExitBB = nullptr;
3243 if (NumIters > 1) {
3244 CodeGenIP = Builder.saveIP();
3245 Builder.restoreIP(AllocaIP);
3246 CntAddr =
3247 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3248
3249 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3250 CntAddr->getName() + ".ascast");
3251 Builder.restoreIP(CodeGenIP);
3252 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3253 CntAddr,
3254 /*Volatile=*/false);
3255 PrecondBB = BasicBlock::Create(Ctx, "precond");
3256 ExitBB = BasicBlock::Create(Ctx, "exit");
3257 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3258 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3259 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3260 /*Volatile=*/false);
3261 Value *Cmp = Builder.CreateICmpULT(
3262 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3263 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3264 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3265 }
3266
3267 // kmpc_barrier.
3268 InsertPointOrErrorTy BarrierIP1 =
3269 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3270 omp::Directive::OMPD_unknown,
3271 /* ForceSimpleCall */ false,
3272 /* CheckCancelFlag */ true);
3273 if (!BarrierIP1)
3274 return BarrierIP1.takeError();
3275 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3276 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3277 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3278
3279 // if (lane_id == 0)
3280 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3281 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3282 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3283
3284 // Reduce element = LocalReduceList[i]
3285 auto *RedListArrayTy =
3286 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3287 Type *IndexTy = Builder.getIndexTy(
3288 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3289 Value *ElemPtrPtr =
3290 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3291 {ConstantInt::get(IndexTy, 0),
3292 ConstantInt::get(IndexTy, En.index())});
3293 // elemptr = ((CopyType*)(elemptrptr)) + I
3294 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3295
3296 if (IsByRefElem) {
3297 InsertPointOrErrorTy GenRes =
3298 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3299
3300 if (!GenRes)
3301 return GenRes.takeError();
3302
3303 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3304 }
3305
3306 if (NumIters > 1)
3307 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3308
3309 // Get pointer to location in transfer medium.
3310 // MediumPtr = &medium[warp_id]
3311 Value *MediumPtr = Builder.CreateInBoundsGEP(
3312 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3313 // elem = *elemptr
3314 // *MediumPtr = elem
3315 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3316 // Store the source element value to the dest element address.
3317 Builder.CreateStore(Elem, MediumPtr,
3318 /*IsVolatile*/ true);
3319 Builder.CreateBr(MergeBB);
3320
3321 // else
3322 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3323 Builder.CreateBr(MergeBB);
3324
3325 // endif
3326 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3327 InsertPointOrErrorTy BarrierIP2 =
3328 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3329 omp::Directive::OMPD_unknown,
3330 /* ForceSimpleCall */ false,
3331 /* CheckCancelFlag */ true);
3332 if (!BarrierIP2)
3333 return BarrierIP2.takeError();
3334
3335 // Warp 0 copies reduce element from transfer medium
3336 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3337 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3338 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3339
3340 Value *NumWarpsVal =
3341 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3342 // Up to 32 threads in warp 0 are active.
3343 Value *IsActiveThread =
3344 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3345 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3346
3347 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3348
3349 // SrcMediumPtr = &medium[tid]
3350 // SrcMediumVal = *SrcMediumPtr
3351 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3352 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3353 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3354 Value *TargetElemPtrPtr =
3355 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3356 {ConstantInt::get(IndexTy, 0),
3357 ConstantInt::get(IndexTy, En.index())});
3358 Value *TargetElemPtrVal =
3359 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3360 Value *TargetElemPtr = TargetElemPtrVal;
3361
3362 if (IsByRefElem) {
3363 InsertPointOrErrorTy GenRes =
3364 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3365
3366 if (!GenRes)
3367 return GenRes.takeError();
3368
3369 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3370 }
3371
3372 if (NumIters > 1)
3373 TargetElemPtr =
3374 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3375
3376 // *TargetElemPtr = SrcMediumVal;
3377 Value *SrcMediumValue =
3378 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3379 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3380 Builder.CreateBr(W0MergeBB);
3381
3382 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3383 Builder.CreateBr(W0MergeBB);
3384
3385 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3386
3387 if (NumIters > 1) {
3388 Cnt = Builder.CreateNSWAdd(
3389 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3390 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3391
3392 auto *CurFn = Builder.GetInsertBlock()->getParent();
3393 emitBranch(PrecondBB);
3394 emitBlock(ExitBB, CurFn);
3395 }
3396 RealTySize %= TySize;
3397 }
3398 }
3399
3400 Builder.CreateRetVoid();
3401 Builder.restoreIP(SavedIP);
3402
3403 return WcFunc;
3404}
3405
3406Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3407 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3408 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3409 LLVMContext &Ctx = M.getContext();
3410 FunctionType *FuncTy =
3411 FunctionType::get(Builder.getVoidTy(),
3412 {Builder.getPtrTy(), Builder.getInt16Ty(),
3413 Builder.getInt16Ty(), Builder.getInt16Ty()},
3414 /* IsVarArg */ false);
3415 Function *SarFunc =
3416 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3417 "_omp_reduction_shuffle_and_reduce_func", &M);
3418 SarFunc->setAttributes(FuncAttrs);
3419 SarFunc->addParamAttr(0, Attribute::NoUndef);
3420 SarFunc->addParamAttr(1, Attribute::NoUndef);
3421 SarFunc->addParamAttr(2, Attribute::NoUndef);
3422 SarFunc->addParamAttr(3, Attribute::NoUndef);
3423 SarFunc->addParamAttr(1, Attribute::SExt);
3424 SarFunc->addParamAttr(2, Attribute::SExt);
3425 SarFunc->addParamAttr(3, Attribute::SExt);
3426 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3427 Builder.SetInsertPoint(EntryBB);
3428
3429 // Thread local Reduce list used to host the values of data to be reduced.
3430 Argument *ReduceListArg = SarFunc->getArg(0);
3431 // Current lane id; could be logical.
3432 Argument *LaneIDArg = SarFunc->getArg(1);
3433 // Offset of the remote source lane relative to the current lane.
3434 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3435 // Algorithm version. This is expected to be known at compile time.
3436 Argument *AlgoVerArg = SarFunc->getArg(3);
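// Rough shape of the body emitted below, in pseudo-C (the helper names here
// are illustrative only):
//
//   void sar(void **ReduceList, i16 LaneId, i16 Offset, i16 AlgoVer) {
//     void *RemoteList[<n>];
//     <copy element-wise from lane (LaneId + Offset) into RemoteList>;
//     if (<reduce condition on AlgoVer, LaneId, Offset>)
//       reduce_function(ReduceList, RemoteList);
//     if (AlgoVer == 1 && LaneId >= Offset)
//       <copy RemoteList over ReduceList>;  // adopt the remote partial
//   }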
3437
3438 Type *ReduceListArgType = ReduceListArg->getType();
3439 Type *LaneIDArgType = LaneIDArg->getType();
3440 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3441 Value *ReduceListAlloca = Builder.CreateAlloca(
3442 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3443 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3444 LaneIDArg->getName() + ".addr");
3445 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3446 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3447 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3448 AlgoVerArg->getName() + ".addr");
3449 ArrayType *RedListArrayTy =
3450 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3451
3452 // Create a local thread-private variable to host the Reduce list
3453 // from a remote lane.
3454 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3455 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3456
3457 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3458 ReduceListAlloca, ReduceListArgType,
3459 ReduceListAlloca->getName() + ".ascast");
3460 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3461 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3462 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3463 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3464 RemoteLaneOffsetAlloca->getName() + ".ascast");
3465 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3466 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3467 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3468 RemoteReductionListAlloca, Builder.getPtrTy(),
3469 RemoteReductionListAlloca->getName() + ".ascast");
3470
3471 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3472 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3473 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3474 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3475
3476 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3477 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3478 Value *RemoteLaneOffset =
3479 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3480 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3481
3482 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3483
3484 // This loop iterates through the list of reduce elements and copies,
3485 // element by element, from a remote lane in the warp to RemoteReduceList,
3486 // hosted on the thread's stack.
3487 Error EmitRedLsCpRes = emitReductionListCopy(
3488 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3489 ReduceList, RemoteListAddrCast, IsByRef,
3490 {RemoteLaneOffset, nullptr, nullptr});
3491
3492 if (EmitRedLsCpRes)
3493 return EmitRedLsCpRes;
3494
3495 // The actions to be performed on the Remote Reduce list are dependent
3496 // on the algorithm version.
3497 //
3498 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3499 // LaneId % 2 == 0 && Offset > 0):
3500 // do the reduction value aggregation
3501 //
3502 // The thread local variable Reduce list is mutated in place to host the
3503 // reduced data, which is the aggregated value produced from local and
3504 // remote lanes.
3505 //
3506 // Note that AlgoVer is expected to be a constant integer known at compile
3507 // time.
3508 // When AlgoVer==0, the first conjunction evaluates to true, making
3509 // the entire predicate true at compile time.
3510 // When AlgoVer==1, only the second part of the second conjunction needs
3511 // to be evaluated at runtime; the other conjunctions evaluate to false
3512 // at compile time.
3513 // When AlgoVer==2, only the second part of the third conjunction needs
3514 // to be evaluated at runtime; the other conjunctions evaluate to false
3515 // at compile time.
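// As a minimal sketch, the guard built below is equivalent to this C
// expression (same operand names as the IR values):
//
//   AlgoVer == 0 || (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
//       (AlgoVer == 2 && (LaneId & 1) == 0 && RemoteLaneOffset > 0)
//
// so constant folding on AlgoVer leaves only the runtime comparisons of the
// matching arm.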
3516 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3517 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3518 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3519 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3520 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3521 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3522 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3523 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3524 Value *RemoteOffsetComp =
3525 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3526 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3527 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3528 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3529
3530 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3531 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3532 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3533
3534 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3535 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3536 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3537 ReduceList, Builder.getPtrTy());
3538 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3539 RemoteListAddrCast, Builder.getPtrTy());
3540 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3541 ->addFnAttr(Attribute::NoUnwind);
3542 Builder.CreateBr(MergeBB);
3543
3544 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3545 Builder.CreateBr(MergeBB);
3546
3547 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3548
3549 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3550 // Reduce list.
3551 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3552 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3553 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3554
3555 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3556 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3557 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3558 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3559
3560 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3561
3562 EmitRedLsCpRes = emitReductionListCopy(
3563 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3564 RemoteListAddrCast, ReduceList, IsByRef);
3565
3566 if (EmitRedLsCpRes)
3567 return EmitRedLsCpRes;
3568
3569 Builder.CreateBr(CpyMergeBB);
3570
3571 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3572 Builder.CreateBr(CpyMergeBB);
3573
3574 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3575
3576 Builder.CreateRetVoid();
3577
3578 return SarFunc;
3579}
3580
3581Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3582 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3583 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3584 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3585 LLVMContext &Ctx = M.getContext();
3586 FunctionType *FuncTy = FunctionType::get(
3587 Builder.getVoidTy(),
3588 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3589 /* IsVarArg */ false);
3590 Function *LtGCFunc =
3591 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3592 "_omp_reduction_list_to_global_copy_func", &M);
3593 LtGCFunc->setAttributes(FuncAttrs);
3594 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3595 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3596 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3597
3598 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3599 Builder.SetInsertPoint(EntryBlock);
3600
3601 // Buffer: global reduction buffer.
3602 Argument *BufferArg = LtGCFunc->getArg(0);
3603 // Idx: index of the buffer.
3604 Argument *IdxArg = LtGCFunc->getArg(1);
3605 // ReduceList: thread local Reduce list.
3606 Argument *ReduceListArg = LtGCFunc->getArg(2);
3607
3608 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3609 BufferArg->getName() + ".addr");
3610 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3611 IdxArg->getName() + ".addr");
3612 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3613 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3614 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3615 BufferArgAlloca, Builder.getPtrTy(),
3616 BufferArgAlloca->getName() + ".ascast");
3617 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3618 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3619 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3620 ReduceListArgAlloca, Builder.getPtrTy(),
3621 ReduceListArgAlloca->getName() + ".ascast");
3622
3623 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3624 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3625 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3626
3627 Value *LocalReduceList =
3628 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3629 Value *BufferArgVal =
3630 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3631 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3632 Type *IndexTy = Builder.getIndexTy(
3633 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3634 for (auto En : enumerate(ReductionInfos)) {
3635 const ReductionInfo &RI = En.value();
3636 auto *RedListArrayTy =
3637 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3638 // Reduce element = LocalReduceList[i]
3639 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3640 RedListArrayTy, LocalReduceList,
3641 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3642 // elemptr = ((CopyType*)(elemptrptr)) + I
3643 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3644
3645 // Global = Buffer.VD[Idx];
3646 Value *BufferVD =
3647 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3648 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3649 ReductionsBufferTy, BufferVD, 0, En.index());
3650
3651 switch (RI.EvaluationKind) {
3652 case EvalKind::Scalar: {
3653 Value *TargetElement;
3654
3655 if (IsByRef.empty() || !IsByRef[En.index()]) {
3656 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3657 } else {
3658 InsertPointOrErrorTy GenResult =
3659 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3660
3661 if (!GenResult)
3662 return GenResult.takeError();
3663
3664 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3665 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3666 }
3667
3668 Builder.CreateStore(TargetElement, GlobVal);
3669 break;
3670 }
3671 case EvalKind::Complex: {
3672 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3673 RI.ElementType, ElemPtr, 0, 0, ".realp");
3674 Value *SrcReal = Builder.CreateLoad(
3675 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3676 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3677 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3678 Value *SrcImg = Builder.CreateLoad(
3679 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3680
3681 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3682 RI.ElementType, GlobVal, 0, 0, ".realp");
3683 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3684 RI.ElementType, GlobVal, 0, 1, ".imagp");
3685 Builder.CreateStore(SrcReal, DestRealPtr);
3686 Builder.CreateStore(SrcImg, DestImgPtr);
3687 break;
3688 }
3689 case EvalKind::Aggregate: {
3690 Value *SizeVal =
3691 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3692 Builder.CreateMemCpy(
3693 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3694 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3695 break;
3696 }
3697 }
3698 }
3699
3700 Builder.CreateRetVoid();
3701 Builder.restoreIP(OldIP);
3702 return LtGCFunc;
3703}
3704
3705Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3706 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3707 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3708 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3709 LLVMContext &Ctx = M.getContext();
3710 FunctionType *FuncTy = FunctionType::get(
3711 Builder.getVoidTy(),
3712 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3713 /* IsVarArg */ false);
3714 Function *LtGRFunc =
3715 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3716 "_omp_reduction_list_to_global_reduce_func", &M);
3717 LtGRFunc->setAttributes(FuncAttrs);
3718 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3719 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3720 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3721
3722 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3723 Builder.SetInsertPoint(EntryBlock);
3724
3725 // Buffer: global reduction buffer.
3726 Argument *BufferArg = LtGRFunc->getArg(0);
3727 // Idx: index of the buffer.
3728 Argument *IdxArg = LtGRFunc->getArg(1);
3729 // ReduceList: thread local Reduce list.
3730 Argument *ReduceListArg = LtGRFunc->getArg(2);
3731
3732 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3733 BufferArg->getName() + ".addr");
3734 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3735 IdxArg->getName() + ".addr");
3736 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3737 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3738 auto *RedListArrayTy =
3739 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3740
3741 // 1. Build a list of reduction variables.
3742 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3743 Value *LocalReduceList =
3744 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3745
3746 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3747
3748 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3749 BufferArgAlloca, Builder.getPtrTy(),
3750 BufferArgAlloca->getName() + ".ascast");
3751 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3752 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3753 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3754 ReduceListArgAlloca, Builder.getPtrTy(),
3755 ReduceListArgAlloca->getName() + ".ascast");
3756 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3757 LocalReduceList, Builder.getPtrTy(),
3758 LocalReduceList->getName() + ".ascast");
3759
3760 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3761 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3762 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3763
3764 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3765 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3766 Type *IndexTy = Builder.getIndexTy(
3767 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3768 for (auto En : enumerate(ReductionInfos)) {
3769 const ReductionInfo &RI = En.value();
3770 Value *ByRefAlloc;
3771
3772 if (!IsByRef.empty() && IsByRef[En.index()]) {
3773 InsertPointTy OldIP = Builder.saveIP();
3774 Builder.restoreIP(AllocaIP);
3775
3776 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3777 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3778 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3779
3780 Builder.restoreIP(OldIP);
3781 }
3782
3783 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3784 RedListArrayTy, LocalReduceListAddrCast,
3785 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3786 Value *BufferVD =
3787 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3788 // Global = Buffer.VD[Idx];
3789 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3790 ReductionsBufferTy, BufferVD, 0, En.index());
3791
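// For a by-ref element, a descriptor shell allocated above (ByRefAlloc) is
// pointed at the global buffer slot and published in the local list, so
// reduce_function sees the by-ref shape it expects.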
3792 if (!IsByRef.empty() && IsByRef[En.index()]) {
3793 Value *ByRefDataPtr;
3794
3795 InsertPointOrErrorTy GenResult =
3796 RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr);
3797
3798 if (!GenResult)
3799 return GenResult.takeError();
3800
3801 Builder.CreateStore(GlobValPtr, ByRefDataPtr);
3802 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3803 } else {
3804 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3805 }
3806 }
3807
3808 // Call reduce_function(GlobalReduceList, ReduceList)
3809 Value *ReduceList =
3810 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3811 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3812 ->addFnAttr(Attribute::NoUnwind);
3813 Builder.CreateRetVoid();
3814 Builder.restoreIP(OldIP);
3815 return LtGRFunc;
3816}
3817
3818Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3819 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3820 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3821 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3822 LLVMContext &Ctx = M.getContext();
3823 FunctionType *FuncTy = FunctionType::get(
3824 Builder.getVoidTy(),
3825 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3826 /* IsVarArg */ false);
3827 Function *GtLCFunc =
3828 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3829 "_omp_reduction_global_to_list_copy_func", &M);
3830 GtLCFunc->setAttributes(FuncAttrs);
3831 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3832 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3833 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3834
3835 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3836 Builder.SetInsertPoint(EntryBlock);
3837
3838 // Buffer: global reduction buffer.
3839 Argument *BufferArg = GtLCFunc->getArg(0);
3840 // Idx: index of the buffer.
3841 Argument *IdxArg = GtLCFunc->getArg(1);
3842 // ReduceList: thread local Reduce list.
3843 Argument *ReduceListArg = GtLCFunc->getArg(2);
3844
3845 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3846 BufferArg->getName() + ".addr");
3847 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3848 IdxArg->getName() + ".addr");
3849 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3850 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3851 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3852 BufferArgAlloca, Builder.getPtrTy(),
3853 BufferArgAlloca->getName() + ".ascast");
3854 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3855 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3856 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3857 ReduceListArgAlloca, Builder.getPtrTy(),
3858 ReduceListArgAlloca->getName() + ".ascast");
3859 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3860 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3861 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3862
3863 Value *LocalReduceList =
3864 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3865 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3866 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3867 Type *IndexTy = Builder.getIndexTy(
3868 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3869 for (auto En : enumerate(ReductionInfos)) {
3870 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3871 auto *RedListArrayTy =
3872 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3873 // Reduce element = LocalReduceList[i]
3874 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3875 RedListArrayTy, LocalReduceList,
3876 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3877 // elemptr = ((CopyType*)(elemptrptr)) + I
3878 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3879 // Global = Buffer.VD[Idx];
3880 Value *BufferVD =
3881 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3882 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3883 ReductionsBufferTy, BufferVD, 0, En.index());
3884
3885 switch (RI.EvaluationKind) {
3886 case EvalKind::Scalar: {
3887 Type *ElemType = RI.ElementType;
3888
3889 if (!IsByRef.empty() && IsByRef[En.index()]) {
3890 ElemType = RI.ByRefElementType;
3891 InsertPointOrErrorTy GenResult =
3892 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3893
3894 if (!GenResult)
3895 return GenResult.takeError();
3896
3897 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3898 }
3899
3900 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3901 Builder.CreateStore(TargetElement, ElemPtr);
3902 break;
3903 }
3904 case EvalKind::Complex: {
3905 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3906 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3907 Value *SrcReal = Builder.CreateLoad(
3908 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3909 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3910 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3911 Value *SrcImg = Builder.CreateLoad(
3912 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3913
3914 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3915 RI.ElementType, ElemPtr, 0, 0, ".realp");
3916 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3917 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3918 Builder.CreateStore(SrcReal, DestRealPtr);
3919 Builder.CreateStore(SrcImg, DestImgPtr);
3920 break;
3921 }
3922 case EvalKind::Aggregate: {
3923 Value *SizeVal =
3924 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3925 Builder.CreateMemCpy(
3926 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3927 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3928 SizeVal, false);
3929 break;
3930 }
3931 }
3932 }
3933
3934 Builder.CreateRetVoid();
3935 Builder.restoreIP(OldIP);
3936 return GtLCFunc;
3937}
3938
3939Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
3940 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3941 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3942 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3943 LLVMContext &Ctx = M.getContext();
3944 auto *FuncTy = FunctionType::get(
3945 Builder.getVoidTy(),
3946 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3947 /* IsVarArg */ false);
3948 Function *GtLRFunc =
3949 Function::Create(FuncTy, GlobalVariable::InternalLinkage,
3950 "_omp_reduction_global_to_list_reduce_func", &M);
3951 GtLRFunc->setAttributes(FuncAttrs);
3952 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
3953 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
3954 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
3955
3956 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
3957 Builder.SetInsertPoint(EntryBlock);
3958
3959 // Buffer: global reduction buffer.
3960 Argument *BufferArg = GtLRFunc->getArg(0);
3961 // Idx: index of the buffer.
3962 Argument *IdxArg = GtLRFunc->getArg(1);
3963 // ReduceList: thread local Reduce list.
3964 Argument *ReduceListArg = GtLRFunc->getArg(2);
3965
3966 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3967 BufferArg->getName() + ".addr");
3968 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3969 IdxArg->getName() + ".addr");
3970 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3971 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3972 ArrayType *RedListArrayTy =
3973 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3974
3975 // 1. Build a list of reduction variables.
3976 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3977 Value *LocalReduceList =
3978 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3979
3980 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3981
3982 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3983 BufferArgAlloca, Builder.getPtrTy(),
3984 BufferArgAlloca->getName() + ".ascast");
3985 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3986 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3987 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3988 ReduceListArgAlloca, Builder.getPtrTy(),
3989 ReduceListArgAlloca->getName() + ".ascast");
3990 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
3991 LocalReduceList, Builder.getPtrTy(),
3992 LocalReduceList->getName() + ".ascast");
3993
3994 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3995 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3996 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3997
3998 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3999 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4000 Type *IndexTy = Builder.getIndexTy(
4001 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4002 for (auto En : enumerate(ReductionInfos)) {
4003 const ReductionInfo &RI = En.value();
4004 Value *ByRefAlloc;
4005
4006 if (!IsByRef.empty() && IsByRef[En.index()]) {
4007 InsertPointTy OldIP = Builder.saveIP();
4008 Builder.restoreIP(AllocaIP);
4009
4010 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4011 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4012 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4013
4014 Builder.restoreIP(OldIP);
4015 }
4016
4017 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4018 RedListArrayTy, ReductionList,
4019 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4020 // Global = Buffer.VD[Idx];
4021 Value *BufferVD =
4022 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4023 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4024 ReductionsBufferTy, BufferVD, 0, En.index());
4025
4026 if (!IsByRef.empty() && IsByRef[En.index()]) {
4027 Value *ByRefDataPtr;
4028 InsertPointOrErrorTy GenResult =
4029 RI.DataPtrPtrGen(Builder.saveIP(), ByRefAlloc, ByRefDataPtr);
4030 if (!GenResult)
4031 return GenResult.takeError();
4032
4033 Builder.CreateStore(GlobValPtr, ByRefDataPtr);
4034 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4035 } else {
4036 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4037 }
4038 }
4039
4040 // Call reduce_function(ReduceList, GlobalReduceList)
4041 Value *ReduceList =
4042 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4043 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4044 ->addFnAttr(Attribute::NoUnwind);
4045 Builder.CreateRetVoid();
4046 Builder.restoreIP(OldIP);
4047 return GtLRFunc;
4048}
4049
4050std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4051 std::string Suffix =
4052 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4053 return (Name + Suffix).str();
4054}
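// For instance (hedged; the exact separators and any device suffix come from
// createPlatformSpecificName per target), a reducer named "foo" yields a name
// of the shape "foo<sep>omp<sep>reduction<sep>reduction_func".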
4055
4056Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4057 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4058 ArrayRef<bool> IsByRef, ReductionGenCBKind ReductionGenCBKind,
4059 AttributeList FuncAttrs) {
4060 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4061 {Builder.getPtrTy(), Builder.getPtrTy()},
4062 /* IsVarArg */ false);
4063 std::string Name = getReductionFuncName(ReducerName);
4064 Function *ReductionFunc =
4065 Function::Create(FuncTy, GlobalValue::InternalLinkage, Name, &M);
4066 ReductionFunc->setAttributes(FuncAttrs);
4067 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4068 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4069 BasicBlock *EntryBB =
4070 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4071 Builder.SetInsertPoint(EntryBB);
4072
4073 // Need to alloca memory here and deal with the pointers before getting
4074 // LHS/RHS pointers out
4075 Value *LHSArrayPtr = nullptr;
4076 Value *RHSArrayPtr = nullptr;
4077 Argument *Arg0 = ReductionFunc->getArg(0);
4078 Argument *Arg1 = ReductionFunc->getArg(1);
4079 Type *Arg0Type = Arg0->getType();
4080 Type *Arg1Type = Arg1->getType();
4081
4082 Value *LHSAlloca =
4083 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4084 Value *RHSAlloca =
4085 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4086 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4087 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4088 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4089 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4090 Builder.CreateStore(Arg0, LHSAddrCast);
4091 Builder.CreateStore(Arg1, RHSAddrCast);
4092 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4093 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4094
4095 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4096 Type *IndexTy = Builder.getIndexTy(
4097 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4098 SmallVector<Value *> LHSPtrs, RHSPtrs;
4099 for (auto En : enumerate(ReductionInfos)) {
4100 const ReductionInfo &RI = En.value();
4101 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4102 RedArrayTy, RHSArrayPtr,
4103 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4104 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4105 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4106 RHSI8Ptr, RI.PrivateVariable->getType(),
4107 RHSI8Ptr->getName() + ".ascast");
4108
4109 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4110 RedArrayTy, LHSArrayPtr,
4111 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4112 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4113 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4114 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4115
4116 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4117 LHSPtrs.emplace_back(LHSPtr);
4118 RHSPtrs.emplace_back(RHSPtr);
4119 } else {
4120 Value *LHS = LHSPtr;
4121 Value *RHS = RHSPtr;
4122
4123 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4124 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4125 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4126 }
4127
4128 Value *Reduced;
4129 InsertPointOrErrorTy AfterIP =
4130 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4131 if (!AfterIP)
4132 return AfterIP.takeError();
4133 if (!Builder.GetInsertBlock())
4134 return ReductionFunc;
4135
4136 Builder.restoreIP(*AfterIP);
4137
4138 if (!IsByRef.empty() && !IsByRef[En.index()])
4139 Builder.CreateStore(Reduced, LHSPtr);
4140 }
4141 }
4142
4143 if (ReductionGenCBKind == ReductionGenCBKind::Clang)
4144 for (auto En : enumerate(ReductionInfos)) {
4145 unsigned Index = En.index();
4146 const ReductionInfo &RI = En.value();
4147 Value *LHSFixupPtr, *RHSFixupPtr;
4148 Builder.restoreIP(RI.ReductionGenClang(
4149 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4150
4151 // Fix the callback code generated to use the correct Values for the LHS
4152 // and RHS
4153 LHSFixupPtr->replaceUsesWithIf(
4154 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4155 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4156 ReductionFunc;
4157 });
4158 RHSFixupPtr->replaceUsesWithIf(
4159 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4160 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4161 ReductionFunc;
4162 });
4163 }
4164
4165 Builder.CreateRetVoid();
4166 return ReductionFunc;
4167}
4168
4169static void
4170checkReductionInfos(ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4171 bool IsGPU) {
4172 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4173 (void)RI;
4174 assert(RI.Variable && "expected non-null variable");
4175 assert(RI.PrivateVariable && "expected non-null private variable");
4176 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4177 "expected non-null reduction generator callback");
4178 if (!IsGPU) {
4179 assert(
4180 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4181 "expected variables and their private equivalents to have the same "
4182 "type");
4183 }
4184 assert(RI.Variable->getType()->isPointerTy() &&
4185 "expected variables to be pointers");
4186 }
4187}
4188
4189OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductionsGPU(
4190 const LocationDescription &Loc, InsertPointTy AllocaIP,
4191 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4192 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4193 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4194 unsigned ReductionBufNum, Value *SrcLocInfo) {
4195 if (!updateToLocation(Loc))
4196 return InsertPointTy();
4197 Builder.restoreIP(CodeGenIP);
4198 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4199 LLVMContext &Ctx = M.getContext();
4200
4201 // Source location for the ident struct
4202 if (!SrcLocInfo) {
4203 uint32_t SrcLocStrSize;
4204 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4205 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4206 }
4207
4208 if (ReductionInfos.size() == 0)
4209 return Builder.saveIP();
4210
4211 BasicBlock *ContinuationBlock = nullptr;
4212 if (ReductionGenCBKind != ReductionGenCBKind::Clang) {
4213 // Copied code from createReductions
4214 BasicBlock *InsertBlock = Loc.IP.getBlock();
4215 ContinuationBlock =
4216 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4217 InsertBlock->getTerminator()->eraseFromParent();
4218 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4219 }
4220
4221 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4222 AttributeList FuncAttrs;
4223 AttrBuilder AttrBldr(Ctx);
4224 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4225 AttrBldr.addAttribute(Attr);
4226 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4227 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4228
4229 CodeGenIP = Builder.saveIP();
4230 Expected<Function *> ReductionResult = createReductionFunction(
4231 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4232 ReductionGenCBKind, FuncAttrs);
4233 if (!ReductionResult)
4234 return ReductionResult.takeError();
4235 Function *ReductionFunc = *ReductionResult;
4236 Builder.restoreIP(CodeGenIP);
4237
4238 // Set the grid value in the config needed for lowering later on
4239 if (GridValue.has_value())
4240 Config.setGridValue(GridValue.value());
4241 else
4242 Config.setGridValue(getGridValue(T, ReductionFunc));
4243
4244 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4245 // RedList, shuffle_reduce_func, interwarp_copy_func);
4246 // or
4247 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
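// For the non-teams path this concretely becomes a call of the shape
// (argument names illustrative):
//
//   res = __kmpc_nvptx_parallel_reduce_nowait_v2(
//       ident, reduce_data_size, RedList,
//       shuffle_and_reduce_func, inter_warp_copy_func);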
4248 Value *Res;
4249
4250 // 1. Build a list of reduction variables.
4251 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4252 auto Size = ReductionInfos.size();
4253 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4254 Type *FuncPtrTy =
4255 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4256 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4257 CodeGenIP = Builder.saveIP();
4258 Builder.restoreIP(AllocaIP);
4259 Value *ReductionListAlloca =
4260 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4261 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4262 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4263 Builder.restoreIP(CodeGenIP);
4264 Type *IndexTy = Builder.getIndexTy(
4265 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4266 for (auto En : enumerate(ReductionInfos)) {
4267 const ReductionInfo &RI = En.value();
4268 Value *ElemPtr = Builder.CreateInBoundsGEP(
4269 RedArrayTy, ReductionList,
4270 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4271
4272 Value *PrivateVar = RI.PrivateVariable;
4273 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4274 if (IsByRefElem)
4275 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4276
4277 Value *CastElem =
4278 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4279 Builder.CreateStore(CastElem, ElemPtr);
4280 }
4281 CodeGenIP = Builder.saveIP();
4282 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4283 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4284
4285 if (!SarFunc)
4286 return SarFunc.takeError();
4287
4288 Expected<Function *> CopyResult =
4289 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4290 if (!CopyResult)
4291 return CopyResult.takeError();
4292 Function *WcFunc = *CopyResult;
4293 Builder.restoreIP(CodeGenIP);
4294
4295 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4296
4297 unsigned MaxDataSize = 0;
4298 SmallVector<Type *> ReductionTypeArgs;
4299 for (auto En : enumerate(ReductionInfos)) {
4300 auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
4301 if (Size > MaxDataSize)
4302 MaxDataSize = Size;
4303 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4304 ? En.value().ByRefElementType
4305 : En.value().ElementType;
4306 ReductionTypeArgs.emplace_back(RedTypeArg);
4307 }
4308 Value *ReductionDataSize =
4309 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4310 if (!IsTeamsReduction) {
4311 Value *SarFuncCast =
4312 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4313 Value *WcFuncCast =
4314 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4315 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4316 WcFuncCast};
4317 Function *Pv2Ptr = getOrCreateRuntimeFunctionPtr(
4318 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4319 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4320 } else {
4321 CodeGenIP = Builder.saveIP();
4322 StructType *ReductionsBufferTy = StructType::create(
4323 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4324 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4325 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4326
4327 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4328 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4329 if (!LtGCFunc)
4330 return LtGCFunc.takeError();
4331
4332 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4333 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4334 if (!LtGRFunc)
4335 return LtGRFunc.takeError();
4336
4337 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4338 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4339 if (!GtLCFunc)
4340 return GtLCFunc.takeError();
4341
4342 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4343 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4344 if (!GtLRFunc)
4345 return GtLRFunc.takeError();
4346
4347 Builder.restoreIP(CodeGenIP);
4348
4349 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4350 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4351
4352 Value *Args3[] = {SrcLocInfo,
4353 KernelTeamsReductionPtr,
4354 Builder.getInt32(ReductionBufNum),
4355 ReductionDataSize,
4356 RL,
4357 *SarFunc,
4358 WcFunc,
4359 *LtGCFunc,
4360 *LtGRFunc,
4361 *GtLCFunc,
4362 *GtLRFunc};
4363
4364 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4365 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4366 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4367 }
4368
4369 // 5. Build if (res == 1)
4370 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4371 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4372 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4373 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4374
4375 // 6. Build then branch: where we have reduced values in the master
4376 // thread in each team.
4377 // __kmpc_end_reduce{_nowait}(<gtid>);
4378 // break;
4379 emitBlock(ThenBB, CurFunc);
4380
4381 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4382 for (auto En : enumerate(ReductionInfos)) {
4383 const ReductionInfo &RI = En.value();
4384 Type *ValueType = RI.ElementType;
4385 Value *RedValue = RI.Variable;
4386 Value *RHS =
4387 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4388
4389 if (ReductionGenCBKind == ReductionGenCBKind::Clang) {
4390 Value *LHSPtr, *RHSPtr;
4391 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4392 &LHSPtr, &RHSPtr, CurFunc));
4393
4394 // Fix the callback code generated to use the correct Values for the LHS
4395 // and RHS
4396 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4397 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4398 ReductionFunc;
4399 });
4400 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4401 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4402 ReductionFunc;
4403 });
4404 } else {
4405 if (IsByRef.empty() || !IsByRef[En.index()]) {
4406 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4407 "red.value." + Twine(En.index()));
4408 }
4409 Value *PrivateRedValue = Builder.CreateLoad(
4410 ValueType, RHS, "red.private.value" + Twine(En.index()));
4411 Value *Reduced;
4412 InsertPointOrErrorTy AfterIP =
4413 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4414 if (!AfterIP)
4415 return AfterIP.takeError();
4416 Builder.restoreIP(*AfterIP);
4417
4418 if (!IsByRef.empty() && !IsByRef[En.index()])
4419 Builder.CreateStore(Reduced, RI.Variable);
4420 }
4421 }
4422 emitBlock(ExitBB, CurFunc);
4423 if (ContinuationBlock) {
4424 Builder.CreateBr(ContinuationBlock);
4425 Builder.SetInsertPoint(ContinuationBlock);
4426 }
4427 Config.setEmitLLVMUsed();
4428
4429 return Builder.saveIP();
4430}
4431
4432static Function *getFreshReductionFunc(Module &M) {
4433 Type *VoidTy = Type::getVoidTy(M.getContext());
4434 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4435 auto *FuncTy =
4436 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4437 return Function::Create(FuncTy, GlobalValue::InternalLinkage,
4438 ".omp.reduction.func", &M);
4439}
4440
4441static Error populateReductionFunction(
4442 Function *ReductionFunc,
4443 ArrayRef<OpenMPIRBuilder::ReductionInfo> ReductionInfos,
4444 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4445 Module *Module = ReductionFunc->getParent();
4446 BasicBlock *ReductionFuncBlock =
4447 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4448 Builder.SetInsertPoint(ReductionFuncBlock);
4449 Value *LHSArrayPtr = nullptr;
4450 Value *RHSArrayPtr = nullptr;
4451 if (IsGPU) {
4452 // Need to alloca memory here and deal with the pointers before getting
4453 // LHS/RHS pointers out
4454 //
4455 Argument *Arg0 = ReductionFunc->getArg(0);
4456 Argument *Arg1 = ReductionFunc->getArg(1);
4457 Type *Arg0Type = Arg0->getType();
4458 Type *Arg1Type = Arg1->getType();
4459
4460 Value *LHSAlloca =
4461 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4462 Value *RHSAlloca =
4463 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4464 Value *LHSAddrCast =
4465 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4466 Value *RHSAddrCast =
4467 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4468 Builder.CreateStore(Arg0, LHSAddrCast);
4469 Builder.CreateStore(Arg1, RHSAddrCast);
4470 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4471 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4472 } else {
4473 LHSArrayPtr = ReductionFunc->getArg(0);
4474 RHSArrayPtr = ReductionFunc->getArg(1);
4475 }
4476
4477 unsigned NumReductions = ReductionInfos.size();
4478 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4479
4480 for (auto En : enumerate(ReductionInfos)) {
4481 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4482 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4483 RedArrayTy, LHSArrayPtr, 0, En.index());
4484 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4485 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4486 LHSI8Ptr, RI.Variable->getType());
4487 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4488 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4489 RedArrayTy, RHSArrayPtr, 0, En.index());
4490 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4491 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4492 RHSI8Ptr, RI.PrivateVariable->getType());
4493 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4494 Value *Reduced;
4495 OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
4496 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4497 if (!AfterIP)
4498 return AfterIP.takeError();
4499
4500 Builder.restoreIP(*AfterIP);
4501 // TODO: Consider flagging an error.
4502 if (!Builder.GetInsertBlock())
4503 return Error::success();
4504
4505 // The store is inside the reduction region when using by-ref.
4506 if (!IsByRef[En.index()])
4507 Builder.CreateStore(Reduced, LHSPtr);
4508 }
4509 Builder.CreateRetVoid();
4510 return Error::success();
4511}
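// Editor's note (illustrative sketch, not verbatim builder output): for two
// by-value sum reductions over i32, the function populated above has roughly
// the shape
//
//   define internal void @.omp.reduction.func(ptr %lhs, ptr %rhs) {
//     %l0.p = load ptr, ptr %lhs      ; pointer to element 0 of the LHS array
//     %r0.p = load ptr, ptr %rhs      ; pointer to element 0 of the RHS array
//     %l0 = load i32, ptr %l0.p
//     %r0 = load i32, ptr %r0.p
//     %s0 = add i32 %l0, %r0          ; produced by RI.ReductionGen
//     store i32 %s0, ptr %l0.p        ; skipped for the by-ref case
//     ... likewise for element 1 ...
//     ret void
//   }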
4512
4513OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createReductions(
4514 const LocationDescription &Loc, InsertPointTy AllocaIP,
4515 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4516 bool IsNoWait, bool IsTeamsReduction) {
4517 assert(ReductionInfos.size() == IsByRef.size());
4518 if (Config.isGPU())
4519 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4520 IsByRef, IsNoWait, IsTeamsReduction);
4521
4522 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4523
4524 if (!updateToLocation(Loc))
4525 return InsertPointTy();
4526
4527 if (ReductionInfos.size() == 0)
4528 return Builder.saveIP();
4529
4530 BasicBlock *InsertBlock = Loc.IP.getBlock();
4531 BasicBlock *ContinuationBlock =
4532 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4533 InsertBlock->getTerminator()->eraseFromParent();
4534
4535 // Create and populate array of type-erased pointers to private reduction
4536 // values.
4537 unsigned NumReductions = ReductionInfos.size();
4538 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4539 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4540 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4541
4542 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4543
4544 for (auto En : enumerate(ReductionInfos)) {
4545 unsigned Index = En.index();
4546 const ReductionInfo &RI = En.value();
4547 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4548 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4549 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4550 }
4551
4552 // Emit a call to the runtime function that orchestrates the reduction.
4553 // Declare the reduction function in the process.
4554 Type *IndexTy = Builder.getIndexTy(
4555 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4556 Function *Func = Builder.GetInsertBlock()->getParent();
4557 Module *Module = Func->getParent();
4558 uint32_t SrcLocStrSize;
4559 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4560 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4561 return RI.AtomicReductionGen;
4562 });
4563 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4564 CanGenerateAtomic
4565 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4566 : IdentFlag(0));
4567 Value *ThreadId = getOrCreateThreadID(Ident);
4568 Constant *NumVariables = Builder.getInt32(NumReductions);
4569 const DataLayout &DL = Module->getDataLayout();
4570 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4571 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4572 Function *ReductionFunc = getFreshReductionFunc(*Module);
4573 Value *Lock = getOMPCriticalRegionLock(".reduction");
4574 Function *ReduceFunc = getOrCreateRuntimeFunctionPtr(
4575 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4576 : RuntimeFunction::OMPRTL___kmpc_reduce);
4577 CallInst *ReduceCall =
4578 createRuntimeFunctionCall(ReduceFunc,
4579 {Ident, ThreadId, NumVariables, RedArraySize,
4580 RedArray, ReductionFunc, Lock},
4581 "reduce");
4582
4583 // Create final reduction entry blocks for the atomic and non-atomic case.
4584 // Emit IR that dispatches control flow to one of the blocks based on the
4585 // reduction supporting the atomic mode.
4586 BasicBlock *NonAtomicRedBlock =
4587 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4588 BasicBlock *AtomicRedBlock =
4589 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4590 SwitchInst *Switch =
4591 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4592 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4593 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
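 // Editor's note: the dispatch above relies on the libomp convention that
 // __kmpc_reduce{_nowait} returns 1 when the calling thread should perform
 // the non-atomic reduction, 2 when it should take the atomic path, and 0
 // when nothing remains to be done (the default switch target, i.e. the
 // continuation block).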
4594
4595 // Populate the non-atomic reduction using the elementwise reduction function.
4596 // This loads the elements from the global and private variables and reduces
4597 // them before storing back the result to the global variable.
4598 Builder.SetInsertPoint(NonAtomicRedBlock);
4599 for (auto En : enumerate(ReductionInfos)) {
4600 const ReductionInfo &RI = En.value();
4601 Type *ValueType = RI.ElementType;
4602 // We have one less load for the by-ref case because that load is now
4603 // inside of the reduction region.
4604 Value *RedValue = RI.Variable;
4605 if (!IsByRef[En.index()]) {
4606 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4607 "red.value." + Twine(En.index()));
4608 }
4609 Value *PrivateRedValue =
4610 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4611 "red.private.value." + Twine(En.index()));
4612 Value *Reduced;
4613 InsertPointOrErrorTy AfterIP =
4614 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4615 if (!AfterIP)
4616 return AfterIP.takeError();
4617 Builder.restoreIP(*AfterIP);
4618
4619 if (!Builder.GetInsertBlock())
4620 return InsertPointTy();
4621 // For the by-ref case, the store back happens inside the reduction region.
4622 if (!IsByRef[En.index()])
4623 Builder.CreateStore(Reduced, RI.Variable);
4624 }
4625 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4626 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4627 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4628 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4629 Builder.CreateBr(ContinuationBlock);
4630
4631 // Populate the atomic reduction using the atomic elementwise reduction
4632 // function. There are no loads/stores here because they will be happening
4633 // inside the atomic elementwise reduction.
4634 Builder.SetInsertPoint(AtomicRedBlock);
4635 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4636 for (const ReductionInfo &RI : ReductionInfos) {
4637 InsertPointOrErrorTy AfterIP = RI.AtomicReductionGen(
4638 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4639 if (!AfterIP)
4640 return AfterIP.takeError();
4641 Builder.restoreIP(*AfterIP);
4642 if (!Builder.GetInsertBlock())
4643 return InsertPointTy();
4644 }
4645 Builder.CreateBr(ContinuationBlock);
4646 } else {
4647 Builder.CreateUnreachable();
4648 }
4649
4650 // Populate the outlined reduction function using the elementwise reduction
4651 // function. Partial values are extracted from the type-erased array of
4652 // pointers to private variables.
4653 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4654 IsByRef, /*isGPU=*/false);
4655 if (Err)
4656 return Err;
4657
4658 if (!Builder.GetInsertBlock())
4659 return InsertPointTy();
4660
4661 Builder.SetInsertPoint(ContinuationBlock);
4662 return Builder.saveIP();
4663}
4664
4665OpenMPIRBuilder::InsertPointOrErrorTy
4666OpenMPIRBuilder::createMaster(const LocationDescription &Loc,
4667 BodyGenCallbackTy BodyGenCB,
4668 FinalizeCallbackTy FiniCB) {
4669 if (!updateToLocation(Loc))
4670 return Loc.IP;
4671
4672 Directive OMPD = Directive::OMPD_master;
4673 uint32_t SrcLocStrSize;
4674 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4675 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4676 Value *ThreadId = getOrCreateThreadID(Ident);
4677 Value *Args[] = {Ident, ThreadId};
4678
4679 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4680 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4681
4682 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4683 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4684
4685 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4686 /*Conditional*/ true, /*hasFinalize*/ true);
4687}
4688
4689OpenMPIRBuilder::InsertPointOrErrorTy
4690OpenMPIRBuilder::createMasked(const LocationDescription &Loc,
4691 BodyGenCallbackTy BodyGenCB,
4692 FinalizeCallbackTy FiniCB, Value *Filter) {
4693 if (!updateToLocation(Loc))
4694 return Loc.IP;
4695
4696 Directive OMPD = Directive::OMPD_masked;
4697 uint32_t SrcLocStrSize;
4698 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4699 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4700 Value *ThreadId = getOrCreateThreadID(Ident);
4701 Value *Args[] = {Ident, ThreadId, Filter};
4702 Value *ArgsEnd[] = {Ident, ThreadId};
4703
4704 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4705 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4706
4707 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4708 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4709
4710 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4711 /*Conditional*/ true, /*hasFinalize*/ true);
4712}
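// Editor's sketch (block names illustrative, not verbatim builder output):
// the conditional inlined region built by createMaster/createMasked expands
// to roughly
//
//   %res = call i32 @__kmpc_masked(ptr @ident, i32 %tid, i32 %filter)
//   %cnd = icmp ne i32 %res, 0
//   br i1 %cnd, label %body, label %end
// body:
//   ... user code, finalization ...
//   call void @__kmpc_end_masked(ptr @ident, i32 %tid)
//   br label %end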
4713
4714static llvm::CallInst *emitNoUnwindRuntimeCall(IRBuilder<> &Builder,
4715 llvm::FunctionCallee Callee,
4716 ArrayRef<llvm::Value *> Args,
4717 const llvm::Twine &Name) {
4718 llvm::CallInst *Call = Builder.CreateCall(
4719 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4720 Call->setDoesNotThrow();
4721 return Call;
4722}
4723
4724// Expects the input basic block to be dominated by BeforeScanBB. Once the
4725// scan directive is encountered, the code after it should be dominated by
4726// AfterScanBB. The scan directive splits the code sequence into an input
4727// phase and a scan phase. Based on whether the inclusive or exclusive
4728// clause is used on the scan directive, and on whether the input loop or
4729// the scan loop is being lowered, it adds jumps to the input and scan
4730// phases. The first scan loop is the input loop and the second is the scan
4731// loop. The generated code currently handles only inclusive scans.
4732OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createScan(
4733 const LocationDescription &Loc, InsertPointTy AllocaIP,
4734 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4735 bool IsInclusive, ScanInfo *ScanRedInfo) {
4736 if (ScanRedInfo->OMPFirstScanLoop) {
4737 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4738 ScanVarsType, ScanRedInfo);
4739 if (Err)
4740 return Err;
4741 }
4742 if (!updateToLocation(Loc))
4743 return Loc.IP;
4744
4745 llvm::Value *IV = ScanRedInfo->IV;
4746
4747 if (ScanRedInfo->OMPFirstScanLoop) {
4748 // Emit buffer[i] = red; at the end of the input phase.
4749 for (size_t i = 0; i < ScanVars.size(); i++) {
4750 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4751 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4752 Type *DestTy = ScanVarsType[i];
4753 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4754 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4755
4756 Builder.CreateStore(Src, Val);
4757 }
4758 }
4759 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4760 emitBlock(ScanRedInfo->OMPScanDispatch,
4761 Builder.GetInsertBlock()->getParent());
4762
4763 if (!ScanRedInfo->OMPFirstScanLoop) {
4764 IV = ScanRedInfo->IV;
4765 // Emit red = buffer[i]; at the entrance to the scan phase.
4766 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4767 for (size_t i = 0; i < ScanVars.size(); i++) {
4768 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4769 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4770 Type *DestTy = ScanVarsType[i];
4771 Value *SrcPtr =
4772 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4773 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4774 Builder.CreateStore(Src, ScanVars[i]);
4775 }
4776 }
4777
4778 // TODO: Update it to CreateBr and remove dead blocks
4779 llvm::Value *CmpI = Builder.getInt1(true);
4780 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4781 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4782 ScanRedInfo->OMPAfterScanBlock);
4783 } else {
4784 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4785 ScanRedInfo->OMPBeforeScanBlock);
4786 }
4787 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4788 Builder.GetInsertBlock()->getParent());
4789 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4790 return Builder.saveIP();
4791}
4792
4793Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4794 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4795 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4796
4797 Builder.restoreIP(AllocaIP);
4798 // Create the shared pointer at alloca IP.
4799 for (size_t i = 0; i < ScanVars.size(); i++) {
4800 llvm::Value *BuffPtr =
4801 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4802 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4803 }
4804
4805 // Allocate the temporary buffer on the master thread.
4806 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4807 InsertPointTy CodeGenIP) -> Error {
4808 Builder.restoreIP(CodeGenIP);
4809 Value *AllocSpan =
4810 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4811 for (size_t i = 0; i < ScanVars.size(); i++) {
4812 Type *IntPtrTy = Builder.getInt32Ty();
4813 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4814 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4815 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4816 AllocSpan, nullptr, "arr");
4817 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4818 }
4819 return Error::success();
4820 };
4821 // TODO: Perform finalization actions for variables. This has to be
4822 // called for variables which have destructors/finalizers.
4823 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4824
4825 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4826 llvm::Value *FilterVal = Builder.getInt32(0);
4827 InsertPointOrErrorTy AfterIP =
4828 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4829
4830 if (!AfterIP)
4831 return AfterIP.takeError();
4832 Builder.restoreIP(*AfterIP);
4833 BasicBlock *InputBB = Builder.GetInsertBlock();
4834 if (InputBB->getTerminator())
4835 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4836 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4837 if (!AfterIP)
4838 return AfterIP.takeError();
4839 Builder.restoreIP(*AfterIP);
4840
4841 return Error::success();
4842}
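// Editor's note: in the lowering above every thread allocates a private
// pointer slot ("vla"), but only the thread selected by the masked region
// (filter == 0) mallocs the shared buffer of Span + 1 elements per scan
// variable; the trailing barrier makes the buffer visible to all threads
// before the input loop executes.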
4843
4844Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
4845 ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
4846 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4847 InsertPointTy CodeGenIP) -> Error {
4848 Builder.restoreIP(CodeGenIP);
4849 for (ReductionInfo RedInfo : ReductionInfos) {
4850 Value *PrivateVar = RedInfo.PrivateVariable;
4851 Value *OrigVar = RedInfo.Variable;
4852 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
4853 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4854
4855 Type *SrcTy = RedInfo.ElementType;
4856 Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
4857 "arrayOffset");
4858 Value *Src = Builder.CreateLoad(SrcTy, Val);
4859
4860 Builder.CreateStore(Src, OrigVar);
4861 Builder.CreateFree(Buff);
4862 }
4863 return Error::success();
4864 };
4865 // TODO: Perform finalization actions for variables. This has to be
4866 // called for variables which have destructors/finalizers.
4867 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4868
4869 if (ScanRedInfo->OMPScanFinish->getTerminator())
4870 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish->getTerminator());
4871 else
4872 Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);
4873
4874 llvm::Value *FilterVal = Builder.getInt32(0);
4875 InsertPointOrErrorTy AfterIP =
4876 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4877
4878 if (!AfterIP)
4879 return AfterIP.takeError();
4880 Builder.restoreIP(*AfterIP);
4881 BasicBlock *InputBB = Builder.GetInsertBlock();
4882 if (InputBB->getTerminator())
4883 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4884 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4885 if (!AfterIP)
4886 return AfterIP.takeError();
4887 Builder.restoreIP(*AfterIP);
4888 return Error::success();
4889}
4890
4891OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitScanReduction(
4892 const LocationDescription &Loc,
4893 ArrayRef<ReductionInfo> ReductionInfos,
4894 ScanInfo *ScanRedInfo) {
4895
4896 if (!updateToLocation(Loc))
4897 return Loc.IP;
4898 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4899 InsertPointTy CodeGenIP) -> Error {
4900 Builder.restoreIP(CodeGenIP);
4901 Function *CurFn = Builder.GetInsertBlock()->getParent();
4902 // for (int k = 0; k <= ceil(log2(n)); ++k)
4903 llvm::BasicBlock *LoopBB =
4904 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
4905 llvm::BasicBlock *ExitBB =
4906 splitBB(Builder, false, "omp.outer.log.scan.exit");
4907 llvm::Function *F = llvm::Intrinsic::getOrInsertDeclaration(
4908 Builder.GetInsertBlock()->getModule(),
4909 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
4910 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
4911 llvm::Value *Arg =
4912 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
4913 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
4914 F = llvm::Intrinsic::getOrInsertDeclaration(
4915 Builder.GetInsertBlock()->getModule(),
4916 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
4917 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
4918 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
4919 llvm::Value *NMin1 = Builder.CreateNUWSub(
4920 ScanRedInfo->Span,
4921 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
4922 Builder.SetInsertPoint(InputBB);
4923 Builder.CreateBr(LoopBB);
4924 emitBlock(LoopBB, CurFn);
4925 Builder.SetInsertPoint(LoopBB);
4926
4927 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4928 // size pow2k = 1;
4929 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4930 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
4931 InputBB);
4932 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
4933 InputBB);
4934 // for (size i = n - 1; i >= 2 ^ k; --i)
4935 // tmp[i] op= tmp[i-pow2k];
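 // Editor's worked example (assuming op = + and a 4-element buffer
 // tmp = [a, b, c, d]):
 //   k = 0, pow2k = 1: tmp = [a, a+b, b+c, c+d]
 //   k = 1, pow2k = 2: tmp = [a, a+b, a+b+c, a+b+c+d]
 // i.e. after ceil(log2(n)) rounds the buffer holds the inclusive prefix
 // scan.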
4936 llvm::BasicBlock *InnerLoopBB =
4937 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
4938 llvm::BasicBlock *InnerExitBB =
4939 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
4940 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
4941 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4942 emitBlock(InnerLoopBB, CurFn);
4943 Builder.SetInsertPoint(InnerLoopBB);
4944 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
4945 IVal->addIncoming(NMin1, LoopBB);
4946 for (ReductionInfo RedInfo : ReductionInfos) {
4947 Value *ReductionVal = RedInfo.PrivateVariable;
4948 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
4949 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4950 Type *DestTy = RedInfo.ElementType;
4951 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
4952 Value *LHSPtr =
4953 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4954 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
4955 Value *RHSPtr =
4956 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
4957 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
4958 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
4959 llvm::Value *Result;
4960 InsertPointOrErrorTy AfterIP =
4961 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
4962 if (!AfterIP)
4963 return AfterIP.takeError();
4964 Builder.CreateStore(Result, LHSPtr);
4965 }
4966 llvm::Value *NextIVal = Builder.CreateNUWSub(
4967 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
4968 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
4969 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
4970 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
4971 emitBlock(InnerExitBB, CurFn);
4972 llvm::Value *Next = Builder.CreateNUWAdd(
4973 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
4974 Counter->addIncoming(Next, Builder.GetInsertBlock());
4975 // pow2k <<= 1;
4976 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
4977 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
4978 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
4979 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
4980 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
4981 return Error::success();
4982 };
4983
4984 // TODO: Perform finalization actions for variables. This has to be
4985 // called for variables which have destructors/finalizers.
4986 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4987
4988 llvm::Value *FilterVal = Builder.getInt32(0);
4989 InsertPointOrErrorTy AfterIP =
4990 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4991
4992 if (!AfterIP)
4993 return AfterIP.takeError();
4994 Builder.restoreIP(*AfterIP);
4995 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4996
4997 if (!AfterIP)
4998 return AfterIP.takeError();
4999 Builder.restoreIP(*AfterIP);
5000 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5001 if (Err)
5002 return Err;
5003
5004 return AfterIP;
5005}
5006
5007Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5008 llvm::function_ref<Error()> InputLoopGen,
5009 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5010 ScanInfo *ScanRedInfo) {
5011
5012 {
5013 // Emit loop with input phase:
5014 // for (i: 0..<num_iters>) {
5015 // <input phase>;
5016 // buffer[i] = red;
5017 // }
5018 ScanRedInfo->OMPFirstScanLoop = true;
5019 Error Err = InputLoopGen();
5020 if (Err)
5021 return Err;
5022 }
5023 {
5024 // Emit loop with scan phase:
5025 // for (i: 0..<num_iters>) {
5026 // red = buffer[i];
5027 // <scan phase>;
5028 // }
5029 ScanRedInfo->OMPFirstScanLoop = false;
5030 Error Err = ScanLoopGen(Builder.saveIP());
5031 if (Err)
5032 return Err;
5033 }
5034 return Error::success();
5035}
5036
5037void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5038 Function *Fun = Builder.GetInsertBlock()->getParent();
5039 ScanRedInfo->OMPScanDispatch =
5040 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5041 ScanRedInfo->OMPAfterScanBlock =
5042 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5043 ScanRedInfo->OMPBeforeScanBlock =
5044 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5045 ScanRedInfo->OMPScanLoopExit =
5046 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5047}
5048CanonicalLoopInfo *OpenMPIRBuilder::createLoopSkeleton(
5049 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5050 BasicBlock *PostInsertBefore, const Twine &Name) {
5051 Module *M = F->getParent();
5052 LLVMContext &Ctx = M->getContext();
5053 Type *IndVarTy = TripCount->getType();
5054
5055 // Create the basic block structure.
5056 BasicBlock *Preheader =
5057 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5058 BasicBlock *Header =
5059 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5060 BasicBlock *Cond =
5061 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5062 BasicBlock *Body =
5063 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5064 BasicBlock *Latch =
5065 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5066 BasicBlock *Exit =
5067 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5068 BasicBlock *After =
5069 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5070
5071 // Use specified DebugLoc for new instructions.
5072 Builder.SetCurrentDebugLocation(DL);
5073
5074 Builder.SetInsertPoint(Preheader);
5075 Builder.CreateBr(Header);
5076
5077 Builder.SetInsertPoint(Header);
5078 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5079 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5080 Builder.CreateBr(Cond);
5081
5082 Builder.SetInsertPoint(Cond);
5083 Value *Cmp =
5084 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5085 Builder.CreateCondBr(Cmp, Body, Exit);
5086
5087 Builder.SetInsertPoint(Body);
5088 Builder.CreateBr(Latch);
5089
5090 Builder.SetInsertPoint(Latch);
5091 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5092 "omp_" + Name + ".next", /*HasNUW=*/true);
5093 Builder.CreateBr(Header);
5094 IndVarPHI->addIncoming(Next, Latch);
5095
5096 Builder.SetInsertPoint(Exit);
5097 Builder.CreateBr(After);
5098
5099 // Remember and return the canonical control flow.
5100 LoopInfos.emplace_front();
5101 CanonicalLoopInfo *CL = &LoopInfos.front();
5102
5103 CL->Header = Header;
5104 CL->Cond = Cond;
5105 CL->Latch = Latch;
5106 CL->Exit = Exit;
5107
5108#ifndef NDEBUG
5109 CL->assertOK();
5110#endif
5111 return CL;
5112}
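// Editor's sketch of the control flow produced by createLoopSkeleton:
//
//   preheader -> header -> cond --(iv < tripcount)--> body -> latch
//                  ^         |                                  |
//                  |         +--(otherwise)--> exit -> after    |
//                  +--------------------------------------------+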
5113
5114Expected<CanonicalLoopInfo *>
5115OpenMPIRBuilder::createCanonicalLoop(const LocationDescription &Loc,
5116 LoopBodyGenCallbackTy BodyGenCB,
5117 Value *TripCount, const Twine &Name) {
5118 BasicBlock *BB = Loc.IP.getBlock();
5119 BasicBlock *NextBB = BB->getNextNode();
5120
5121 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5122 NextBB, NextBB, Name);
5123 BasicBlock *After = CL->getAfter();
5124
5125 // If location is not set, don't connect the loop.
5126 if (updateToLocation(Loc)) {
5127 // Split the loop at the insertion point: Branch to the preheader and move
5128 // every following instruction to after the loop (the After BB). Also, the
5129 // new successor is the loop's after block.
5130 spliceBB(Builder, After, /*CreateBranch=*/false);
5131 Builder.CreateBr(CL->getPreheader());
5132 }
5133
5134 // Emit the body content. We do it after connecting the loop to the CFG
5135 // so that the callback does not encounter degenerate BBs.
5136 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5137 return Err;
5138
5139#ifndef NDEBUG
5140 CL->assertOK();
5141#endif
5142 return CL;
5143}
5144
5145Expected<ScanInfo *> OpenMPIRBuilder::scanInfoInitialize() {
5146 ScanInfos.emplace_front();
5147 ScanInfo *Result = &ScanInfos.front();
5148 return Result;
5149}
5150
5151Expected<SmallVector<llvm::CanonicalLoopInfo *>>
5152OpenMPIRBuilder::createCanonicalScanLoops(
5153 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5154 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5155 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5156 LocationDescription ComputeLoc =
5157 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5158 updateToLocation(ComputeLoc);
5159
5160 SmallVector<llvm::CanonicalLoopInfo *> Result;
5161
5162 Value *TripCount = calculateCanonicalLoopTripCount(
5163 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5164 ScanRedInfo->Span = TripCount;
5165 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5166 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5167
5168 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5169 Builder.restoreIP(CodeGenIP);
5170 ScanRedInfo->IV = IV;
5171 createScanBBs(ScanRedInfo);
5172 BasicBlock *InputBlock = Builder.GetInsertBlock();
5173 Instruction *Terminator = InputBlock->getTerminator();
5174 assert(Terminator->getNumSuccessors() == 1);
5175 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5176 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5177 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5178 Builder.GetInsertBlock()->getParent());
5179 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5180 emitBlock(ScanRedInfo->OMPScanLoopExit,
5181 Builder.GetInsertBlock()->getParent());
5182 Builder.CreateBr(ContinueBlock);
5183 Builder.SetInsertPoint(
5184 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5185 return BodyGenCB(Builder.saveIP(), IV);
5186 };
5187
5188 const auto &&InputLoopGen = [&]() -> Error {
5189 Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(
5190 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5191 ComputeIP, Name, true, ScanRedInfo);
5192 if (!LoopInfo)
5193 return LoopInfo.takeError();
5194 Result.push_back(*LoopInfo);
5195 Builder.restoreIP((*LoopInfo)->getAfterIP());
5196 return Error::success();
5197 };
5198 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
5199 Expected<CanonicalLoopInfo *> LoopInfo =
5200 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5201 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5202 if (!LoopInfo)
5203 return LoopInfo.takeError();
5204 Result.push_back(*LoopInfo);
5205 Builder.restoreIP((*LoopInfo)->getAfterIP());
5206 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5207 return Error::success();
5208 };
5209 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5210 if (Err)
5211 return Err;
5212 return Result;
5213}
5214
5215Value *OpenMPIRBuilder::calculateCanonicalLoopTripCount(
5216 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5217 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5218
5219 // Consider the following difficulties (assuming 8-bit signed integers):
5220 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5221 // DO I = 1, 100, 50
5222 // * A \p Step of INT_MIN cannot be normalized to a positive direction:
5223 // DO I = 100, 0, -128
5224
5225 // Start, Stop and Step must be of the same integer type.
5226 auto *IndVarTy = cast<IntegerType>(Start->getType());
5227 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5228 assert(IndVarTy == Step->getType() && "Step type mismatch");
5229 assert(IndVarTy == Step->getType() && "Step type mismatch");
5230 updateToLocation(Loc);
5231
5232 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5233 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5234
5235 // Like Step, but always positive.
5236 Value *Incr = Step;
5237
5238 // Distance between Start and Stop; always positive.
5239 Value *Span;
5240
5241 // Condition for whether no iterations are executed at all, e.g. because
5242 // UB < LB.
5243 Value *ZeroCmp;
5244
5245 if (IsSigned) {
5246 // Ensure that increment is positive. If not, negate and invert LB and UB.
5247 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5248 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5249 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5250 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5251 Span = Builder.CreateSub(UB, LB, "", false, true);
5252 ZeroCmp = Builder.CreateICmp(
5253 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5254 } else {
5255 Span = Builder.CreateSub(Stop, Start, "", true);
5256 ZeroCmp = Builder.CreateICmp(
5257 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5258 }
5259
5260 Value *CountIfLooping;
5261 if (InclusiveStop) {
5262 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5263 } else {
5264 // Avoid incrementing past stop since it could overflow.
5265 Value *CountIfTwo = Builder.CreateAdd(
5266 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5267 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5268 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5269 }
5270
5271 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5272 "omp_" + Name + ".tripcount");
5273}
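// Editor's worked example for the trip count computed above (unsigned
// division, InclusiveStop): Start = 1, Stop = 100, Step = 50 gives
// Span = 99, Incr = 50 and a trip count of 99 / 50 + 1 = 2, matching the
// two iterations I = 1 and I = 51 of "DO I = 1, 100, 50".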
5274
5275Expected<CanonicalLoopInfo *> OpenMPIRBuilder::createCanonicalLoop(
5276 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
5277 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5278 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5279 ScanInfo *ScanRedInfo) {
5280 LocationDescription ComputeLoc =
5281 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5282 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5283 Value *TripCount = calculateCanonicalLoopTripCount(
5284 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5285
5286 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5287 Builder.restoreIP(CodeGenIP);
5288 Value *Span = Builder.CreateMul(IV, Step);
5289 Value *IndVar = Builder.CreateAdd(Span, Start);
5290 if (InScan)
5291 ScanRedInfo->IV = IndVar;
5292 return BodyGenCB(Builder.saveIP(), IndVar);
5293 };
5294 LocationDescription LoopLoc =
5295 ComputeIP.isSet()
5296 ? Loc
5297 : LocationDescription(Builder.saveIP(),
5298 Builder.getCurrentDebugLocation());
5299 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5300}
5301
5302// Returns an LLVM function to call for initializing loop bounds using OpenMP
5303// static scheduling for composite `distribute parallel for` depending on
5304// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5305// integers as unsigned similarly to CanonicalLoopInfo.
5306static FunctionCallee
5307getKmpcDistForStaticInitForType(Type *Ty, Module &M,
5308 OpenMPIRBuilder &OMPBuilder) {
5309 unsigned Bitwidth = Ty->getIntegerBitWidth();
5310 if (Bitwidth == 32)
5311 return OMPBuilder.getOrCreateRuntimeFunction(
5312 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5313 if (Bitwidth == 64)
5314 return OMPBuilder.getOrCreateRuntimeFunction(
5315 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5316 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5317}
5318
5319// Returns an LLVM function to call for initializing loop bounds using OpenMP
5320// static scheduling depending on `type`. Only i32 and i64 are supported by the
5321// runtime. Always interpret integers as unsigned similarly to
5322// CanonicalLoopInfo.
5323static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
5324 OpenMPIRBuilder &OMPBuilder) {
5325 unsigned Bitwidth = Ty->getIntegerBitWidth();
5326 if (Bitwidth == 32)
5327 return OMPBuilder.getOrCreateRuntimeFunction(
5328 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5329 if (Bitwidth == 64)
5330 return OMPBuilder.getOrCreateRuntimeFunction(
5331 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5332 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5333}
5334
5335OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5336 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5337 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5338 OMPScheduleType DistScheduleSchedType) {
5339 assert(CLI->isValid() && "Requires a valid canonical loop");
5340 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5341 "Require dedicated allocate IP");
5342
5343 // Set up the source location value for OpenMP runtime.
5344 Builder.restoreIP(CLI->getPreheaderIP());
5345 Builder.SetCurrentDebugLocation(DL);
5346
5347 uint32_t SrcLocStrSize;
5348 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5349 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5350
5351 // Declare useful OpenMP runtime functions.
5352 Value *IV = CLI->getIndVar();
5353 Type *IVTy = IV->getType();
5354 FunctionCallee StaticInit =
5355 LoopType == WorksharingLoopType::DistributeForStaticLoop
5356 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5357 : getKmpcForStaticInitForType(IVTy, M, *this);
5358 FunctionCallee StaticFini =
5359 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5360
5361 // Allocate space for computed loop bounds as expected by the "init" function.
5362 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5363
5364 Type *I32Type = Type::getInt32Ty(M.getContext());
5365 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5366 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5367 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5368 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5369 CLI->setLastIter(PLastIter);
5370
5371 // At the end of the preheader, prepare for calling the "init" function by
5372 // storing the current loop bounds into the allocated space. A canonical loop
5373 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5374 // and produces an inclusive upper bound.
5375 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5376 Constant *Zero = ConstantInt::get(IVTy, 0);
5377 Constant *One = ConstantInt::get(IVTy, 1);
5378 Builder.CreateStore(Zero, PLowerBound);
5379 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5380 Builder.CreateStore(UpperBound, PUpperBound);
5381 Builder.CreateStore(One, PStride);
5382
5383 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5384
5385 OMPScheduleType SchedType =
5386 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5387 ? OMPScheduleType::OrderedDistribute
5388 : OMPScheduleType::UnorderedStatic;
5389 Constant *SchedulingType =
5390 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5391
5392 // Call the "init" function and update the trip count of the loop with the
5393 // value it produced.
5394 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5395 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5396 this](Value *SchedulingType, auto &Builder) {
5397 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5398 PLowerBound, PUpperBound});
5399 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5400 Value *PDistUpperBound =
5401 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5402 Args.push_back(PDistUpperBound);
5403 }
5404 Args.append({PStride, One, Zero});
5405 createRuntimeFunctionCall(StaticInit, Args);
5406 };
5407 BuildInitCall(SchedulingType, Builder);
5408 if (HasDistSchedule &&
5409 LoopType != WorksharingLoopType::DistributeStaticLoop) {
5410 Constant *DistScheduleSchedType = ConstantInt::get(
5411 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5412 // We want to emit a second init function call for the dist_schedule clause
5413 // on the Distribute construct. This should only be done, however, if a
5414 // worksharing loop is nested within a distribute construct.
5415 BuildInitCall(DistScheduleSchedType, Builder);
5416 }
5417 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5418 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5419 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5420 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5421 CLI->setTripCount(TripCount);
5422
5423 // Update all uses of the induction variable except the one in the condition
5424 // block that compares it with the actual upper bound, and the increment in
5425 // the latch block.
5426
5427 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5428 Builder.SetInsertPoint(CLI->getBody(),
5429 CLI->getBody()->getFirstInsertionPt());
5430 Builder.SetCurrentDebugLocation(DL);
5431 return Builder.CreateAdd(OldIV, LowerBound);
5432 });
5433
5434 // In the "exit" block, call the "fini" function.
5435 Builder.SetInsertPoint(CLI->getExit(),
5436 CLI->getExit()->getTerminator()->getIterator());
5437 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5438
5439 // Add the barrier if requested.
5440 if (NeedsBarrier) {
5441 InsertPointOrErrorTy BarrierIP =
5442 createBarrier(LocationDescription(Builder.saveIP(), DL),
5443 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5444 /* CheckCancelFlag */ false);
5445 if (!BarrierIP)
5446 return BarrierIP.takeError();
5447 }
5448
5449 InsertPointTy AfterIP = CLI->getAfterIP();
5450 CLI->invalidate();
5451
5452 return AfterIP;
5453}
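// Editor's sketch (values illustrative, not verbatim builder output): for an
// i32 IV with plain static scheduling, the preheader prepared above ends in
// roughly
//
//   call void @__kmpc_for_static_init_4u(ptr @loc, i32 %tid, i32 34,
//       ptr %p.lastiter, ptr %p.lowerbound, ptr %p.upperbound,
//       ptr %p.stride, i32 1, i32 0)
//
// where 34 encodes the static schedule; the loop then runs from the
// runtime-adjusted *%p.lowerbound to the inclusive *%p.upperbound.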
5454
5455static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5456 LoopInfo &LI);
5457static void addLoopMetadata(CanonicalLoopInfo *Loop,
5458 ArrayRef<Metadata *> Properties);
5459
5460static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI,
5461 LLVMContext &Ctx, Loop *Loop,
5462 LoopInfo &LoopInfo,
5463 SmallVector<Metadata *> &LoopMDList) {
5464 SmallSet<BasicBlock *, 8> Reachable;
5465
5466 // Get the basic blocks from the loop in which memref instructions
5467 // can be found.
5468 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5469 // preferably without running any passes.
5470 for (BasicBlock *Block : Loop->getBlocks()) {
5471 if (Block == CLI->getCond() || Block == CLI->getHeader())
5472 continue;
5473 Reachable.insert(Block);
5474 }
5475
5476 // Add access group metadata to memory-access instructions.
5477 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5478 for (BasicBlock *BB : Reachable)
5479 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5480 // TODO: If the loop has existing parallel access metadata, have
5481 // to combine two lists.
5482 LoopMDList.push_back(MDNode::get(
5483 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5484}
5485
5486OpenMPIRBuilder::InsertPointOrErrorTy
5487OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5488 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5489 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5490 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5491 assert(CLI->isValid() && "Requires a valid canonical loop");
5492 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5493
5494 LLVMContext &Ctx = CLI->getFunction()->getContext();
5495 Value *IV = CLI->getIndVar();
5496 Value *OrigTripCount = CLI->getTripCount();
5497 Type *IVTy = IV->getType();
5498 assert(IVTy->getIntegerBitWidth() <= 64 &&
5499 "Max supported tripcount bitwidth is 64 bits");
5500 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5501 : Type::getInt64Ty(Ctx);
5502 Type *I32Type = Type::getInt32Ty(M.getContext());
5503 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5504 Constant *One = ConstantInt::get(InternalIVTy, 1);
5505
5506 Function *F = CLI->getFunction();
5507 FunctionAnalysisManager FAM;
5508 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5509 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5510 LoopAnalysis LIA;
5511 LoopInfo &&LI = LIA.run(*F, FAM);
5512 Loop *L = LI.getLoopFor(CLI->getHeader());
5513 SmallVector<Metadata *> LoopMDList;
5514 if (ChunkSize || DistScheduleChunkSize)
5515 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5516 addLoopMetadata(CLI, LoopMDList);
5517
5518 // Declare useful OpenMP runtime functions.
5519 FunctionCallee StaticInit =
5520 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5521 FunctionCallee StaticFini =
5522 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5523
5524 // Allocate space for computed loop bounds as expected by the "init" function.
5525 Builder.restoreIP(AllocaIP);
5526 Builder.SetCurrentDebugLocation(DL);
5527 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5528 Value *PLowerBound =
5529 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5530 Value *PUpperBound =
5531 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5532 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5533 CLI->setLastIter(PLastIter);
5534
5535 // Set up the source location value for the OpenMP runtime.
5536 Builder.restoreIP(CLI->getPreheaderIP());
5537 Builder.SetCurrentDebugLocation(DL);
5538
5539 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5540 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5541 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5542 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5543 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5544 "distschedulechunksize");
5545 Value *CastedTripCount =
5546 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5547
5548 Constant *SchedulingType =
5549 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5550 Constant *DistSchedulingType =
5551 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5552 Builder.CreateStore(Zero, PLowerBound);
5553 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5554 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5555 Value *UpperBound =
5556 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5557 Builder.CreateStore(UpperBound, PUpperBound);
5558 Builder.CreateStore(One, PStride);
5559
5560 // Call the "init" function and update the trip count of the loop with the
5561 // value it produced.
5562 uint32_t SrcLocStrSize;
5563 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5564 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5565 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5566 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5567 PUpperBound, PStride, One,
5568 this](Value *SchedulingType, Value *ChunkSize,
5569 auto &Builder) {
5570 createRuntimeFunctionCall(
5571 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5572 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5573 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5574 /*pstride=*/PStride, /*incr=*/One,
5575 /*chunk=*/ChunkSize});
5576 };
5577 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5578 if (DistScheduleSchedType != OMPScheduleType::None &&
5579 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5580 SchedType != OMPScheduleType::OrderedDistribute) {
5581 // We want to emit a second init function call for the dist_schedule clause
5582 // on the Distribute construct. This should only be done, however, if a
5583 // worksharing loop is nested within a distribute construct.
5584 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5585 }
5586
5587 // Load values written by the "init" function.
5588 Value *FirstChunkStart =
5589 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5590 Value *FirstChunkStop =
5591 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5592 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5593 Value *ChunkRange =
5594 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5595 Value *NextChunkStride =
5596 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5597
5598 // Create outer "dispatch" loop for enumerating the chunks.
5599 BasicBlock *DispatchEnter = splitBB(Builder, true);
5600 Value *DispatchCounter;
5601
5602 // It is safe to assume this didn't return an error because the callback
5603 // passed into createCanonicalLoop is the only possible error source, and it
5604 // always returns success.
5605 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5606 {Builder.saveIP(), DL},
5607 [&](InsertPointTy BodyIP, Value *Counter) {
5608 DispatchCounter = Counter;
5609 return Error::success();
5610 },
5611 FirstChunkStart, CastedTripCount, NextChunkStride,
5612 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5613 "dispatch"));
5614
5615 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5616 // not have to preserve the canonical invariant.
5617 BasicBlock *DispatchBody = DispatchCLI->getBody();
5618 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5619 BasicBlock *DispatchExit = DispatchCLI->getExit();
5620 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5621 DispatchCLI->invalidate();
5622
5623 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5624 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5625 redirectTo(CLI->getExit(), DispatchLatch, DL);
5626 redirectTo(DispatchBody, DispatchEnter, DL);
5627
5628 // Prepare the prolog of the chunk loop.
5629 Builder.restoreIP(CLI->getPreheaderIP());
5630 Builder.SetCurrentDebugLocation(DL);
5631
5632 // Compute the number of iterations of the chunk loop.
5633 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5634 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5635 Value *IsLastChunk =
5636 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5637 Value *CountUntilOrigTripCount =
5638 Builder.CreateSub(CastedTripCount, DispatchCounter);
5639 Value *ChunkTripCount = Builder.CreateSelect(
5640 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5641 Value *BackcastedChunkTC =
5642 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5643 CLI->setTripCount(BackcastedChunkTC);
5644
5645 // Update all uses of the induction variable except the one in the condition
5646 // block that compares it with the actual upper bound, and the increment in
5647 // the latch block.
5648 Value *BackcastedDispatchCounter =
5649 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5650 CLI->mapIndVar([&](Instruction *) -> Value * {
5651 Builder.restoreIP(CLI->getBodyIP());
5652 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5653 });
5654
5655 // In the "exit" block, call the "fini" function.
5656 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5657 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5658
5659 // Add the barrier if requested.
5660 if (NeedsBarrier) {
5661 InsertPointOrErrorTy AfterIP =
5662 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5663 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5664 if (!AfterIP)
5665 return AfterIP.takeError();
5666 }
5667
5668#ifndef NDEBUG
5669 // Even though we currently do not support applying additional methods to it,
5670 // the chunk loop should remain a canonical loop.
5671 CLI->assertOK();
5672#endif
5673
5674 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5675}
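// Editor's note: conceptually, the rewiring above yields the nest
//
//   for (c = firstchunk.lb; c < tripcount; c += dispatch.stride) // dispatch
//     for (iv = 0; iv < min(chunk.range, tripcount - c); ++iv)   // chunk
//       body(c + iv);
//
// where the chunk loop is the original canonical loop with its trip count
// and induction variable rewritten.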
5676
5677// Returns an LLVM function to call for executing an OpenMP static worksharing
5678// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5679// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5680static FunctionCallee
5681getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder,
5682 WorksharingLoopType LoopType) {
5683 unsigned Bitwidth = Ty->getIntegerBitWidth();
5684 Module &M = OMPBuilder->M;
5685 switch (LoopType) {
5686 case WorksharingLoopType::ForStaticLoop:
5687 if (Bitwidth == 32)
5688 return OMPBuilder->getOrCreateRuntimeFunction(
5689 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5690 if (Bitwidth == 64)
5691 return OMPBuilder->getOrCreateRuntimeFunction(
5692 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5693 break;
5694 case WorksharingLoopType::DistributeStaticLoop:
5695 if (Bitwidth == 32)
5696 return OMPBuilder->getOrCreateRuntimeFunction(
5697 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5698 if (Bitwidth == 64)
5699 return OMPBuilder->getOrCreateRuntimeFunction(
5700 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5701 break;
5702 case WorksharingLoopType::DistributeForStaticLoop:
5703 if (Bitwidth == 32)
5704 return OMPBuilder->getOrCreateRuntimeFunction(
5705 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5706 if (Bitwidth == 64)
5707 return OMPBuilder->getOrCreateRuntimeFunction(
5708 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5709 break;
5710 }
5711 if (Bitwidth != 32 && Bitwidth != 64) {
5712 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5713 }
5714 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5715}
5716
5717// Inserts a call to the proper OpenMP device RTL function which handles
5718// loop worksharing.
5719static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
5720 WorksharingLoopType LoopType,
5721 BasicBlock *InsertBlock, Value *Ident,
5722 Value *LoopBodyArg, Value *TripCount,
5723 Function &LoopBodyFn, bool NoLoop) {
5724 Type *TripCountTy = TripCount->getType();
5725 Module &M = OMPBuilder->M;
5726 IRBuilder<> &Builder = OMPBuilder->Builder;
5727 FunctionCallee RTLFn =
5728 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5729 SmallVector<Value *, 8> RealArgs;
5730 RealArgs.push_back(Ident);
5731 RealArgs.push_back(&LoopBodyFn);
5732 RealArgs.push_back(LoopBodyArg);
5733 RealArgs.push_back(TripCount);
5734 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5735 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5736 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5737 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5738 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5739 return;
5740 }
5741 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5742 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5743 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5744 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5745
5746 RealArgs.push_back(
5747 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5748 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5749 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5750 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5751 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5752 } else {
5753 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5754 }
5755
5756 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5757}
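// Editor's sketch (assumed shape for an i32 trip count and a plain "for"
// worksharing loop): the call emitted above looks roughly like
//
//   %nt = call i32 @omp_get_num_threads()
//   call void @__kmpc_for_static_loop_4u(ptr %ident, ptr @outlined.body,
//       ptr %arg, i32 %tripcount, i32 %nt, i32 0, i8 0)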
5758
5759static void workshareLoopTargetCallback(
5760 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5761 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5762 WorksharingLoopType LoopType, bool NoLoop) {
5763 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5764 BasicBlock *Preheader = CLI->getPreheader();
5765 Value *TripCount = CLI->getTripCount();
5766
5767 // After loop body outlining, the loop body contains only the setup of the
5768 // loop body argument structure and the call to the outlined loop body
5769 // function. First, we need to move the setup of the loop body arguments
5770 // into the loop preheader.
5771 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5772 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5773
5774 // The next step is to remove the whole loop. We do not need it anymore.
5775 // That's why we make an unconditional branch from the loop preheader to
5776 // the loop exit block.
5777 Builder.restoreIP({Preheader, Preheader->end()});
5778 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5779 Preheader->getTerminator()->eraseFromParent();
5780 Builder.CreateBr(CLI->getExit());
5781
5782 // Delete dead loop blocks
5783 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5784 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5785 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5786 CleanUpInfo.EntryBB = CLI->getHeader();
5787 CleanUpInfo.ExitBB = CLI->getExit();
5788 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5789 DeleteDeadBlocks(BlocksToBeRemoved);
5790
5791 // Find the instruction which corresponds to the loop body argument
5792 // structure and remove the call to the loop body function.
5793 Value *LoopBodyArg;
5794 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5795 assert(OutlinedFnUser &&
5796 "Expected unique undroppable user of outlined function");
5797 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5798 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5799 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5800 "Expected outlined function call to be located in loop preheader");
5801 // Check in case no argument structure has been passed.
5802 if (OutlinedFnCallInstruction->arg_size() > 1)
5803 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5804 else
5805 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5806 OutlinedFnCallInstruction->eraseFromParent();
5807
5808 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5809 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5810
5811 for (auto &ToBeDeletedItem : ToBeDeleted)
5812 ToBeDeletedItem->eraseFromParent();
5813 CLI->invalidate();
5814}
5815
5816OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5817 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5818 WorksharingLoopType LoopType, bool NoLoop) {
5819 uint32_t SrcLocStrSize;
5820 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5821 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5822
5823 OutlineInfo OI;
5824 OI.OuterAllocaBB = CLI->getPreheader();
5825 Function *OuterFn = CLI->getPreheader()->getParent();
5826
5827 // Instructions which need to be deleted at the end of code generation
5828 SmallVector<Instruction *, 4> ToBeDeleted;
5829
5830 OI.OuterAllocaBB = AllocaIP.getBlock();
5831
5832 // Mark the body loop as region which needs to be extracted
5833 OI.EntryBB = CLI->getBody();
5834 OI.ExitBB = CLI->getLatch()->splitBasicBlock(CLI->getLatch()->begin(),
5835 "omp.prelatch", true);
5836
5837 // Prepare loop body for extraction
5838 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5839
5840 // Insert new loop counter variable which will be used only in loop
5841 // body.
5842 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5843 Instruction *NewLoopCntLoad =
5844 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5845 // New loop counter instructions are redundant in the loop preheader when
5846 // code generation for the workshare loop is finished. That's why we mark
5847 // them as ready for deletion.
5848 ToBeDeleted.push_back(NewLoopCntLoad);
5849 ToBeDeleted.push_back(NewLoopCnt);
5850
5851 // Analyse the loop body region. Find all input variables which are used
5852 // inside the loop body region.
5853 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5854 SmallVector<BasicBlock *, 32> Blocks;
5855 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5856
5857 CodeExtractorAnalysisCache CEAC(*OuterFn);
5858 CodeExtractor Extractor(Blocks,
5859 /* DominatorTree */ nullptr,
5860 /* AggregateArgs */ true,
5861 /* BlockFrequencyInfo */ nullptr,
5862 /* BranchProbabilityInfo */ nullptr,
5863 /* AssumptionCache */ nullptr,
5864 /* AllowVarArgs */ true,
5865 /* AllowAlloca */ true,
5866 /* AllocationBlock */ CLI->getPreheader(),
5867 /* Suffix */ ".omp_wsloop",
5868 /* AggrArgsIn0AddrSpace */ true);
5869
5870 BasicBlock *CommonExit = nullptr;
5871 SetVector<Value *> SinkingCands, HoistingCands;
5872
5873 // Find allocas outside the loop body region which are used inside loop
5874 // body
5875 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5876
5877  // We need to model the loop body region as the function f(cnt, loop_arg).
5878  // That's why we replace the loop induction variable with the new counter,
5879  // which will be one of the loop body function's arguments.
5880  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
5881 CLI->getIndVar()->user_end());
5882 for (auto Use : Users) {
5883 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5884 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5885 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5886 }
5887 }
5888 }
5889  // Make sure that the loop counter variable is not merged into the loop
5890  // body function's argument structure and is passed as a separate variable.
5891 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5892
5893  // The PostOutline CB is invoked once the loop body function has been
5894  // outlined and the loop body replaced by a call to the outlined function.
5895  // We need to add a call to the OpenMP device RTL in the loop preheader;
5896  // the OpenMP device RTL function will handle the loop control logic.
5897 //
5898 OI.PostOutlineCB = [=, ToBeDeletedVec =
5899 std::move(ToBeDeleted)](Function &OutlinedFn) {
5900 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
5901 LoopType, NoLoop);
5902 };
5903 addOutlineInfo(std::move(OI));
5904 return CLI->getAfterIP();
5905}
5906
5907 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
5908     DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5909 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
5910 bool HasSimdModifier, bool HasMonotonicModifier,
5911 bool HasNonmonotonicModifier, bool HasOrderedClause,
5912 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
5913 Value *DistScheduleChunkSize) {
5914 if (Config.isTargetDevice())
5915 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
5916 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
5917 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
5918 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
5919
5920 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
5921 OMPScheduleType::ModifierOrdered;
5922 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
5923 if (HasDistSchedule) {
5924 DistScheduleSchedType = DistScheduleChunkSize
5925 ? OMPScheduleType::OrderedDistributeChunked
5926 : OMPScheduleType::OrderedDistribute;
5927 }
5928 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
5929 case OMPScheduleType::BaseStatic:
5930 case OMPScheduleType::BaseDistribute:
5931 assert((!ChunkSize || !DistScheduleChunkSize) &&
5932 "No chunk size with static-chunked schedule");
5933 if (IsOrdered && !HasDistSchedule)
5934 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5935 NeedsBarrier, ChunkSize);
5936 // FIXME: Monotonicity ignored?
5937 if (DistScheduleChunkSize)
5938 return applyStaticChunkedWorkshareLoop(
5939 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5940 DistScheduleChunkSize, DistScheduleSchedType);
5941 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
5942 HasDistSchedule);
5943
5944 case OMPScheduleType::BaseStaticChunked:
5945 case OMPScheduleType::BaseDistributeChunked:
5946 if (IsOrdered && !HasDistSchedule)
5947 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5948 NeedsBarrier, ChunkSize);
5949 // FIXME: Monotonicity ignored?
5950 return applyStaticChunkedWorkshareLoop(
5951 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
5952 DistScheduleChunkSize, DistScheduleSchedType);
5953
5954 case OMPScheduleType::BaseRuntime:
5955 case OMPScheduleType::BaseAuto:
5956 case OMPScheduleType::BaseGreedy:
5957 case OMPScheduleType::BaseBalanced:
5958 case OMPScheduleType::BaseSteal:
5959 case OMPScheduleType::BaseGuidedSimd:
5960 case OMPScheduleType::BaseRuntimeSimd:
5961 assert(!ChunkSize &&
5962 "schedule type does not support user-defined chunk sizes");
5963 [[fallthrough]];
5964 case OMPScheduleType::BaseDynamicChunked:
5965 case OMPScheduleType::BaseGuidedChunked:
5966 case OMPScheduleType::BaseGuidedIterativeChunked:
5967 case OMPScheduleType::BaseGuidedAnalyticalChunked:
5968 case OMPScheduleType::BaseStaticBalancedChunked:
5969 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
5970 NeedsBarrier, ChunkSize);
5971
5972 default:
5973 llvm_unreachable("Unknown/unimplemented schedule kind");
5974 }
5975}
5976
5977/// Returns an LLVM function to call for initializing loop bounds using OpenMP
5978/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5979/// the runtime. Always interpret integers as unsigned similarly to
5980/// CanonicalLoopInfo.
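/// As an illustrative sketch only (the authoritative declaration lives in the
/// OpenMP runtime), the 32-bit variant has roughly the shape
///   __kmpc_dispatch_init_4u(ident_t *loc, int32 gtid, sched_type schedule,
///                           uint32 lb, uint32 ub, int32 st, int32 chunk)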
5981static FunctionCallee
5982 getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5983 unsigned Bitwidth = Ty->getIntegerBitWidth();
5984 if (Bitwidth == 32)
5985 return OMPBuilder.getOrCreateRuntimeFunction(
5986 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
5987 if (Bitwidth == 64)
5988 return OMPBuilder.getOrCreateRuntimeFunction(
5989 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
5990 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5991}
5992
5993/// Returns an LLVM function to call for updating the next loop using OpenMP
5994/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
5995/// the runtime. Always interpret integers as unsigned similarly to
5996/// CanonicalLoopInfo.
5997static FunctionCallee
5998 getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
5999 unsigned Bitwidth = Ty->getIntegerBitWidth();
6000 if (Bitwidth == 32)
6001 return OMPBuilder.getOrCreateRuntimeFunction(
6002 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6003 if (Bitwidth == 64)
6004 return OMPBuilder.getOrCreateRuntimeFunction(
6005 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6006 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6007}
6008
6009/// Returns an LLVM function to call for finalizing the dynamic loop,
6010/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6011/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6012static FunctionCallee
6014 getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
6014 unsigned Bitwidth = Ty->getIntegerBitWidth();
6015 if (Bitwidth == 32)
6016 return OMPBuilder.getOrCreateRuntimeFunction(
6017 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6018 if (Bitwidth == 64)
6019 return OMPBuilder.getOrCreateRuntimeFunction(
6020 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6021 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6022}
6023
6024 OpenMPIRBuilder::InsertPointOrErrorTy
6025OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6026 InsertPointTy AllocaIP,
6027 OMPScheduleType SchedType,
6028 bool NeedsBarrier, Value *Chunk) {
6029 assert(CLI->isValid() && "Requires a valid canonical loop");
6030 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6031 "Require dedicated allocate IP");
6032   assert(isValidWorkshareLoopScheduleType(SchedType) &&
6033 "Require valid schedule type");
6034
6035 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6036 OMPScheduleType::ModifierOrdered;
6037
6038 // Set up the source location value for OpenMP runtime.
6039 Builder.SetCurrentDebugLocation(DL);
6040
6041 uint32_t SrcLocStrSize;
6042 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6043 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6044
6045 // Declare useful OpenMP runtime functions.
6046 Value *IV = CLI->getIndVar();
6047 Type *IVTy = IV->getType();
6048 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6049 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6050
6051 // Allocate space for computed loop bounds as expected by the "init" function.
6052 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6053 Type *I32Type = Type::getInt32Ty(M.getContext());
6054 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6055 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6056 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6057 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6058 CLI->setLastIter(PLastIter);
6059
6060 // At the end of the preheader, prepare for calling the "init" function by
6061 // storing the current loop bounds into the allocated space. A canonical loop
6062 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6063 // and produces an inclusive upper bound.
6064 BasicBlock *PreHeader = CLI->getPreheader();
6065 Builder.SetInsertPoint(PreHeader->getTerminator());
6066 Constant *One = ConstantInt::get(IVTy, 1);
6067 Builder.CreateStore(One, PLowerBound);
6068 Value *UpperBound = CLI->getTripCount();
6069 Builder.CreateStore(UpperBound, PUpperBound);
6070 Builder.CreateStore(One, PStride);
6071
6072 BasicBlock *Header = CLI->getHeader();
6073 BasicBlock *Exit = CLI->getExit();
6074 BasicBlock *Cond = CLI->getCond();
6075 BasicBlock *Latch = CLI->getLatch();
6076 InsertPointTy AfterIP = CLI->getAfterIP();
6077
6078 // The CLI will be "broken" in the code below, as the loop is no longer
6079 // a valid canonical loop.
6080
6081 if (!Chunk)
6082 Chunk = One;
6083
6084 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
6085
6086 Constant *SchedulingType =
6087 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6088
6089 // Call the "init" function.
6090 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6091 /* LowerBound */ One, UpperBound,
6092 /* step */ One, Chunk});
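  // After the rewrite below, the control flow is roughly (illustrative
  // sketch, not emitted verbatim):
  //   preheader:  __kmpc_dispatch_init(...); br outer.cond
  //   outer.cond: more = __kmpc_dispatch_next(&lastiter, &lb, &ub, &stride)
  //               br more, header, exit
  //   header..latch: the original loop, now bounded by the chunk [lb, ub]
  //   cond:       exits to outer.cond once the current chunk is exhausted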
6093
6094 // An outer loop around the existing one.
6095 BasicBlock *OuterCond = BasicBlock::Create(
6096 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6097 PreHeader->getParent());
6098 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6099 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6100   Value *Res = createRuntimeFunctionCall(
6101 DynamicNext,
6102 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6103 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6104 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6105 Value *LowerBound =
6106 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
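  // The runtime hands back an inclusive, 1-based chunk [lb, ub]; subtracting
  // one maps its lower bound onto the 0-based canonical induction variable.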
6107 Builder.CreateCondBr(MoreWork, Header, Exit);
6108
6109 // Change PHI-node in loop header to use outer cond rather than preheader,
6110 // and set IV to the LowerBound.
6111 Instruction *Phi = &Header->front();
6112 auto *PI = cast<PHINode>(Phi);
6113 PI->setIncomingBlock(0, OuterCond);
6114 PI->setIncomingValue(0, LowerBound);
6115
6116 // Then set the pre-header to jump to the OuterCond
6117 Instruction *Term = PreHeader->getTerminator();
6118 auto *Br = cast<BranchInst>(Term);
6119 Br->setSuccessor(0, OuterCond);
6120
6121 // Modify the inner condition:
6122 // * Use the UpperBound returned from the DynamicNext call.
6123  // * Jump to the outer loop when done with one of the inner loops.
6124 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6125 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6126 Instruction *Comp = &*Builder.GetInsertPoint();
6127 auto *CI = cast<CmpInst>(Comp);
6128 CI->setOperand(1, UpperBound);
6129 // Redirect the inner exit to branch to outer condition.
6130 Instruction *Branch = &Cond->back();
6131 auto *BI = cast<BranchInst>(Branch);
6132 assert(BI->getSuccessor(1) == Exit);
6133 BI->setSuccessor(1, OuterCond);
6134
6135 // Call the "fini" function if "ordered" is present in wsloop directive.
6136 if (Ordered) {
6137 Builder.SetInsertPoint(&Latch->back());
6138 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6139 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6140 }
6141
6142 // Add the barrier if requested.
6143 if (NeedsBarrier) {
6144 Builder.SetInsertPoint(&Exit->back());
6145 InsertPointOrErrorTy BarrierIP =
6146         createBarrier(LocationDescription(Builder.saveIP(), DL),
6147 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6148 /* CheckCancelFlag */ false);
6149 if (!BarrierIP)
6150 return BarrierIP.takeError();
6151 }
6152
6153 CLI->invalidate();
6154 return AfterIP;
6155}
6156
6157/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6158/// after this \p OldTarget will be orphaned.
6159 static void redirectAllPredecessorsTo(BasicBlock *OldTarget,
6160 BasicBlock *NewTarget, DebugLoc DL) {
6161 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6162 redirectTo(Pred, NewTarget, DL);
6163}
6164
6165/// Determine which blocks in \p BBs are reachable from outside and remove the
6166/// ones that are not reachable from the function.
6167 static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
6168   SmallPtrSet<BasicBlock *, 6> BBsToErase(BBs.begin(), BBs.end());
6169 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6170 for (Use &U : BB->uses()) {
6171 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6172 if (!UseInst)
6173 continue;
6174 if (BBsToErase.count(UseInst->getParent()))
6175 continue;
6176 return true;
6177 }
6178 return false;
6179 };
6180
6181 while (BBsToErase.remove_if(HasRemainingUses)) {
6182 // Try again if anything was removed.
6183 }
6184
6185 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6186 DeleteDeadBlocks(BBVec);
6187}
6188
6189CanonicalLoopInfo *
6190 OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6191 InsertPointTy ComputeIP) {
6192 assert(Loops.size() >= 1 && "At least one loop required");
6193 size_t NumLoops = Loops.size();
6194
6195 // Nothing to do if there is already just one loop.
6196 if (NumLoops == 1)
6197 return Loops.front();
6198
6199 CanonicalLoopInfo *Outermost = Loops.front();
6200 CanonicalLoopInfo *Innermost = Loops.back();
6201 BasicBlock *OrigPreheader = Outermost->getPreheader();
6202 BasicBlock *OrigAfter = Outermost->getAfter();
6203 Function *F = OrigPreheader->getParent();
6204
6205 // Loop control blocks that may become orphaned later.
6206 SmallVector<BasicBlock *, 12> OldControlBBs;
6207 OldControlBBs.reserve(6 * Loops.size());
6208   for (CanonicalLoopInfo *Loop : Loops)
6209 Loop->collectControlBlocks(OldControlBBs);
6210
6211 // Setup the IRBuilder for inserting the trip count computation.
6212 Builder.SetCurrentDebugLocation(DL);
6213 if (ComputeIP.isSet())
6214 Builder.restoreIP(ComputeIP);
6215 else
6216 Builder.restoreIP(Outermost->getPreheaderIP());
6217
6218  // Derive the collapsed loop's trip count.
6219 // TODO: Find common/largest indvar type.
6220 Value *CollapsedTripCount = nullptr;
6221 for (CanonicalLoopInfo *L : Loops) {
6222 assert(L->isValid() &&
6223 "All loops to collapse must be valid canonical loops");
6224 Value *OrigTripCount = L->getTripCount();
6225 if (!CollapsedTripCount) {
6226 CollapsedTripCount = OrigTripCount;
6227 continue;
6228 }
6229
6230 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6231 CollapsedTripCount =
6232 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6233 }
6234
6235 // Create the collapsed loop control flow.
6236 CanonicalLoopInfo *Result =
6237 createLoopSkeleton(DL, CollapsedTripCount, F,
6238 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6239
6240 // Build the collapsed loop body code.
6241 // Start with deriving the input loop induction variables from the collapsed
6242 // one, using a divmod scheme. To preserve the original loops' order, the
6243  // innermost loop uses the least significant bits.
6244 Builder.restoreIP(Result->getBodyIP());
6245
6246 Value *Leftover = Result->getIndVar();
6247 SmallVector<Value *> NewIndVars;
6248 NewIndVars.resize(NumLoops);
6249 for (int i = NumLoops - 1; i >= 1; --i) {
6250 Value *OrigTripCount = Loops[i]->getTripCount();
6251
6252 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6253 NewIndVars[i] = NewIndVar;
6254
6255 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6256 }
6257 // Outermost loop gets all the remaining bits.
6258 NewIndVars[0] = Leftover;
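  // For example (illustrative): collapsing two loops with trip counts 3 and 4
  // yields a single loop of 12 iterations where, for the collapsed induction
  // variable iv, the inner indvar is iv % 4 and the outer indvar is iv / 4.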
6259
6260 // Construct the loop body control flow.
6261  // We progressively construct the branch structure following the direction of
6262 // the control flow, from the leading in-between code, the loop nest body, the
6263 // trailing in-between code, and rejoining the collapsed loop's latch.
6264  // ContinueBlock and ContinuePred keep track of the source(s) of the next
6265  // edge. If ContinueBlock is set, continue with that block. If ContinuePred
6266  // is set, use its predecessors as sources.
6267 BasicBlock *ContinueBlock = Result->getBody();
6268 BasicBlock *ContinuePred = nullptr;
6269 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6270 BasicBlock *NextSrc) {
6271 if (ContinueBlock)
6272 redirectTo(ContinueBlock, Dest, DL);
6273 else
6274 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6275
6276 ContinueBlock = nullptr;
6277 ContinuePred = NextSrc;
6278 };
6279
6280 // The code before the nested loop of each level.
6281 // Because we are sinking it into the nest, it will be executed more often
6282  // than the original loop. More sophisticated schemes could keep track of what
6283 // the in-between code is and instantiate it only once per thread.
6284 for (size_t i = 0; i < NumLoops - 1; ++i)
6285 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6286
6287 // Connect the loop nest body.
6288 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6289
6290 // The code after the nested loop at each level.
6291 for (size_t i = NumLoops - 1; i > 0; --i)
6292 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6293
6294 // Connect the finished loop to the collapsed loop latch.
6295 ContinueWith(Result->getLatch(), nullptr);
6296
6297 // Replace the input loops with the new collapsed loop.
6298 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6299 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6300
6301 // Replace the input loop indvars with the derived ones.
6302 for (size_t i = 0; i < NumLoops; ++i)
6303 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6304
6305 // Remove unused parts of the input loops.
6306 removeUnusedBlocksFromParent(OldControlBBs);
6307
6308 for (CanonicalLoopInfo *L : Loops)
6309 L->invalidate();
6310
6311#ifndef NDEBUG
6312 Result->assertOK();
6313#endif
6314 return Result;
6315}
6316
6317std::vector<CanonicalLoopInfo *>
6318 OpenMPIRBuilder::tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
6319 ArrayRef<Value *> TileSizes) {
6320 assert(TileSizes.size() == Loops.size() &&
6321 "Must pass as many tile sizes as there are loops");
6322 int NumLoops = Loops.size();
6323 assert(NumLoops >= 1 && "At least one loop to tile required");
6324
6325 CanonicalLoopInfo *OutermostLoop = Loops.front();
6326 CanonicalLoopInfo *InnermostLoop = Loops.back();
6327 Function *F = OutermostLoop->getBody()->getParent();
6328 BasicBlock *InnerEnter = InnermostLoop->getBody();
6329 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6330
6331 // Loop control blocks that may become orphaned later.
6332 SmallVector<BasicBlock *, 12> OldControlBBs;
6333 OldControlBBs.reserve(6 * Loops.size());
6334   for (CanonicalLoopInfo *Loop : Loops)
6335 Loop->collectControlBlocks(OldControlBBs);
6336
6337 // Collect original trip counts and induction variable to be accessible by
6338 // index. Also, the structure of the original loops is not preserved during
6339 // the construction of the tiled loops, so do it before we scavenge the BBs of
6340 // any original CanonicalLoopInfo.
6341 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6342 for (CanonicalLoopInfo *L : Loops) {
6343 assert(L->isValid() && "All input loops must be valid canonical loops");
6344 OrigTripCounts.push_back(L->getTripCount());
6345 OrigIndVars.push_back(L->getIndVar());
6346 }
6347
6348 // Collect the code between loop headers. These may contain SSA definitions
6349  // that are used in the loop nest body. To be usable within the innermost
6350 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6351 // these instructions may be executed more often than before the tiling.
6352 // TODO: It would be sufficient to only sink them into body of the
6353 // corresponding tile loop.
6354   SmallVector<std::pair<BasicBlock *, BasicBlock *>, 4> InbetweenCode;
6355 for (int i = 0; i < NumLoops - 1; ++i) {
6356 CanonicalLoopInfo *Surrounding = Loops[i];
6357 CanonicalLoopInfo *Nested = Loops[i + 1];
6358
6359 BasicBlock *EnterBB = Surrounding->getBody();
6360 BasicBlock *ExitBB = Nested->getHeader();
6361 InbetweenCode.emplace_back(EnterBB, ExitBB);
6362 }
6363
6364 // Compute the trip counts of the floor loops.
6365 Builder.SetCurrentDebugLocation(DL);
6366 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6367 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6368 for (int i = 0; i < NumLoops; ++i) {
6369 Value *TileSize = TileSizes[i];
6370 Value *OrigTripCount = OrigTripCounts[i];
6371 Type *IVType = OrigTripCount->getType();
6372
6373 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6374 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6375
6376 // 0 if tripcount divides the tilesize, 1 otherwise.
6377 // 1 means we need an additional iteration for a partial tile.
6378 //
6379 // Unfortunately we cannot just use the roundup-formula
6380 // (tripcount + tilesize - 1)/tilesize
6381  // because the summation might overflow. We do not want to introduce
6382  // undefined behavior where the untiled loop nest did not have any.
6383 Value *FloorTripOverflow =
6384 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6385
6386 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6387 Value *FloorTripCount =
6388 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6389 "omp_floor" + Twine(i) + ".tripcount", true);
6390
6391 // Remember some values for later use.
6392 FloorCompleteCount.push_back(FloorCompleteTripCount);
6393 FloorCount.push_back(FloorTripCount);
6394 FloorRems.push_back(FloorTripRem);
6395 }
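  // For example (illustrative): an original trip count of 10 with a tile size
  // of 4 gives FloorCompleteTripCount = 2 and FloorTripRem = 2, so the floor
  // loop runs 2 + 1 = 3 times and its last iteration covers the partial tile.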
6396
6397 // Generate the new loop nest, from the outermost to the innermost.
6398 std::vector<CanonicalLoopInfo *> Result;
6399 Result.reserve(NumLoops * 2);
6400
6401 // The basic block of the surrounding loop that enters the nest generated
6402 // loop.
6403 BasicBlock *Enter = OutermostLoop->getPreheader();
6404
6405 // The basic block of the surrounding loop where the inner code should
6406 // continue.
6407 BasicBlock *Continue = OutermostLoop->getAfter();
6408
6409 // Where the next loop basic block should be inserted.
6410 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6411
6412 auto EmbeddNewLoop =
6413 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6414 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6415 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6416 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6417 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6418 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6419
6420 // Setup the position where the next embedded loop connects to this loop.
6421 Enter = EmbeddedLoop->getBody();
6422 Continue = EmbeddedLoop->getLatch();
6423 OutroInsertBefore = EmbeddedLoop->getLatch();
6424 return EmbeddedLoop;
6425 };
6426
6427 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6428 const Twine &NameBase) {
6429 for (auto P : enumerate(TripCounts)) {
6430 CanonicalLoopInfo *EmbeddedLoop =
6431 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6432 Result.push_back(EmbeddedLoop);
6433 }
6434 };
6435
6436 EmbeddNewLoops(FloorCount, "floor");
6437
6438 // Within the innermost floor loop, emit the code that computes the tile
6439 // sizes.
6440 Builder.SetInsertPoint(Enter->getTerminator());
6441 SmallVector<Value *, 4> TileCounts;
6442 for (int i = 0; i < NumLoops; ++i) {
6443 CanonicalLoopInfo *FloorLoop = Result[i];
6444 Value *TileSize = TileSizes[i];
6445
6446 Value *FloorIsEpilogue =
6447 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6448 Value *TileTripCount =
6449 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6450
6451 TileCounts.push_back(TileTripCount);
6452 }
6453
6454 // Create the tile loops.
6455 EmbeddNewLoops(TileCounts, "tile");
6456
6457 // Insert the inbetween code into the body.
6458 BasicBlock *BodyEnter = Enter;
6459 BasicBlock *BodyEntered = nullptr;
6460 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6461 BasicBlock *EnterBB = P.first;
6462 BasicBlock *ExitBB = P.second;
6463
6464 if (BodyEnter)
6465 redirectTo(BodyEnter, EnterBB, DL);
6466 else
6467 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6468
6469 BodyEnter = nullptr;
6470 BodyEntered = ExitBB;
6471 }
6472
6473 // Append the original loop nest body into the generated loop nest body.
6474 if (BodyEnter)
6475 redirectTo(BodyEnter, InnerEnter, DL);
6476 else
6477 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6478   redirectAllPredecessorsTo(InnerLatch, Result.back()->getLatch(), DL);
6479
6480 // Replace the original induction variable with an induction variable computed
6481 // from the tile and floor induction variables.
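  // That is (illustrative): orig_iv = floor_iv * tile_size + tile_iv, using
  // NUW arithmetic since the result never exceeds the original trip count.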
6482 Builder.restoreIP(Result.back()->getBodyIP());
6483 for (int i = 0; i < NumLoops; ++i) {
6484 CanonicalLoopInfo *FloorLoop = Result[i];
6485 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6486 Value *OrigIndVar = OrigIndVars[i];
6487 Value *Size = TileSizes[i];
6488
6489 Value *Scale =
6490 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6491 Value *Shift =
6492 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6493 OrigIndVar->replaceAllUsesWith(Shift);
6494 }
6495
6496 // Remove unused parts of the original loops.
6497 removeUnusedBlocksFromParent(OldControlBBs);
6498
6499 for (CanonicalLoopInfo *L : Loops)
6500 L->invalidate();
6501
6502#ifndef NDEBUG
6503 for (CanonicalLoopInfo *GenL : Result)
6504 GenL->assertOK();
6505#endif
6506 return Result;
6507}
6508
6509/// Attach metadata \p Properties to the basic block described by \p BB. If the
6510/// basic block already has metadata, the basic block properties are appended.
6511 static void addBasicBlockMetadata(BasicBlock *BB,
6512 ArrayRef<Metadata *> Properties) {
6513 // Nothing to do if no property to attach.
6514 if (Properties.empty())
6515 return;
6516
6517 LLVMContext &Ctx = BB->getContext();
6518 SmallVector<Metadata *> NewProperties;
6519 NewProperties.push_back(nullptr);
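  // Loop properties live in a distinct node that references itself as its
  // first operand, e.g. (illustrative IR): !0 = distinct !{!0, !1, !2}. The
  // nullptr pushed above reserves that slot; it is patched below.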
6520
6521 // If the basic block already has metadata, prepend it to the new metadata.
6522 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6523 if (Existing)
6524 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6525
6526 append_range(NewProperties, Properties);
6527 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6528 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6529
6530 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6531}
6532
6533/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6534/// loop already has metadata, the loop properties are appended.
6535 static void addLoopMetadata(CanonicalLoopInfo *Loop,
6536 ArrayRef<Metadata *> Properties) {
6537 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6538
6539 // Attach metadata to the loop's latch
6540 BasicBlock *Latch = Loop->getLatch();
6541 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6542 addBasicBlockMetadata(Latch, Properties);
6543}
6544
6545/// Attach llvm.access.group metadata to the memref instructions of \p Block
6546 static void addSimdMetadata(BasicBlock *Block, MDNode *AccessGroup,
6547 LoopInfo &LI) {
6548 for (Instruction &I : *Block) {
6549 if (I.mayReadOrWriteMemory()) {
6550  // TODO: This instruction may already have an access group from
6551 // other pragmas e.g. #pragma clang loop vectorize. Append
6552 // so that the existing metadata is not overwritten.
6553 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6554 }
6555 }
6556}
6557
6558 void OpenMPIRBuilder::unrollLoopFull(DebugLoc, CanonicalLoopInfo *Loop) {
6559 LLVMContext &Ctx = Builder.getContext();
6560   addLoopMetadata(
6561 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6562 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6563}
6564
6565 void OpenMPIRBuilder::unrollLoopHeuristic(DebugLoc, CanonicalLoopInfo *Loop) {
6566 LLVMContext &Ctx = Builder.getContext();
6567   addLoopMetadata(
6568 Loop, {
6569 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6570 });
6571}
6572
6573void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6574 Value *IfCond, ValueToValueMapTy &VMap,
6575 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6576 const Twine &NamePrefix) {
6577 Function *F = CanonicalLoop->getFunction();
6578
6579 // We can't do
6580 // if (cond) {
6581 // simd_loop;
6582 // } else {
6583 // non_simd_loop;
6584 // }
6585  // because then the CanonicalLoopInfo would only point to one of the loops,
6586  // causing other constructs operating on the same loop to malfunction.
6587 // Instead generate
6588 // while (...) {
6589 // if (cond) {
6590 // simd_body;
6591 // } else {
6592 // not_simd_body;
6593 // }
6594 // }
6595 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6596 // body at -O3
6597
6598 // Define where if branch should be inserted
6599 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6600
6601 // Create additional blocks for the if statement
6602 BasicBlock *Cond = SplitBeforeIt->getParent();
6603 llvm::LLVMContext &C = Cond->getContext();
6604   BasicBlock *ThenBlock = BasicBlock::Create(
6605 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6606   BasicBlock *ElseBlock = BasicBlock::Create(
6607 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6608
6609 // Create if condition branch.
6610 Builder.SetInsertPoint(SplitBeforeIt);
6611 Instruction *BrInstr =
6612 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6613 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6614 // Then block contains branch to omp loop body which needs to be vectorized
6615 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6616 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6617
6618 Builder.SetInsertPoint(ElseBlock);
6619
6620 // Clone loop for the else branch
6621   SmallVector<BasicBlock *, 8> NewBlocks;
6622
6623 SmallVector<BasicBlock *, 8> ExistingBlocks;
6624 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6625 ExistingBlocks.push_back(ThenBlock);
6626 ExistingBlocks.append(L->block_begin(), L->block_end());
6627 // Cond is the block that has the if clause condition
6628 // LoopCond is omp_loop.cond
6629 // LoopHeader is omp_loop.header
6630 BasicBlock *LoopCond = Cond->getUniquePredecessor();
6631 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6632 assert(LoopCond && LoopHeader && "Invalid loop structure");
6633 for (BasicBlock *Block : ExistingBlocks) {
6634 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6635 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6636 continue;
6637 }
6638 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6639
6640  // Fix the name so that it is not omp.if.then.
6641 if (Block == ThenBlock)
6642 NewBB->setName(NamePrefix + ".if.else");
6643
6644 NewBB->moveBefore(CanonicalLoop->getExit());
6645 VMap[Block] = NewBB;
6646 NewBlocks.push_back(NewBB);
6647 }
6648 remapInstructionsInBlocks(NewBlocks, VMap);
6649 Builder.CreateBr(NewBlocks.front());
6650
6651 // The loop latch must have only one predecessor. Currently it is branched to
6652 // from both the 'then' and 'else' branches.
6653 L->getLoopLatch()->splitBasicBlock(
6654 L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
6655
6656 // Ensure that the then block is added to the loop so we add the attributes in
6657 // the next step
6658 L->addBasicBlockToLoop(ThenBlock, LI);
6659}
6660
6661unsigned
6662 OpenMPIRBuilder::getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
6663 const StringMap<bool> &Features) {
6664 if (TargetTriple.isX86()) {
6665 if (Features.lookup("avx512f"))
6666 return 512;
6667 else if (Features.lookup("avx"))
6668 return 256;
6669 return 128;
6670 }
6671 if (TargetTriple.isPPC())
6672 return 128;
6673 if (TargetTriple.isWasm())
6674 return 128;
6675 return 0;
6676}
6677
6678 void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
6679 MapVector<Value *, Value *> AlignedVars,
6680 Value *IfCond, OrderKind Order,
6681 ConstantInt *Simdlen, ConstantInt *Safelen) {
6682 LLVMContext &Ctx = Builder.getContext();
6683
6684 Function *F = CanonicalLoop->getFunction();
6685
6686 // TODO: We should not rely on pass manager. Currently we use pass manager
6687 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6688 // object. We should have a method which returns all blocks between
6689 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6690   FunctionAnalysisManager FAM;
6691 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6692 FAM.registerPass([]() { return LoopAnalysis(); });
6693 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6694
6695 LoopAnalysis LIA;
6696 LoopInfo &&LI = LIA.run(*F, FAM);
6697
6698 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6699 if (AlignedVars.size()) {
6700 InsertPointTy IP = Builder.saveIP();
6701 for (auto &AlignedItem : AlignedVars) {
6702 Value *AlignedPtr = AlignedItem.first;
6703 Value *Alignment = AlignedItem.second;
6704 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6705 Builder.SetInsertPoint(loadInst->getNextNode());
6706 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6707 Alignment);
6708 }
6709 Builder.restoreIP(IP);
6710 }
6711
6712 if (IfCond) {
6713 ValueToValueMapTy VMap;
6714 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6715 }
6716
6717   SmallSetVector<BasicBlock *, 8> Reachable;
6718
6719 // Get the basic blocks from the loop in which memref instructions
6720 // can be found.
6721 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
6722 // preferably without running any passes.
6723 for (BasicBlock *Block : L->getBlocks()) {
6724 if (Block == CanonicalLoop->getCond() ||
6725 Block == CanonicalLoop->getHeader())
6726 continue;
6727 Reachable.insert(Block);
6728 }
6729
6730 SmallVector<Metadata *> LoopMDList;
6731
6732 // In presence of finite 'safelen', it may be unsafe to mark all
6733 // the memory instructions parallel, because loop-carried
6734 // dependences of 'safelen' iterations are possible.
6735 // If clause order(concurrent) is specified then the memory instructions
6736 // are marked parallel even if 'safelen' is finite.
6737 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6738 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6739
6740 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6741 // versions so we can't add the loop attributes in that case.
6742 if (IfCond) {
6743 // we can still add llvm.loop.parallel_access
6744 addLoopMetadata(CanonicalLoop, LoopMDList);
6745 return;
6746 }
6747
6748 // Use the above access group metadata to create loop level
6749 // metadata, which should be distinct for each loop.
6750 ConstantAsMetadata *BoolConst =
6751       ConstantAsMetadata::get(ConstantInt::getTrue(Type::getInt1Ty(Ctx)));
6752 LoopMDList.push_back(MDNode::get(
6753 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6754
6755 if (Simdlen || Safelen) {
6756 // If both simdlen and safelen clauses are specified, the value of the
6757 // simdlen parameter must be less than or equal to the value of the safelen
6758 // parameter. Therefore, use safelen only in the absence of simdlen.
6759 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6760 LoopMDList.push_back(
6761 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6762 ConstantAsMetadata::get(VectorizeWidth)}));
6763 }
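  // The list built here lowers to loop metadata of roughly this shape
  // (illustrative IR):
  //   !{!"llvm.loop.vectorize.enable", i1 true}
  //   !{!"llvm.loop.vectorize.width", i32 <simdlen or safelen>}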
6764
6765 addLoopMetadata(CanonicalLoop, LoopMDList);
6766}
6767
6768/// Create the TargetMachine object to query the backend for optimization
6769/// preferences.
6770///
6771/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6772/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6773/// needed for the LLVM pass pipeline. We use some default options to avoid
6774/// having to pass too many settings from the frontend that probably do not
6775/// matter.
6776///
6777/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
6778/// method. If we are going to use TargetMachine for more purposes, especially
6779/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
6780/// might become worth requiring front-ends to pass on their TargetMachine,
6781/// or at least cache it between methods. Note that while frontends such as Clang
6782/// have just a single main TargetMachine per translation unit, "target-cpu" and
6783/// "target-features" that determine the TargetMachine are per-function and can
6784/// be overridden using __attribute__((target("OPTIONS"))).
6785static std::unique_ptr<TargetMachine>
6786 createTargetMachine(Function *F, CodeGenOptLevel OptLevel) {
6787 Module *M = F->getParent();
6788
6789 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
6790 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
6791 const llvm::Triple &Triple = M->getTargetTriple();
6792
6793 std::string Error;
6794   const llvm::Target *TheTarget = TargetRegistry::lookupTarget(Triple, Error);
6795 if (!TheTarget)
6796 return {};
6797
6798   llvm::TargetOptions Options;
6799 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
6800 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
6801 /*CodeModel=*/std::nullopt, OptLevel));
6802}
6803
6804/// Heuristically determine the best-performant unroll factor for \p CLI. This
6805/// depends on the target processor. We are re-using the same heuristics as the
6806/// LoopUnrollPass.
6807 static unsigned computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI) {
6808 Function *F = CLI->getFunction();
6809
6810 // Assume the user requests the most aggressive unrolling, even if the rest of
6811 // the code is optimized using a lower setting.
6812   CodeGenOptLevel OptLevel = CodeGenOptLevel::Aggressive;
6813 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
6814
6815   FunctionAnalysisManager FAM;
6816 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
6817 FAM.registerPass([]() { return AssumptionAnalysis(); });
6818 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6819 FAM.registerPass([]() { return LoopAnalysis(); });
6820 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
6821 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6822 TargetIRAnalysis TIRA;
6823 if (TM)
6824 TIRA = TargetIRAnalysis(
6825 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
6826 FAM.registerPass([&]() { return TIRA; });
6827
6828 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
6829   ScalarEvolutionAnalysis SEA;
6830 ScalarEvolution &&SE = SEA.run(*F, FAM);
6831   DominatorTreeAnalysis DTA;
6832 DominatorTree &&DT = DTA.run(*F, FAM);
6833 LoopAnalysis LIA;
6834 LoopInfo &&LI = LIA.run(*F, FAM);
6835   AssumptionAnalysis ACT;
6836 AssumptionCache &&AC = ACT.run(*F, FAM);
6837   OptimizationRemarkEmitter ORE{F};
6838
6839 Loop *L = LI.getLoopFor(CLI->getHeader());
6840 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
6841
6842   TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
6843 L, SE, TTI,
6844 /*BlockFrequencyInfo=*/nullptr,
6845 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
6846 /*UserThreshold=*/std::nullopt,
6847 /*UserCount=*/std::nullopt,
6848 /*UserAllowPartial=*/true,
6849 /*UserAllowRuntime=*/true,
6850 /*UserUpperBound=*/std::nullopt,
6851 /*UserFullUnrollMaxCount=*/std::nullopt);
6852
6853 UP.Force = true;
6854
6855 // Account for additional optimizations taking place before the LoopUnrollPass
6856 // would unroll the loop.
6857   UP.Threshold *= UnrollThresholdFactor;
6858   UP.PartialThreshold *= UnrollThresholdFactor;
6859
6860 // Use normal unroll factors even if the rest of the code is optimized for
6861 // size.
6862   UP.OptSizeThreshold = UP.Threshold;
6863   UP.PartialOptSizeThreshold = UP.PartialThreshold;
6864
6865 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
6866 << " Threshold=" << UP.Threshold << "\n"
6867 << " PartialThreshold=" << UP.PartialThreshold << "\n"
6868 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
6869 << " PartialOptSizeThreshold="
6870 << UP.PartialOptSizeThreshold << "\n");
6871
6872 // Disable peeling.
6873   TargetTransformInfo::PeelingPreferences PP =
6874       gatherPeelingPreferences(L, SE, TTI,
6875 /*UserAllowPeeling=*/false,
6876 /*UserAllowProfileBasedPeeling=*/false,
6877 /*UnrollingSpecficValues=*/false);
6878
6879   SmallPtrSet<const Value *, 32> EphValues;
6880 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
6881
6882 // Assume that reads and writes to stack variables can be eliminated by
6883 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
6884 // size.
6885 for (BasicBlock *BB : L->blocks()) {
6886 for (Instruction &I : *BB) {
6887 Value *Ptr;
6888 if (auto *Load = dyn_cast<LoadInst>(&I)) {
6889 Ptr = Load->getPointerOperand();
6890 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
6891 Ptr = Store->getPointerOperand();
6892 } else
6893 continue;
6894
6895 Ptr = Ptr->stripPointerCasts();
6896
6897 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
6898 if (Alloca->getParent() == &F->getEntryBlock())
6899 EphValues.insert(&I);
6900 }
6901 }
6902 }
6903
6904 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
6905
6906 // Loop is not unrollable if the loop contains certain instructions.
6907 if (!UCE.canUnroll()) {
6908 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
6909 return 1;
6910 }
6911
6912 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
6913 << "\n");
6914
6915 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
6916 // be able to use it.
6917 int TripCount = 0;
6918 int MaxTripCount = 0;
6919 bool MaxOrZero = false;
6920 unsigned TripMultiple = 0;
6921
6922 bool UseUpperBound = false;
6923 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
6924 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP,
6925 UseUpperBound);
6926 unsigned Factor = UP.Count;
6927 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
6928
6929 // This function returns 1 to signal to not unroll a loop.
6930 if (Factor == 0)
6931 return 1;
6932 return Factor;
6933}
6934
6935 void OpenMPIRBuilder::unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop,
6936 int32_t Factor,
6937 CanonicalLoopInfo **UnrolledCLI) {
6938 assert(Factor >= 0 && "Unroll factor must not be negative");
6939
6940 Function *F = Loop->getFunction();
6941 LLVMContext &Ctx = F->getContext();
6942
6943 // If the unrolled loop is not used for another loop-associated directive, it
6944 // is sufficient to add metadata for the LoopUnrollPass.
6945 if (!UnrolledCLI) {
6946 SmallVector<Metadata *, 2> LoopMetadata;
6947 LoopMetadata.push_back(
6948 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
6949
6950 if (Factor >= 1) {
6951     ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6952 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6953 LoopMetadata.push_back(MDNode::get(
6954 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
6955 }
6956
6957 addLoopMetadata(Loop, LoopMetadata);
6958 return;
6959 }
6960
6961 // Heuristically determine the unroll factor.
6962 if (Factor == 0)
6963     Factor = computeHeuristicUnrollFactor(Loop);
6964
6965 // No change required with unroll factor 1.
6966 if (Factor == 1) {
6967 *UnrolledCLI = Loop;
6968 return;
6969 }
6970
6971 assert(Factor >= 2 &&
6972 "unrolling only makes sense with a factor of 2 or larger");
6973
6974 Type *IndVarTy = Loop->getIndVarType();
6975
6976 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
6977 // unroll the inner loop.
6978 Value *FactorVal =
6979 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
6980 /*isSigned=*/false));
6981 std::vector<CanonicalLoopInfo *> LoopNest =
6982 tileLoops(DL, {Loop}, {FactorVal});
6983 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
6984 *UnrolledCLI = LoopNest[0];
6985 CanonicalLoopInfo *InnerLoop = LoopNest[1];
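  // Conceptually (illustrative), the result has the shape
  //   for (floor_iv = 0; floor_iv < ceil(n / Factor); ++floor_iv)
  //     for (tile_iv = 0; tile_iv < tile_count; ++tile_iv) // fully unrolled
  //       body(floor_iv * Factor + tile_iv);
  // where tile_count is Factor except possibly in the last floor iteration.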
6986
6987 // LoopUnrollPass can only fully unroll loops with constant trip count.
6988 // Unroll by the unroll factor with a fallback epilog for the remainder
6989 // iterations if necessary.
6990   ConstantAsMetadata *FactorConst = ConstantAsMetadata::get(
6991 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
6992   addLoopMetadata(
6993 InnerLoop,
6994 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6995        MDNode::get(
6996 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
6997
6998#ifndef NDEBUG
6999 (*UnrolledCLI)->assertOK();
7000#endif
7001}
7002
7003 OpenMPIRBuilder::InsertPointTy
7004 OpenMPIRBuilder::createCopyPrivate(const LocationDescription &Loc,
7005 llvm::Value *BufSize, llvm::Value *CpyBuf,
7006 llvm::Value *CpyFn, llvm::Value *DidIt) {
7007 if (!updateToLocation(Loc))
7008 return Loc.IP;
7009
7010 uint32_t SrcLocStrSize;
7011 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7012 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7013 Value *ThreadId = getOrCreateThreadID(Ident);
7014
7015 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7016
7017 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7018
7019 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7020 createRuntimeFunctionCall(Fn, Args);
7021
7022 return Builder.saveIP();
7023}
7024
7025 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createSingle(
7026 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7027 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7028     ArrayRef<llvm::Function *> CPFuncs) {
7029
7030 if (!updateToLocation(Loc))
7031 return Loc.IP;
7032
7033  // If needed, allocate and initialize `DidIt` with 0.
7034 // DidIt: flag variable: 1=single thread; 0=not single thread.
7035 llvm::Value *DidIt = nullptr;
7036 if (!CPVars.empty()) {
7037 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7038 Builder.CreateStore(Builder.getInt32(0), DidIt);
7039 }
7040
7041 Directive OMPD = Directive::OMPD_single;
7042 uint32_t SrcLocStrSize;
7043 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7044 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7045 Value *ThreadId = getOrCreateThreadID(Ident);
7046 Value *Args[] = {Ident, ThreadId};
7047
7048 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7049 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7050
7051 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7052 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7053
7054 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7055 if (Error Err = FiniCB(IP))
7056 return Err;
7057
7058 // The thread that executes the single region must set `DidIt` to 1.
7059 // This is used by __kmpc_copyprivate, to know if the caller is the
7060 // single thread or not.
7061 if (DidIt)
7062 Builder.CreateStore(Builder.getInt32(1), DidIt);
7063
7064 return Error::success();
7065 };
7066
7067 // generates the following:
7068 // if (__kmpc_single()) {
7069 // .... single region ...
7070 // __kmpc_end_single
7071 // }
7072 // __kmpc_copyprivate
7073 // __kmpc_barrier
7074
7075 InsertPointOrErrorTy AfterIP =
7076 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7077 /*Conditional*/ true,
7078 /*hasFinalize*/ true);
7079 if (!AfterIP)
7080 return AfterIP.takeError();
7081
7082 if (DidIt) {
7083 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7084 // NOTE BufSize is currently unused, so just pass 0.
7085       createCopyPrivate(LocationDescription(Builder.saveIP(), Loc.DL),
7086 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7087 CPFuncs[I], DidIt);
7088 // NOTE __kmpc_copyprivate already inserts a barrier
7089 } else if (!IsNowait) {
7090 InsertPointOrErrorTy AfterIP =
7091         createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
7092 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7093 /* CheckCancelFlag */ false);
7094 if (!AfterIP)
7095 return AfterIP.takeError();
7096 }
7097 return Builder.saveIP();
7098}
7099
7100 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createCritical(
7101 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7102 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7103
7104 if (!updateToLocation(Loc))
7105 return Loc.IP;
7106
7107 Directive OMPD = Directive::OMPD_critical;
7108 uint32_t SrcLocStrSize;
7109 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7110 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7111 Value *ThreadId = getOrCreateThreadID(Ident);
7112 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7113 Value *Args[] = {Ident, ThreadId, LockVar};
7114
7115 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7116 Function *RTFn = nullptr;
7117 if (HintInst) {
7118 // Add Hint to entry Args and create call
7119 EnterArgs.push_back(HintInst);
7120 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7121 } else {
7122 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7123 }
7124 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7125
7126 Function *ExitRTLFn =
7127 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7128 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7129
7130 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7131 /*Conditional*/ false, /*hasFinalize*/ true);
7132}
7133
7134 OpenMPIRBuilder::InsertPointTy
7135 OpenMPIRBuilder::createOrderedDepend(const LocationDescription &Loc,
7136 InsertPointTy AllocaIP, unsigned NumLoops,
7137 ArrayRef<llvm::Value *> StoreValues,
7138 const Twine &Name, bool IsDependSource) {
7139 assert(
7140 llvm::all_of(StoreValues,
7141 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7142 "OpenMP runtime requires depend vec with i64 type");
7143
7144 if (!updateToLocation(Loc))
7145 return Loc.IP;
7146
7147 // Allocate space for vector and generate alloc instruction.
7148 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7149 Builder.restoreIP(AllocaIP);
7150 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7151 ArgsBase->setAlignment(Align(8));
7152   updateToLocation(Loc);
7153
7154 // Store the index value with offset in depend vector.
7155 for (unsigned I = 0; I < NumLoops; ++I) {
7156 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7157 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7158 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7159 STInst->setAlignment(Align(8));
7160 }
7161
7162 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7163 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
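  // For example (illustrative): for ordered(2) with depend(source: i, j), the
  // vector holds {i, j} and is passed to __kmpc_doacross_post; the matching
  // depend(sink: ...) form lowers to __kmpc_doacross_wait below instead.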
7164
7165 uint32_t SrcLocStrSize;
7166 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7167 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7168 Value *ThreadId = getOrCreateThreadID(Ident);
7169 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7170
7171 Function *RTLFn = nullptr;
7172 if (IsDependSource)
7173 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7174 else
7175 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7176 createRuntimeFunctionCall(RTLFn, Args);
7177
7178 return Builder.saveIP();
7179}
7180
7181 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createOrderedThreadsSimd(
7182 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7183 FinalizeCallbackTy FiniCB, bool IsThreads) {
7184 if (!updateToLocation(Loc))
7185 return Loc.IP;
7186
7187 Directive OMPD = Directive::OMPD_ordered;
7188 Instruction *EntryCall = nullptr;
7189 Instruction *ExitCall = nullptr;
7190
7191 if (IsThreads) {
7192 uint32_t SrcLocStrSize;
7193 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7194 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7195 Value *ThreadId = getOrCreateThreadID(Ident);
7196 Value *Args[] = {Ident, ThreadId};
7197
7198 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7199 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7200
7201 Function *ExitRTLFn =
7202 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7203 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7204 }
7205
7206 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7207 /*Conditional*/ false, /*hasFinalize*/ true);
7208}
7209
7210OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7211 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7212 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7213 bool HasFinalize, bool IsCancellable) {
7214
7215 if (HasFinalize)
7216 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7217
7218 // Create inlined region's entry and body blocks, in preparation
7219 // for conditional creation
7220 BasicBlock *EntryBB = Builder.GetInsertBlock();
7221 Instruction *SplitPos = EntryBB->getTerminator();
7222 if (!isa_and_nonnull<BranchInst>(SplitPos))
7223 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7224 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7225 BasicBlock *FiniBB =
7226 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
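  // Block structure from here on (illustrative sketch):
  //   EntryBB -> [body] -> FiniBB ("omp_region.finalize") -> ExitBB
  //   ("omp_region.end"); for a conditional region, EntryBB instead branches
  //   around the body straight to ExitBB.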
7227
7228 Builder.SetInsertPoint(EntryBB->getTerminator());
7229 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7230
7231 // generate body
7232 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7233 /* CodeGenIP */ Builder.saveIP()))
7234 return Err;
7235
7236 // emit exit call and do any needed finalization.
7237 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7238 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7239 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7240 "Unexpected control flow graph state!!");
7241 InsertPointOrErrorTy AfterIP =
7242 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7243 if (!AfterIP)
7244 return AfterIP.takeError();
7245
7246  // If we are skipping the region of a non-conditional, remove the exit
7247  // block and clear the builder's insertion point.
7248 assert(SplitPos->getParent() == ExitBB &&
7249 "Unexpected Insertion point location!");
7250 auto merged = MergeBlockIntoPredecessor(ExitBB);
7251 BasicBlock *ExitPredBB = SplitPos->getParent();
7252 auto InsertBB = merged ? ExitPredBB : ExitBB;
7253 if (!isa_and_nonnull<BranchInst>(SplitPos))
7254 SplitPos->eraseFromParent();
7255 Builder.SetInsertPoint(InsertBB);
7256
7257 return Builder.saveIP();
7258}
7259
7260OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7261 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7262  // If there is nothing to do, return the current insertion point.
7263 if (!Conditional || !EntryCall)
7264 return Builder.saveIP();
7265
7266 BasicBlock *EntryBB = Builder.GetInsertBlock();
7267 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7268 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7269 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7270
7271 // Emit thenBB and set the Builder's insertion point there for
7272 // body generation next. Place the block after the current block.
7273 Function *CurFn = EntryBB->getParent();
7274 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7275
7276 // Move Entry branch to end of ThenBB, and replace with conditional
7277 // branch (If-stmt)
7278 Instruction *EntryBBTI = EntryBB->getTerminator();
7279 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7280 EntryBBTI->removeFromParent();
7281 Builder.SetInsertPoint(UI);
7282 Builder.Insert(EntryBBTI);
7283 UI->eraseFromParent();
7284 Builder.SetInsertPoint(ThenBB->getTerminator());
7285
7286 // return an insertion point to ExitBB.
7287 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7288}
7289
7290OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7291 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7292 bool HasFinalize) {
7293
7294 Builder.restoreIP(FinIP);
7295
7296 // If there is finalization to do, emit it before the exit call
7297 if (HasFinalize) {
7298 assert(!FinalizationStack.empty() &&
7299 "Unexpected finalization stack state!");
7300
7301 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7302 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7303
7304 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7305 return std::move(Err);
7306
7307 // Exit condition: insertion point is before the terminator of the new Fini
7308 // block
7309 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7310 }
7311
7312 if (!ExitCall)
7313 return Builder.saveIP();
7314
7315  // Place the exit call as the last instruction before the finalization block terminator.
7316 ExitCall->removeFromParent();
7317 Builder.Insert(ExitCall);
7318
7319 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7320 ExitCall->getIterator());
7321}
7322
7323 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createCopyinClauseBlocks(
7324 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7325 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7326 if (!IP.isSet())
7327 return IP;
7328
7329   IRBuilder<>::InsertPointGuard IPG(Builder);
7330
7331 // creates the following CFG structure
7332 // OMP_Entry : (MasterAddr != PrivateAddr)?
7333 // F T
7334 // | \
7335  // |     copyin.not.master
7336 // | /
7337 // v /
7338 // copyin.not.master.end
7339 // |
7340 // v
7341 // OMP.Entry.Next
7342
7343 BasicBlock *OMP_Entry = IP.getBlock();
7344 Function *CurFn = OMP_Entry->getParent();
7345 BasicBlock *CopyBegin =
7346 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7347 BasicBlock *CopyEnd = nullptr;
7348
7349  // If the entry block is terminated, split it to preserve the branch to the
7350  // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything as is.
7351 if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
7352 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7353 "copyin.not.master.end");
7354 OMP_Entry->getTerminator()->eraseFromParent();
7355 } else {
7356 CopyEnd =
7357 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7358 }
7359
7360 Builder.SetInsertPoint(OMP_Entry);
7361 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7362 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7363 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7364 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7365
7366 Builder.SetInsertPoint(CopyBegin);
7367 if (BranchtoEnd)
7368 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7369
7370 return Builder.saveIP();
7371}
7372
7373CallInst *OpenMPIRBuilder::createOMPAlloc(const LocationDescription &Loc,
7374 Value *Size, Value *Allocator,
7375 std::string Name) {
7376 IRBuilder<>::InsertPointGuard IPG(Builder);
7377 updateToLocation(Loc);
7378
7379 uint32_t SrcLocStrSize;
7380 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7381 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7382 Value *ThreadId = getOrCreateThreadID(Ident);
7383 Value *Args[] = {ThreadId, Size, Allocator};
7384
7385 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7386
7387 return createRuntimeFunctionCall(Fn, Args, Name);
7388}
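// For reference, the sequence emitted above looks roughly like the following
// IR (value names are illustrative):
//   %gtid = call i32 @__kmpc_global_thread_num(ptr @ident)
//   %mem = call ptr @__kmpc_alloc(i32 %gtid, i64 %size, ptr %allocator)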
7389
7390CallInst *OpenMPIRBuilder::createOMPFree(const LocationDescription &Loc,
7391 Value *Addr, Value *Allocator,
7392 std::string Name) {
7393 IRBuilder<>::InsertPointGuard IPG(Builder);
7394 updateToLocation(Loc);
7395
7396 uint32_t SrcLocStrSize;
7397 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7398 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7399 Value *ThreadId = getOrCreateThreadID(Ident);
7400 Value *Args[] = {ThreadId, Addr, Allocator};
7401 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7402 return createRuntimeFunctionCall(Fn, Args, Name);
7403}
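// The matching deallocation above is roughly:
//   call void @__kmpc_free(i32 %gtid, ptr %addr, ptr %allocator)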
7404
7405CallInst *OpenMPIRBuilder::createOMPInteropInit(
7406 const LocationDescription &Loc, Value *InteropVar,
7407 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7408 Value *DependenceAddress, bool HaveNowaitClause) {
7409 IRBuilder<>::InsertPointGuard IPG(Builder);
7410 updateToLocation(Loc);
7411
7412 uint32_t SrcLocStrSize;
7413 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7414 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7415 Value *ThreadId = getOrCreateThreadID(Ident);
7416 if (Device == nullptr)
7417 Device = Constant::getAllOnesValue(Int32);
7418 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7419 if (NumDependences == nullptr) {
7420 NumDependences = ConstantInt::get(Int32, 0);
7421 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7422 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7423 }
7424 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7425 Value *Args[] = {
7426 Ident, ThreadId, InteropVar, InteropTypeVal,
7427 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7428
7429 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7430
7431 return createRuntimeFunctionCall(Fn, Args);
7432}
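// With Device and NumDependences left null, the defaults above yield roughly
// (the interop type operand depends on the clause):
//   call void @__tgt_interop_init(ptr @ident, i32 %gtid, ptr %interop,
//                                 i32 %type, i32 -1, i32 0, ptr null, i32 0)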
7433
7434CallInst *OpenMPIRBuilder::createOMPInteropDestroy(
7435 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7436 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7437 IRBuilder<>::InsertPointGuard IPG(Builder);
7438 updateToLocation(Loc);
7439
7440 uint32_t SrcLocStrSize;
7441 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7442 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7443 Value *ThreadId = getOrCreateThreadID(Ident);
7444 if (Device == nullptr)
7445 Device = Constant::getAllOnesValue(Int32);
7446 if (NumDependences == nullptr) {
7447 NumDependences = ConstantInt::get(Int32, 0);
7448 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7449 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7450 }
7451 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7452 Value *Args[] = {
7453 Ident, ThreadId, InteropVar, Device,
7454 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7455
7456 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7457
7458 return createRuntimeFunctionCall(Fn, Args);
7459}
7460
7461CallInst *OpenMPIRBuilder::createOMPInteropUse(const LocationDescription &Loc,
7462 Value *InteropVar, Value *Device,
7463 Value *NumDependences,
7464 Value *DependenceAddress,
7465 bool HaveNowaitClause) {
7466 IRBuilder<>::InsertPointGuard IPG(Builder);
7467 updateToLocation(Loc);
7468 uint32_t SrcLocStrSize;
7469 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7470 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7471 Value *ThreadId = getOrCreateThreadID(Ident);
7472 if (Device == nullptr)
7473 Device = Constant::getAllOnesValue(Int32);
7474 if (NumDependences == nullptr) {
7475 NumDependences = ConstantInt::get(Int32, 0);
7476 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7477 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7478 }
7479 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7480 Value *Args[] = {
7481 Ident, ThreadId, InteropVar, Device,
7482 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7483
7484 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7485
7486 return createRuntimeFunctionCall(Fn, Args);
7487}
7488
7489CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
7490 const LocationDescription &Loc, llvm::Value *Pointer,
7491 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7492 IRBuilder<>::InsertPointGuard IPG(Builder);
7493 updateToLocation(Loc);
7494
7495 uint32_t SrcLocStrSize;
7496 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7497 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7498 Value *ThreadId = getOrCreateThreadID(Ident);
7499 Constant *ThreadPrivateCache =
7500 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7501 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7502
7503 Function *Fn =
7504 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7505
7506 return createRuntimeFunctionCall(Fn, Args);
7507}
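// A sketch of the emitted call; the cache variable is an internal global
// created on demand from Name (names illustrative):
//   %priv = call ptr @__kmpc_threadprivate_cached(ptr @ident, i32 %gtid,
//                                                 ptr %var, i64 %size,
//                                                 ptr @var.cache)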
7508
7509OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createTargetInit(
7510 const LocationDescription &Loc,
7511 const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs) {
7512 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7513 "expected num_threads and num_teams to be specified");
7514
7515 if (!updateToLocation(Loc))
7516 return Loc.IP;
7517
7518 uint32_t SrcLocStrSize;
7519 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7520 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7521 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
7522 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7523 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7524 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7525 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7526
7527 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7528 Function *Kernel = DebugKernelWrapper;
7529
7530 // We need to strip the debug prefix to get the correct kernel name.
7531 StringRef KernelName = Kernel->getName();
7532 const std::string DebugPrefix = "_debug__";
7533 if (KernelName.ends_with(DebugPrefix)) {
7534 KernelName = KernelName.drop_back(DebugPrefix.length());
7535 Kernel = M.getFunction(KernelName);
7536 assert(Kernel && "Expected the real kernel to exist");
7537 }
7538
7539 // Manifest the launch configuration in the metadata matching the kernel
7540 // environment.
7541 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7542 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7543
7544 // If MaxThreads not set, select the maximum between the default workgroup
7545 // size and the MinThreads value.
7546 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7547 if (MaxThreadsVal < 0)
7548 MaxThreadsVal = std::max(
7549 int32_t(getGridValue(T, Kernel).GV_Default_WG_Size), Attrs.MinThreads);
7550
7551 if (MaxThreadsVal > 0)
7552 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7553
7554 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7555 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7556 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7557 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7558 Constant *ReductionDataSize =
7559 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7560 Constant *ReductionBufferLength =
7561 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7562
7563 Function *Fn = getOrCreateRuntimeFunctionPtr(
7564 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7565 const DataLayout &DL = Fn->getDataLayout();
7566
7567 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7568 Constant *DynamicEnvironmentInitializer =
7569 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7570 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7571 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7572 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7573 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7574 DL.getDefaultGlobalsAddressSpace());
7575 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7576
7577 Constant *DynamicEnvironment =
7578 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7579 ? DynamicEnvironmentGV
7580 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7581 DynamicEnvironmentPtr);
7582
7583 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7584 ConfigurationEnvironment, {
7585 UseGenericStateMachineVal,
7586 MayUseNestedParallelismVal,
7587 IsSPMDVal,
7588 MinThreads,
7589 MaxThreads,
7590 MinTeams,
7591 MaxTeams,
7592 ReductionDataSize,
7593 ReductionBufferLength,
7594 });
7595 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7596 KernelEnvironment, {
7597 ConfigurationEnvironmentInitializer,
7598 Ident,
7599 DynamicEnvironment,
7600 });
7601 std::string KernelEnvironmentName =
7602 (KernelName + "_kernel_environment").str();
7603 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7604 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7605 KernelEnvironmentInitializer, KernelEnvironmentName,
7606 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7607 DL.getDefaultGlobalsAddressSpace());
7608 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7609
7610 Constant *KernelEnvironment =
7611 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7612 ? KernelEnvironmentGV
7613 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7614 KernelEnvironmentPtr);
7615 Value *KernelLaunchEnvironment = DebugKernelWrapper->getArg(0);
7616 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7617 KernelLaunchEnvironment =
7618 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7619 ? KernelLaunchEnvironment
7620 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7621 KernelLaunchEnvParamTy);
7622 CallInst *ThreadKind = createRuntimeFunctionCall(
7623 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7624
7625 Value *ExecUserCode = Builder.CreateICmpEQ(
7626 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7627 "exec_user_code");
7628
7629 // ThreadKind = __kmpc_target_init(...)
7630 // if (ThreadKind == -1)
7631 // user_code
7632 // else
7633 // return;
7634
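// In IR, for a kernel named foo, this check becomes roughly (value names
// illustrative):
//   %tk = call i32 @__kmpc_target_init(ptr @foo_kernel_environment, ptr %dyn)
//   %exec_user_code = icmp eq i32 %tk, -1
//   br i1 %exec_user_code, label %user_code.entry, label %worker.exit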
7635 auto *UI = Builder.CreateUnreachable();
7636 BasicBlock *CheckBB = UI->getParent();
7637 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7638
7639 BasicBlock *WorkerExitBB = BasicBlock::Create(
7640 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7641 Builder.SetInsertPoint(WorkerExitBB);
7642 Builder.CreateRetVoid();
7643
7644 auto *CheckBBTI = CheckBB->getTerminator();
7645 Builder.SetInsertPoint(CheckBBTI);
7646 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7647
7648 CheckBBTI->eraseFromParent();
7649 UI->eraseFromParent();
7650
7651 // Continue in the "user_code" block, see diagram above and in
7652 // openmp/libomptarget/deviceRTLs/common/include/target.h.
7653 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7654}
7655
7656void OpenMPIRBuilder::createTargetDeinit(const LocationDescription &Loc,
7657 int32_t TeamsReductionDataSize,
7658 int32_t TeamsReductionBufferLength) {
7659 if (!updateToLocation(Loc))
7660 return;
7661
7662 Function *Fn = getOrCreateRuntimeFunctionPtr(
7663 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7664
7665 createRuntimeFunctionCall(Fn, {});
7666
7667 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7668 return;
7669
7670 Function *Kernel = Builder.GetInsertBlock()->getParent();
7671 // We need to strip the debug prefix to get the correct kernel name.
7672 StringRef KernelName = Kernel->getName();
7673 const std::string DebugPrefix = "_debug__";
7674 if (KernelName.ends_with(DebugPrefix))
7675 KernelName = KernelName.drop_back(DebugPrefix.length());
7676 auto *KernelEnvironmentGV =
7677 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7678 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7679 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
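// Note: indices {0, 7} and {0, 8} below address the ReductionDataSize and
// ReductionBufferLength members of the ConfigurationEnvironment, which is
// member 0 of the kernel environment struct set up in createTargetInit.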
7680 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7681 KernelEnvironmentInitializer,
7682 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7683 NewInitializer = ConstantFoldInsertValueInstruction(
7684 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7685 {0, 8});
7686 KernelEnvironmentGV->setInitializer(NewInitializer);
7687}
7688
7689static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7690 bool Min) {
7691 if (Kernel.hasFnAttribute(Name)) {
7692 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7693 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7694 }
7695 Kernel.addFnAttr(Name, llvm::utostr(Value));
7696}
7697
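// For example, for a kernel K, updateNVPTXAttr(K, "nvvm.maxntid", 128,
// /*Min=*/true) lowers an existing bound of 256 to 128 but leaves a smaller
// existing bound untouched.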
7698std::pair<int32_t, int32_t>
7699OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &T, Function &Kernel) {
7700 int32_t ThreadLimit =
7701 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7702
7703 if (T.isAMDGPU()) {
7704 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7705 if (!Attr.isValid() || !Attr.isStringAttribute())
7706 return {0, ThreadLimit};
7707 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7708 int32_t LB, UB;
7709 if (!llvm::to_integer(UBStr, UB, 10))
7710 return {0, ThreadLimit};
7711 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7712 if (!llvm::to_integer(LBStr, LB, 10))
7713 return {0, UB};
7714 return {LB, UB};
7715 }
7716
7717 if (Kernel.hasFnAttribute("nvvm.maxntid")) {
7718 int32_t UB = Kernel.getFnAttributeAsParsedInteger("nvvm.maxntid");
7719 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7720 }
7721 return {0, ThreadLimit};
7722}
7723
7724void OpenMPIRBuilder::writeThreadBoundsForKernel(const Triple &T,
7725 Function &Kernel, int32_t LB,
7726 int32_t UB) {
7727 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7728
7729 if (T.isAMDGPU()) {
7730 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7731 llvm::utostr(LB) + "," + llvm::utostr(UB));
7732 return;
7733 }
7734
7735 updateNVPTXAttr(Kernel, "nvvm.maxntid", UB, true);
7736}
7737
7738std::pair<int32_t, int32_t>
7739OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &Kernel) {
7740 // TODO: Read from backend annotations if available.
7741 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7742}
7743
7744void OpenMPIRBuilder::writeTeamsForKernel(const Triple &T, Function &Kernel,
7745 int32_t LB, int32_t UB) {
7746 if (T.isNVPTX())
7747 if (UB > 0)
7748 Kernel.addFnAttr("nvvm.maxclusterrank", llvm::utostr(UB));
7749 if (T.isAMDGPU())
7750 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7751
7752 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7753}
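// For example, on NVPTX a teams upper bound of 8 becomes the attribute
// "nvvm.maxclusterrank"="8", while on AMDGPU the lower bound LB is encoded
// as "amdgpu-max-num-workgroups"="LB,1,1".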
7754
7755void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7756 Function *OutlinedFn) {
7757 if (Config.isTargetDevice()) {
7758 OutlinedFn->setLinkage(GlobalValue::WeakODRLinkage);
7759 // TODO: Determine if DSO local can be set to true.
7760 OutlinedFn->setDSOLocal(false);
7761 OutlinedFn->setVisibility(GlobalValue::ProtectedVisibility);
7762 if (T.isAMDGCN())
7763 OutlinedFn->setCallingConv(CallingConv::AMDGPU_KERNEL);
7764 else if (T.isNVPTX())
7765 OutlinedFn->setCallingConv(CallingConv::PTX_Kernel);
7766 else if (T.isSPIRV())
7767 OutlinedFn->setCallingConv(CallingConv::SPIR_KERNEL);
7768 }
7769}
7770
7771Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
7772 StringRef EntryFnIDName) {
7773 if (Config.isTargetDevice()) {
7774 assert(OutlinedFn && "The outlined function must exist if embedded");
7775 return OutlinedFn;
7776 }
7777
7778 return new GlobalVariable(
7779 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
7780 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
7781}
7782
7783Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
7784 StringRef EntryFnName) {
7785 if (OutlinedFn)
7786 return OutlinedFn;
7787
7788 assert(!M.getGlobalVariable(EntryFnName, true) &&
7789 "Named kernel already exists?");
7790 return new GlobalVariable(
7791 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
7792 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
7793}
7794
7795Error OpenMPIRBuilder::emitTargetRegionFunction(
7796 TargetRegionEntryInfo &EntryInfo,
7797 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
7798 Function *&OutlinedFn, Constant *&OutlinedFnID) {
7799
7800 SmallString<64> EntryFnName;
7801 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
7802
7803 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
7804 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
7805 if (!CBResult)
7806 return CBResult.takeError();
7807 OutlinedFn = *CBResult;
7808 } else {
7809 OutlinedFn = nullptr;
7810 }
7811
7812 // If this target outline function is not an offload entry, we don't need to
7813 // register it. This may be in the case of a false if clause, or if there are
7814 // no OpenMP targets.
7815 if (!IsOffloadEntry)
7816 return Error::success();
7817
7818 std::string EntryFnIDName =
7819 Config.isTargetDevice()
7820 ? std::string(EntryFnName)
7821 : createPlatformSpecificName({EntryFnName, "region_id"});
7822
7823 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
7824 EntryFnName, EntryFnIDName);
7825 return Error::success();
7826}
7827
7828Constant *OpenMPIRBuilder::registerTargetRegionFunction(
7829 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
7830 StringRef EntryFnName, StringRef EntryFnIDName) {
7831 if (OutlinedFn)
7832 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
7833 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
7834 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
7835 OffloadInfoManager.registerTargetRegionEntryInfo(
7836 EntryInfo, EntryAddr, OutlinedFnID,
7837 OffloadEntriesInfoManager::OMPTargetRegionEntryTargetRegion);
7838 return OutlinedFnID;
7839}
7840
7841OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData(
7842 const LocationDescription &Loc, InsertPointTy AllocaIP,
7843 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
7844 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
7845 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
7846 function_ref<InsertPointOrErrorTy(InsertPointTy CodeGenIP,
7847 BodyGenTy BodyGenType)>
7848 BodyGenCB,
7849 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
7850 if (!updateToLocation(Loc))
7851 return InsertPointTy();
7852
7853 Builder.restoreIP(CodeGenIP);
7854 // Disable TargetData CodeGen on Device pass.
7855 if (Config.IsTargetDevice.value_or(false)) {
7856 if (BodyGenCB) {
7857 InsertPointOrErrorTy AfterIP =
7858 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
7859 if (!AfterIP)
7860 return AfterIP.takeError();
7861 Builder.restoreIP(*AfterIP);
7862 }
7863 return Builder.saveIP();
7864 }
7865
7866 bool IsStandAlone = !BodyGenCB;
7867 MapInfosTy *MapInfo;
7868 // Generate the code for the opening of the data environment. Capture all the
7869 // arguments of the runtime call by reference because they are used in the
7870 // closing of the region.
7871 auto BeginThenGen = [&](InsertPointTy AllocaIP,
7872 InsertPointTy CodeGenIP) -> Error {
7873 MapInfo = &GenMapInfoCB(Builder.saveIP());
7874 if (Error Err = emitOffloadingArrays(
7875 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
7876 /*IsNonContiguous=*/true, DeviceAddrCB))
7877 return Err;
7878
7879 TargetDataRTArgs RTArgs;
7880 emitOffloadingArraysArgument(Builder, RTArgs, Info);
7881
7882 // Emit the number of elements in the offloading arrays.
7883 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7884
7885 // Source location for the ident struct
7886 if (!SrcLocInfo) {
7887 uint32_t SrcLocStrSize;
7888 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7889 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7890 }
7891
7892 SmallVector<llvm::Value *, 13> OffloadingArgs = {
7893 SrcLocInfo, DeviceID,
7894 PointerNum, RTArgs.BasePointersArray,
7895 RTArgs.PointersArray, RTArgs.SizesArray,
7896 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7897 RTArgs.MappersArray};
7898
7899 if (IsStandAlone) {
7900 assert(MapperFunc && "MapperFunc missing for standalone target data");
7901
7902 auto TaskBodyCB = [&](Value *, Value *,
7903 InsertPointTy) -> Error {
7904 if (Info.HasNoWait) {
7905 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
7906 llvm::Constant::getNullValue(VoidPtr),
7907 llvm::Constant::getNullValue(Int32),
7908 llvm::Constant::getNullValue(VoidPtr)});
7909 }
7910
7911 createRuntimeFunctionCall(getOrCreateRuntimeFunctionPtr(*MapperFunc),
7912 OffloadingArgs);
7913
7914 if (Info.HasNoWait) {
7915 BasicBlock *OffloadContBlock =
7916 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
7917 Function *CurFn = Builder.GetInsertBlock()->getParent();
7918 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
7919 Builder.restoreIP(Builder.saveIP());
7920 }
7921 return Error::success();
7922 };
7923
7924 bool RequiresOuterTargetTask = Info.HasNoWait;
7925 if (!RequiresOuterTargetTask)
7926 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
7927 /*TargetTaskAllocaIP=*/{}));
7928 else
7929 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
7930 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
7931 } else {
7932 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
7933 omp::OMPRTL___tgt_target_data_begin_mapper);
7934
7935 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
7936
7937 for (auto DeviceMap : Info.DevicePtrInfoMap) {
7938 if (isa<AllocaInst>(DeviceMap.second.second)) {
7939 auto *LI =
7940 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
7941 Builder.CreateStore(LI, DeviceMap.second.second);
7942 }
7943 }
7944
7945 // If device pointer privatization is required, emit the body of the
7946 // region here. It will have to be duplicated: with and without
7947 // privatization.
7948 InsertPointOrErrorTy AfterIP =
7949 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
7950 if (!AfterIP)
7951 return AfterIP.takeError();
7952 Builder.restoreIP(*AfterIP);
7953 }
7954 return Error::success();
7955 };
7956
7957 // If we need device pointer privatization, we need to emit the body of the
7958 // region with no privatization in the 'else' branch of the conditional.
7959 // Otherwise, we don't have to do anything.
7960 auto BeginElseGen = [&](InsertPointTy AllocaIP,
7961 InsertPointTy CodeGenIP) -> Error {
7962 InsertPointOrErrorTy AfterIP =
7963 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
7964 if (!AfterIP)
7965 return AfterIP.takeError();
7966 Builder.restoreIP(*AfterIP);
7967 return Error::success();
7968 };
7969
7970 // Generate code for the closing of the data region.
7971 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
7972 TargetDataRTArgs RTArgs;
7973 Info.EmitDebug = !MapInfo->Names.empty();
7974 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
7975
7976 // Emit the number of elements in the offloading arrays.
7977 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
7978
7979 // Source location for the ident struct
7980 if (!SrcLocInfo) {
7981 uint32_t SrcLocStrSize;
7982 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7983 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7984 }
7985
7986 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
7987 PointerNum, RTArgs.BasePointersArray,
7988 RTArgs.PointersArray, RTArgs.SizesArray,
7989 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
7990 RTArgs.MappersArray};
7991 Function *EndMapperFunc =
7992 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
7993
7994 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
7995 return Error::success();
7996 };
7997
7998 // We don't have to do anything to close the region if the if clause evaluates
7999 // to false.
8000 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8001 return Error::success();
8002 };
8003
8004 Error Err = [&]() -> Error {
8005 if (BodyGenCB) {
8006 Error Err = [&]() {
8007 if (IfCond)
8008 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8009 return BeginThenGen(AllocaIP, Builder.saveIP());
8010 }();
8011
8012 if (Err)
8013 return Err;
8014
8015 // If we don't require privatization of device pointers, we emit the body
8016 // in between the runtime calls. This avoids duplicating the body code.
8017 InsertPointOrErrorTy AfterIP =
8018 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8019 if (!AfterIP)
8020 return AfterIP.takeError();
8021 restoreIPandDebugLoc(Builder, *AfterIP);
8022
8023 if (IfCond)
8024 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8025 return EndThenGen(AllocaIP, Builder.saveIP());
8026 }
8027 if (IfCond)
8028 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8029 return BeginThenGen(AllocaIP, Builder.saveIP());
8030 }();
8031
8032 if (Err)
8033 return Err;
8034
8035 return Builder.saveIP();
8036}
8037
8038FunctionCallee
8039OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool IVSigned,
8040 bool IsGPUDistribute) {
8041 assert((IVSize == 32 || IVSize == 64) &&
8042 "IV size is not compatible with the omp runtime");
8043 RuntimeFunction Name;
8044 if (IsGPUDistribute)
8045 Name = IVSize == 32
8046 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8047 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8048 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8049 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8050 else
8051 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8052 : omp::OMPRTL___kmpc_for_static_init_4u)
8053 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8054 : omp::OMPRTL___kmpc_for_static_init_8u);
8055
8056 return getOrCreateRuntimeFunction(M, Name);
8057}
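// For example, IVSize == 32 with IVSigned selects __kmpc_for_static_init_4,
// while IVSize == 64 without IVSigned selects __kmpc_for_static_init_8u (or
// the __kmpc_distribute_static_init_* counterparts for GPU distribute).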
8058
8059FunctionCallee OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize,
8060 bool IVSigned) {
8061 assert((IVSize == 32 || IVSize == 64) &&
8062 "IV size is not compatible with the omp runtime");
8063 RuntimeFunction Name = IVSize == 32
8064 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8065 : omp::OMPRTL___kmpc_dispatch_init_4u)
8066 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8067 : omp::OMPRTL___kmpc_dispatch_init_8u);
8068
8069 return getOrCreateRuntimeFunction(M, Name);
8070}
8071
8072FunctionCallee OpenMPIRBuilder::createDispatchNextFunction(unsigned IVSize,
8073 bool IVSigned) {
8074 assert((IVSize == 32 || IVSize == 64) &&
8075 "IV size is not compatible with the omp runtime");
8076 RuntimeFunction Name = IVSize == 32
8077 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8078 : omp::OMPRTL___kmpc_dispatch_next_4u)
8079 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8080 : omp::OMPRTL___kmpc_dispatch_next_8u);
8081
8082 return getOrCreateRuntimeFunction(M, Name);
8083}
8084
8085FunctionCallee OpenMPIRBuilder::createDispatchFiniFunction(unsigned IVSize,
8086 bool IVSigned) {
8087 assert((IVSize == 32 || IVSize == 64) &&
8088 "IV size is not compatible with the omp runtime");
8089 RuntimeFunction Name = IVSize == 32
8090 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8091 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8092 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8093 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8094
8095 return getOrCreateRuntimeFunction(M, Name);
8096}
8097
8098FunctionCallee OpenMPIRBuilder::createDispatchDeinitFunction() {
8099 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8100}
8101
8102static void FixupDebugInfoForOutlinedFunction(
8103 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8104 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8105
8106 DISubprogram *NewSP = Func->getSubprogram();
8107 if (!NewSP)
8108 return;
8109
8110 SmallDenseMap<DILocalVariable *, DILocalVariable *> RemappedVariables;
8111
8112 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8113 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8114 // Only use cached variable if the arg number matches. This is important
8115 // so that DIVariable created for privatized variables are not discarded.
8116 if (NewVar && (arg == NewVar->getArg()))
8117 return NewVar;
8118
8120 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8121 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8122 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8123 return NewVar;
8124 };
8125
8126 auto UpdateDebugRecord = [&](auto *DR) {
8127 DILocalVariable *OldVar = DR->getVariable();
8128 unsigned ArgNo = 0;
8129 for (auto Loc : DR->location_ops()) {
8130 auto Iter = ValueReplacementMap.find(Loc);
8131 if (Iter != ValueReplacementMap.end()) {
8132 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8133 ArgNo = std::get<1>(Iter->second) + 1;
8134 }
8135 }
8136 if (ArgNo != 0)
8137 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8138 };
8139
8140 // The location and scope of variable intrinsics and records still point to
8141 // the parent function of the target region. Update them.
8142 for (Instruction &I : instructions(Func)) {
8144 "Unexpected debug intrinsic");
8145 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
8146 UpdateDebugRecord(&DVR);
8147 }
8148 // An extra argument is passed to the device. Create the debug data for it.
8149 if (OMPBuilder.Config.isTargetDevice()) {
8150 DICompileUnit *CU = NewSP->getUnit();
8151 Module *M = Func->getParent();
8152 DIBuilder DB(*M, true, CU);
8153 DIType *VoidPtrTy =
8154 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8155 DILocalVariable *Var = DB.createParameterVariable(
8156 NewSP, "dyn_ptr", /*ArgNo*/ 1, NewSP->getFile(), /*LineNo=*/0,
8157 VoidPtrTy, /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8158 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8159 DB.insertDeclare(&(*Func->arg_begin()), Var, DB.createExpression(), Loc,
8160 &(*Func->begin()));
8161 }
8162}
8163
8164static Value *removeASCastIfPresent(Value *V) {
8165 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8166 return cast<Operator>(V)->getOperand(0);
8167 return V;
8168}
8169
8170static Expected<Function *> createOutlinedFunction(
8171 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8172 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8173 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8174 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8175 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8176 SmallVector<Type *> ParameterTypes;
8177 if (OMPBuilder.Config.isTargetDevice()) {
8178 // Add the "implicit" runtime argument we use to provide launch specific
8179 // information for target devices.
8180 auto *Int8PtrTy = PointerType::getUnqual(Builder.getContext());
8181 ParameterTypes.push_back(Int8PtrTy);
8182
8183 // All parameters to target devices are passed as pointers
8184 // or i64. This assumes 64-bit address spaces/pointers.
8185 for (auto &Arg : Inputs)
8186 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8187 ? Arg->getType()
8188 : Type::getInt64Ty(Builder.getContext()));
8189 } else {
8190 for (auto &Arg : Inputs)
8191 ParameterTypes.push_back(Arg->getType());
8192 }
8193
8194 auto BB = Builder.GetInsertBlock();
8195 auto M = BB->getModule();
8196 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8197 /*isVarArg*/ false);
8198 auto Func =
8199 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8200
8201 // Forward target-cpu and target-features function attributes from the
8202 // original function to the new outlined function.
8203 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8204
8205 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8206 if (TargetCpuAttr.isStringAttribute())
8207 Func->addFnAttr(TargetCpuAttr);
8208
8209 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8210 if (TargetFeaturesAttr.isStringAttribute())
8211 Func->addFnAttr(TargetFeaturesAttr);
8212
8213 if (OMPBuilder.Config.isTargetDevice()) {
8214 Value *ExecMode =
8215 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8216 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8217 }
8218
8219 // Save insert point.
8220 IRBuilder<>::InsertPointGuard IPG(Builder);
8221 // We will generate the entries in the outlined function but the debug
8222 // location may still be pointing to the parent function. Reset it now.
8223 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8224
8225 // Generate the region into the function.
8226 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8227 Builder.SetInsertPoint(EntryBB);
8228
8229 // Insert target init call in the device compilation pass.
8230 if (OMPBuilder.Config.isTargetDevice())
8231 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8232
8233 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8234
8235 // As we embed the user code in the middle of our target region after we
8236 // generate entry code, we must move what allocas we can into the entry
8237 // block to avoid possible breaking optimisations for device
8238 if (OMPBuilder.Config.isTargetDevice())
8239 OMPBuilder.ConstantAllocaRaiseCandidates.emplace_back(Func);
8240
8241 // Insert target deinit call in the device compilation pass.
8242 BasicBlock *OutlinedBodyBB =
8243 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8244 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = CBFunc(
8245 Builder.saveIP(),
8246 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8247 if (!AfterIP)
8248 return AfterIP.takeError();
8249 Builder.restoreIP(*AfterIP);
8250 if (OMPBuilder.Config.isTargetDevice())
8251 OMPBuilder.createTargetDeinit(Builder);
8252
8253 // Insert return instruction.
8254 Builder.CreateRetVoid();
8255
8256 // New Alloca IP at entry point of created device function.
8257 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8258 auto AllocaIP = Builder.saveIP();
8259
8260 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8261
8262 // Skip the artificial dyn_ptr on the device.
8263 const auto &ArgRange =
8264 OMPBuilder.Config.isTargetDevice()
8265 ? make_range(Func->arg_begin() + 1, Func->arg_end())
8266 : Func->args();
8267
8268 DenseMap<Value *, std::tuple<Value *, unsigned>> ValueReplacementMap;
8269
8270 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8271 // Things like GEP's can come in the form of Constants. Constants and
8272 // ConstantExpr's do not have access to the knowledge of what they're
8273 // contained in, so we must dig a little to find an instruction so we
8274 // can tell if they're used inside of the function we're outlining. We
8275 // also replace the original constant expression with an equivalent new
8276 // instruction; an instruction allows easy modification in the
8277 // following loop, as we then know the constant (now an instruction) is
8278 // owned by our target function and replaceUsesOfWith can be invoked
8279 // on it (this cannot be done with constants). A brand new one also
8280 // allows us to be cautious, as it is possible the old expression
8281 // was used inside of the function but also exists and is used externally
8282 // (unlikely by the nature of a Constant, but still possible).
8283 // NOTE: We cannot remove dead constants that have been rewritten to
8284 // instructions at this stage, we run the risk of breaking later lowering
8285 // by doing so as we could still be in the process of lowering the module
8286 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8287 // constants we have created rewritten versions of.
8288 if (auto *Const = dyn_cast<Constant>(Input))
8289 convertUsersOfConstantsToInstructions(Const, Func, false);
8290
8291 // Collect users before iterating over them to avoid invalidating the
8292 // iteration in case a user uses Input more than once (e.g. a call
8293 // instruction).
8294 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8295 // Collect all the instructions
8296 for (User *User : make_early_inc_range(Users))
8297 if (auto *Instr = dyn_cast<Instruction>(User))
8298 if (Instr->getFunction() == Func)
8299 Instr->replaceUsesOfWith(Input, InputCopy);
8300 };
8301
8302 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8303
8304 // Rewrite uses of input values to parameters.
8305 for (auto InArg : zip(Inputs, ArgRange)) {
8306 Value *Input = std::get<0>(InArg);
8307 Argument &Arg = std::get<1>(InArg);
8308 Value *InputCopy = nullptr;
8309
8310 llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
8311 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8312 if (!AfterIP)
8313 return AfterIP.takeError();
8314 Builder.restoreIP(*AfterIP);
8315 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8316
8317 // In certain cases a Global may be set up for replacement; however, this
8318 // Global may be used in multiple arguments to the kernel, just segmented
8319 // apart. For example, if we have a global array that is sectioned into
8320 // multiple mappings (technically not legal in OpenMP, but there is a case
8321 // in Fortran for Common Blocks where this is necessary), we will end up
8322 // with GEPs into this array inside the kernel that refer to the Global
8323 // but are, for all intents and purposes, technically separate arguments
8324 // to the kernel. If we have mapped a segment that requires a GEP into the
8325 // 0-th index, it will fold into a reference to the Global; if we then
8326 // encounter this folded GEP during replacement, all of the references to
8327 // the Global in the kernel will be replaced with the argument we have
8328 // generated that corresponds to it, including any other GEPs that refer
8329 // to the Global and that may be other arguments. This would invalidate
8330 // all of the preceding mapped arguments that refer to the same global in
8331 // separate segments. To prevent this, we defer global processing until
8332 // all other processing has been performed.
8333 if (llvm::isa<llvm::GlobalValue>(
8334 removeASCastIfPresent(Input))) {
8335 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8336 continue;
8337 }
8338
8339 if (isa<ConstantData>(Input))
8340 continue;
8341
8342 ReplaceValue(Input, InputCopy, Func);
8343 }
8344
8345 // Replace all of our deferred Input values, currently just Globals.
8346 for (auto Deferred : DeferredReplacement)
8347 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8348
8349 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8350 ValueReplacementMap);
8351 return Func;
8352}
8353/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8354/// of pointers containing shared data between the parent task and the created
8355/// task.
8356static Value *loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder,
8357 IRBuilderBase &Builder,
8358 Value *TaskWithPrivates,
8359 Type *TaskWithPrivatesTy) {
8360
8361 Type *TaskTy = OMPIRBuilder.Task;
8362 LLVMContext &Ctx = Builder.getContext();
8363 Value *TaskT =
8364 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8365 Value *Shareds = TaskT;
8366 // TaskWithPrivatesTy can be one of the following
8367 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8368 // %struct.privates }
8369 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8370 //
8371 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8372 // its first member has to be the task descriptor. TaskTy is the type of the
8373 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8374 // first member of TaskT, gives us the pointer to shared data.
8375 if (TaskWithPrivatesTy != TaskTy)
8376 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8377 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8378}
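// For the wrapped case this emits roughly (value names illustrative):
//   %task = getelementptr %struct.task_with_privates, ptr %twp, i32 0, i32 0
//   %shareds.gep = getelementptr %struct.kmp_task_ompbuilder_t, ptr %task,
//                                i32 0, i32 0
//   %shareds = load ptr, ptr %shareds.gep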
8379/// Create an entry point for a target task, i.e. the function invoked to run
8380/// it. It has the following signature:
8381/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8382/// This function is called from emitTargetTask once the
8383/// code to launch the target kernel has been outlined already.
8384/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8385/// into the task structure so that the deferred target task can access this
8386/// data even after the stack frame of the generating task has been rolled
8387/// back. Offloading arrays contain base pointers, pointers, sizes etc
8388/// of the data that the target kernel will access. These in effect are the
8389/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
8390static Function *emitTargetTaskProxyFunction(
8391 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8392 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8393 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8394
8395 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8396 // This is because PrivatesTy is the type of the structure in which
8397 // we pass the offloading arrays to the deferred target task.
8398 assert((!NumOffloadingArrays || PrivatesTy) &&
8399 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8400 "to privatize");
8401
8402 Module &M = OMPBuilder.M;
8403 // KernelLaunchFunction is the target launch function, i.e.
8404 // the function that sets up kernel arguments and calls
8405 // __tgt_target_kernel to launch the kernel on the device.
8406 //
8407 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8408
8409 // StaleCI is the CallInst which is the call to the outlined
8410 // target kernel launch function. If there are local live-in values
8411 // that the outlined function uses then these are aggregated into a structure
8412 // which is passed as the second argument. If there are no local live-in
8413 // values or if all values used by the outlined kernel are global variables,
8414 // then there's only one argument, the threadID. So, StaleCI can be
8415 //
8416 // %structArg = alloca { ptr, ptr }, align 8
8417 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8418 // store ptr %20, ptr %gep_, align 8
8419 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8420 // store ptr %21, ptr %gep_8, align 8
8421 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8422 //
8423 // OR
8424 //
8425 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8426 OpenMPIRBuilder::InsertPointTy IP(StaleCI->getParent(),
8427 StaleCI->getIterator());
8428
8429 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8430
8431 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8432 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8433 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8434
8435 auto ProxyFnTy =
8436 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8437 /* isVarArg */ false);
8438 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8439 ".omp_target_task_proxy_func",
8440 Builder.GetInsertBlock()->getModule());
8441 Value *ThreadId = ProxyFn->getArg(0);
8442 Value *TaskWithPrivates = ProxyFn->getArg(1);
8443 ThreadId->setName("thread.id");
8444 TaskWithPrivates->setName("task");
8445
8446 bool HasShareds = SharedArgsOperandNo > 0;
8447 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8448 BasicBlock *EntryBB =
8449 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8450 Builder.SetInsertPoint(EntryBB);
8451
8452 SmallVector<Value *> KernelLaunchArgs;
8453 KernelLaunchArgs.reserve(StaleCI->arg_size());
8454 KernelLaunchArgs.push_back(ThreadId);
8455
8456 if (HasOffloadingArrays) {
8457 assert(TaskTy != TaskWithPrivatesTy &&
8458 "If there are offloading arrays to pass to the target"
8459 "TaskTy cannot be the same as TaskWithPrivatesTy");
8460 (void)TaskTy;
8461 Value *Privates =
8462 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8463 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8464 KernelLaunchArgs.push_back(
8465 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8466 }
8467
8468 if (HasShareds) {
8469 auto *ArgStructAlloca =
8470 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8471 assert(ArgStructAlloca &&
8472 "Unable to find the alloca instruction corresponding to arguments "
8473 "for extracted function");
8474 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8475
8476 AllocaInst *NewArgStructAlloca =
8477 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8478
8479 Value *SharedsSize =
8480 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8481
8482 Value *LoadShared = loadSharedDataFromTaskDescriptor(
8483 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8484
8485 Builder.CreateMemCpy(
8486 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8487 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8488 KernelLaunchArgs.push_back(NewArgStructAlloca);
8489 }
8490 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8491 Builder.CreateRetVoid();
8492 return ProxyFn;
8493}
8494static Type *getOffloadingArrayType(Value *V) {
8495
8496 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8497 return GEP->getSourceElementType();
8498 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8499 return Alloca->getAllocatedType();
8500
8501 llvm_unreachable("Unhandled Instruction type");
8502 return nullptr;
8503}
8504// This function returns a struct that has at most two members.
8505// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8506// descriptor. The second member, if needed, is a struct containing arrays
8507// that need to be passed to the offloaded target kernel. For example,
8508// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8509// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8510// respectively, then the types created by this function are
8511//
8512// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8513// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8514// %struct.privates }
8515// %struct.task_with_privates is returned by this function.
8516// If there aren't any offloading arrays to pass to the target kernel,
8517// %struct.kmp_task_ompbuilder_t is returned.
8518static StructType *
8519createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder,
8520 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8521
8522 if (OffloadingArraysToPrivatize.empty())
8523 return OMPIRBuilder.Task;
8524
8525 SmallVector<Type *, 4> StructFieldTypes;
8526 for (Value *V : OffloadingArraysToPrivatize) {
8527 assert(V->getType()->isPointerTy() &&
8528 "Expected pointer to array to privatize. Got a non-pointer value "
8529 "instead");
8530 Type *ArrayTy = getOffloadingArrayType(V);
8531 assert(ArrayTy && "ArrayType cannot be nullptr");
8532 StructFieldTypes.push_back(ArrayTy);
8533 }
8534 StructType *PrivatesStructTy =
8535 StructType::create(StructFieldTypes, "struct.privates");
8536 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8537 "struct.task_with_privates");
8538}
8539static Error emitTargetOutlinedFunction(
8540 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8541 TargetRegionEntryInfo &EntryInfo,
8542 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8543 Function *&OutlinedFn, Constant *&OutlinedFnID,
8544 SmallVectorImpl<Value *> &Inputs,
8545 OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc,
8546 OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB) {
8547
8548 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8549 [&](StringRef EntryFnName) {
8550 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8551 EntryFnName, Inputs, CBFunc,
8552 ArgAccessorFuncCB);
8553 };
8554
8555 return OMPBuilder.emitTargetRegionFunction(
8556 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8557 OutlinedFnID);
8558}
8559
8560OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask(
8561 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8562 OpenMPIRBuilder::InsertPointTy AllocaIP,
8563 const SmallVector<llvm::OpenMPIRBuilder::DependData> &Dependencies,
8564 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8565
8566 // The following explains the code-gen scenario for the `target` directive. A
8567 // similar scenario is followed for other device-related directives (e.g.
8568 // `target enter data`), though in a simpler fashion, since we only need to
8569 // emit a task that encapsulates the proper runtime call.
8570 //
8571 // When we arrive at this function, the target region itself has been
8572 // outlined into the function OutlinedFn.
8573 // So at this point, for
8574 // --------------------------------------------------------------
8575 // void user_code_that_offloads(...) {
8576 // omp target depend(..) map(from:a) map(to:b) private(i)
8577 // do i = 1, 10
8578 // a(i) = b(i) + n
8579 // }
8580 //
8581 // --------------------------------------------------------------
8582 //
8583 // we have
8584 //
8585 // --------------------------------------------------------------
8586 //
8587 // void user_code_that_offloads(...) {
8588 // %.offload_baseptrs = alloca [2 x ptr], align 8
8589 // %.offload_ptrs = alloca [2 x ptr], align 8
8590 // %.offload_mappers = alloca [2 x ptr], align 8
8591 // ;; target region has been outlined and now we need to
8592 // ;; offload to it via a target task.
8593 // }
8594 // void outlined_device_function(ptr a, ptr b, ptr n) {
8595 // n = *n_ptr;
8596 // do i = 1, 10
8597 // a(i) = b(i) + n
8598 // }
8599 //
8600 // We have to now do the following
8601 // (i) Make an offloading call to outlined_device_function using the OpenMP
8602 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8603 // emitted by emitKernelLaunch
8604 // (ii) Create a task entry point function that calls kernel_launch_function
8605 // and is the entry point for the target task. See
8606 // '@.omp_target_task_proxy_func in the pseudocode below.
8607 // (iii) Create a task with the task entry point created in (ii)
8608 //
8609 // That is we create the following
8610 // struct task_with_privates {
8611 // struct kmp_task_ompbuilder_t task_struct;
8612 // struct privates {
8613 // [2 x ptr] ; baseptrs
8614 // [2 x ptr] ; ptrs
8615 // [2 x i64] ; sizes
8616 // }
8617 // }
8618 // void user_code_that_offloads(...) {
8619 // %.offload_baseptrs = alloca [2 x ptr], align 8
8620 // %.offload_ptrs = alloca [2 x ptr], align 8
8621 // %.offload_sizes = alloca [2 x i64], align 8
8622 //
8623 // %structArg = alloca { ptr, ptr, ptr }, align 8
8624 // %structArg[0] = a
8625 // %structArg[1] = b
8626 // %structArg[2] = &n
8627 //
8628 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8629 // sizeof(kmp_task_ompbuilder_t),
8630 // sizeof(structArg),
8631 // @.omp_target_task_proxy_func,
8632 // ...)
8633 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8634 // sizeof(structArg))
8635 // memcpy(target_task_with_privates->privates->baseptrs,
8636 // offload_baseptrs, sizeof(offload_baseptrs)
8637 // memcpy(target_task_with_privates->privates->ptrs,
8638 // offload_ptrs, sizeof(offload_ptrs)
8639 // memcpy(target_task_with_privates->privates->sizes,
8640 // offload_sizes, sizeof(offload_sizes)
8641 // dependencies_array = ...
8642 // ;; if nowait not present
8643 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8644 // call @__kmpc_omp_task_begin_if0(...)
8645 // call @.omp_target_task_proxy_func(i32 thread_id, ptr
8646 // %target_task_with_privates)
8647 // call @__kmpc_omp_task_complete_if0(...)
8648 // }
8649 //
8650 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8651 // ptr %task) {
8652 // %structArg = alloca {ptr, ptr, ptr}
8653 // %task_ptr = getelementptr(%task, 0, 0)
8654 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8655 // memcpy(%structArg, %shared_data, sizeof(%structArg))
8656 //
8657 // %offloading_arrays = getelementptr(%task, 0, 1)
8658 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8659 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8660 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8661 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8662 // %offload_sizes, %structArg)
8663 // }
8664 //
8665 // We need the proxy function because the signature of the task entry point
8666 // expected by kmpc_omp_task is always the same and will be different from
8667 // that of the kernel_launch function.
8668 //
8669 // kernel_launch_function is generated by emitKernelLaunch and has the
8670 // always_inline attribute. For this example, it'll look like so:
8671 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8672 // %offload_sizes, %structArg) alwaysinline {
8673 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8674 // ; load aggregated data from %structArg
8675 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8676 // ; offload_sizes
8677 // call i32 @__tgt_target_kernel(...,
8678 // outlined_device_function,
8679 // ptr %kernel_args)
8680 // }
8681 // void outlined_device_function(ptr a, ptr b, ptr n) {
8682 // n = *n_ptr;
8683 // do i = 1, 10
8684 // a(i) = b(i) + n
8685 // }
8686 //
8687 BasicBlock *TargetTaskBodyBB =
8688 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8689 BasicBlock *TargetTaskAllocaBB =
8690 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8691
8692 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8693 TargetTaskAllocaBB->begin());
8694 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8695
8696 OutlineInfo OI;
8697 OI.EntryBB = TargetTaskAllocaBB;
8698 OI.OuterAllocaBB = AllocaIP.getBlock();
8699
8700 // Add the thread ID argument.
8703 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8704
8705 // Generate the task body which will subsequently be outlined.
8706 Builder.restoreIP(TargetTaskBodyIP);
8707 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8708 return Err;
8709
8710 // The outliner (CodeExtractor) extracts a sequence or vector of blocks that
8711 // it is given. These blocks are enumerated by
8712 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8713 // to be outside the region. In other words, OI.ExitBlock is expected to be
8714 // the start of the region after the outlining. We used to set OI.ExitBlock
8715 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8716 // except when the task body is a single basic block. In that case,
8717 // OI.ExitBlock is set to the single task body block and will get left out of
8718 // the outlining process. So, simply create a new empty block to which we
8719 // unconditionally branch from where TaskBodyCB left off.
8720 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8721 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8722 /*IsFinished=*/true);
8723
8724 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
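 // A target task is only needed for a deferrable (nowait) offload with a
 // concrete device ID. In that case the offloading arrays must be copied into
 // the task's privates: the parent's stack frame may already be gone by the
 // time the deferred task runs.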
8725 bool NeedsTargetTask = HasNoWait && DeviceID;
8726 if (NeedsTargetTask) {
8727 for (auto *V :
8728 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8729 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8730 RTArgs.SizesArray}) {
8731 if (V && !isa<ConstantPointerNull, GlobalVariable>(V)) {
8732 OffloadingArraysToPrivatize.push_back(V);
8733 OI.ExcludeArgsFromAggregate.push_back(V);
8734 }
8735 }
8736 }
8737 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8738 DeviceID, OffloadingArraysToPrivatize](
8739 Function &OutlinedFn) mutable {
8740 assert(OutlinedFn.hasOneUse() &&
8741 "there must be a single user for the outlined function");
8742
8743 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8744
8745 // The first argument of StaleCI is always the thread id.
8746 // The next few arguments are the pointers to offloading arrays
8747 // if any. (see OffloadingArraysToPrivatize)
8748 // Finally, all other local values that are live-in into the outlined region
8749 // end up in a structure whose pointer is passed as the last argument. This
8750 // piece of data is passed in the "shared" field of the task structure. So,
8751 // we know we have to pass shareds to the task if the number of arguments is
8752 // greater than OffloadingArraysToPrivatize.size() + 1; the 1 is for the
8753 // thread id. Further, for safety, we assert that the number of arguments of
8754 // StaleCI is then exactly OffloadingArraysToPrivatize.size() + 2.
8755 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8756 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8757 assert((!HasShareds ||
8758 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8759 "Wrong number of arguments for StaleCI when shareds are present");
8760 int SharedArgOperandNo =
8761 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8762
8763 StructType *TaskWithPrivatesTy =
8764 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8765 StructType *PrivatesTy = nullptr;
8766
8767 if (!OffloadingArraysToPrivatize.empty())
8768 PrivatesTy =
8769 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
8770
8771 Function *ProxyFn = emitTargetTaskProxyFunction(
8772 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
8773 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
8774
8775 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
8776 << "\n");
8777
8778 Builder.SetInsertPoint(StaleCI);
8779
8780 // Gather the arguments for emitting the runtime call.
8781 uint32_t SrcLocStrSize;
8782 Constant *SrcLocStr =
8783 getOrCreateSrcLocStr(LocationDescription(Builder), SrcLocStrSize);
8784 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8785
8786 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
8787 //
8788 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
8789 // the DeviceID to the deferred task, and also because
8790 // @__kmpc_omp_target_task_alloc creates an untied/async task.
8791 Function *TaskAllocFn =
8792 !NeedsTargetTask
8793 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
8794 : getOrCreateRuntimeFunctionPtr(
8795 OMPRTL___kmpc_omp_target_task_alloc);
8796
8797 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID) for the task alloc
8798 // call.
8799 Value *ThreadID = getOrCreateThreadID(Ident);
8800
8801 // Argument - `sizeof_kmp_task_t` (TaskSize)
8802 // TaskSize refers to the size in bytes of the kmp_task_t data structure
8803 // plus any other data to be passed to the target task, if any, which
8804 // is packed into a struct. kmp_task_t and the struct so created are
8805 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
8806 Value *TaskSize = Builder.getInt64(
8807 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
8808
8809 // Argument - `sizeof_shareds` (SharedsSize)
8810 // SharedsSize refers to the shareds array size in the kmp_task_t data
8811 // structure.
8812 Value *SharedsSize = Builder.getInt64(0);
8813 if (HasShareds) {
8814 auto *ArgStructAlloca =
8815 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
8816 assert(ArgStructAlloca &&
8817 "Unable to find the alloca instruction corresponding to arguments "
8818 "for extracted function");
8819 auto *ArgStructType =
8820 dyn_cast<StructType>(ArgStructAlloca->getAllocatedType());
8821 assert(ArgStructType && "Unable to find struct type corresponding to "
8822 "arguments for extracted function");
8823 SharedsSize =
8824 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ArgStructType));
8825 }
8826
8827 // Argument - `flags`
8828 // Task is tied iff (Flags & 1) == 1.
8829 // Task is untied iff (Flags & 1) == 0.
8830 // Task is final iff (Flags & 2) == 2.
8831 // Task is not final iff (Flags & 2) == 0.
8832 // A target task is not final and is untied.
8833 Value *Flags = Builder.getInt32(0);
8834
8835 // Emit the @__kmpc_omp_task_alloc runtime call
8836 // The runtime call returns a pointer to an area where the task captured
8837 // variables must be copied before the task is run (TaskData).
8838 CallInst *TaskData = nullptr;
8839
8840 SmallVector<llvm::Value *> TaskAllocArgs = {
8841 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
8842 /*flags=*/Flags,
8843 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
8844 /*task_func=*/ProxyFn};
8845
8846 if (NeedsTargetTask) {
8847 assert(DeviceID && "Expected non-empty device ID.");
8848 TaskAllocArgs.push_back(DeviceID);
8849 }
8850
8851 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
8852
8853 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
8854 if (HasShareds) {
8855 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
8856 Value *TaskShareds = loadSharedDataFromTaskDescriptor(
8857 *this, Builder, TaskData, TaskWithPrivatesTy);
8858 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
8859 SharedsSize);
8860 }
8861 if (!OffloadingArraysToPrivatize.empty()) {
8862 Value *Privates =
8863 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
8864 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
8865 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
8866 [[maybe_unused]] Type *ArrayType =
8867 getOffloadingArrayType(PtrToPrivatize);
8868 assert(ArrayType && "ArrayType cannot be nullptr");
8869
8870 Type *ElementType = PrivatesTy->getElementType(i);
8871 assert(ElementType == ArrayType &&
8872 "ElementType should match ArrayType");
8873 (void)ArrayType;
8874
8875 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
8876 Builder.CreateMemCpy(
8877 Dst, Alignment, PtrToPrivatize, Alignment,
8878 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
8879 }
8880 }
8881
8882 Value *DepArray = emitTaskDependencies(*this, Dependencies);
8883
8884 // ---------------------------------------------------------------
8885 // V5.2 13.8 target construct
8886 // If the nowait clause is present, execution of the target task
8887 // may be deferred. If the nowait clause is not present, the target task is
8888 // an included task.
8889 // ---------------------------------------------------------------
8890 // The above means that the lack of a nowait on the target construct
8891 // translates to '#pragma omp task if(0)'
8892 if (!NeedsTargetTask) {
8893 if (DepArray) {
8894 Function *TaskWaitFn =
8895 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
8896 createRuntimeFunctionCall(
8897 TaskWaitFn,
8898 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
8899 /*ndeps=*/Builder.getInt32(Dependencies.size()),
8900 /*dep_list=*/DepArray,
8901 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
8902 /*noalias_dep_list=*/
8903 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8904 }
8905 // Included task.
8906 Function *TaskBeginFn =
8907 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
8908 Function *TaskCompleteFn =
8909 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
8910 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
8911 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
8912 CI->setDebugLoc(StaleCI->getDebugLoc());
8913 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
8914 } else if (DepArray) {
8915 // HasNoWait - meaning the task may be deferred. Call
8916 // __kmpc_omp_task_with_deps if there are dependencies,
8917 // else call __kmpc_omp_task
8918 Function *TaskFn =
8919 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
8920 createRuntimeFunctionCall(
8921 TaskFn,
8922 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
8923 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
8924 ConstantPointerNull::get(PointerType::getUnqual(M.getContext()))});
8925 } else {
8926 // Emit the @__kmpc_omp_task runtime call to spawn the task
8927 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
8928 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
8929 }
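// To illustrate the branches above, the emitted call sequences look roughly
// like the following sketches (runtime signatures abbreviated):
//
//   ; no nowait: included task, executed immediately
//   call void @__kmpc_omp_task_begin_if0(ptr @ident, i32 %tid, ptr %task)
//   call void @.omp_target_task_proxy_func(i32 %tid, ptr %task)
//   call void @__kmpc_omp_task_complete_if0(ptr @ident, i32 %tid, ptr %task)
//
//   ; nowait with depend clauses: deferrable task
//   call i32 @__kmpc_omp_task_with_deps(ptr @ident, i32 %tid, ptr %task,
//                                       i32 %ndeps, ptr %dep.arr, i32 0,
//                                       ptr null)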
8930
8931 StaleCI->eraseFromParent();
8932 for (Instruction *I : llvm::reverse(ToBeDeleted))
8933 I->eraseFromParent();
8934 };
8935 addOutlineInfo(std::move(OI));
8936
8937 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
8938 << *(Builder.GetInsertBlock()) << "\n");
8939 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
8940 << *(Builder.GetInsertBlock()->getParent()->getParent())
8941 << "\n");
8942 return Builder.saveIP();
8943}
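// As a usage sketch, the code above is what turns a construct such as
//
//   #pragma omp target depend(in : x) nowait
//   { /* offloaded region */ }
//
// into a deferrable target task: the task's proxy entry invokes the
// outlined kernel-launch function, while the same construct without
// `nowait` degenerates to the included-task (`task if(0)`) sequence.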
8944
8945Error OpenMPIRBuilder::emitOffloadingArraysAndArgs(
8946 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
8947 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
8948 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
8949 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
8950 if (Error Err =
8951 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
8952 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
8953 return Err;
8954 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
8955 return Error::success();
8956}
8957
8958static void emitTargetCall(
8959 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8960 OpenMPIRBuilder::InsertPointTy AllocaIP,
8961 OpenMPIRBuilder::TargetDataInfo &Info,
8962 const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs,
8963 const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs,
8964 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
8965 SmallVectorImpl<Value *> &Args,
8966 OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB,
8967 OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB,
8968 SmallVector<llvm::OpenMPIRBuilder::DependData> Dependencies,
8969 bool HasNoWait, Value *DynCGroupMem,
8970 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
8971 // Generate a function call to the host fallback implementation of the target
8972 // region. This is called by the host when no offload entry was generated for
8973 // the target region and when the offloading call fails at runtime.
8974 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
8975 -> OpenMPIRBuilder::InsertPointOrErrorTy {
8976 Builder.restoreIP(IP);
8977 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, Args);
8978 return Builder.saveIP();
8979 };
8980
8981 bool HasDependencies = Dependencies.size() > 0;
8982 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
8983
8984 OpenMPIRBuilder::TargetKernelArgs KArgs;
8985
8986 auto TaskBodyCB =
8987 [&](Value *DeviceID, Value *RTLoc,
8988 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
8989 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
8990 // produce any.
8991 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
8992 // emitKernelLaunch makes the necessary runtime call to offload the
8993 // kernel. We then outline all that code into a separate function
8994 // ('kernel_launch_function' in the pseudo code above). This function is
8995 // then called by the target task proxy function (see
8996 // '@.omp_target_task_proxy_func' in the pseudo code above)
8997 // "@.omp_target_task_proxy_func' is generated by
8998 // emitTargetTaskProxyFunction.
8999 if (OutlinedFnID && DeviceID)
9000 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9001 EmitTargetCallFallbackCB, KArgs,
9002 DeviceID, RTLoc, TargetTaskAllocaIP);
9003
9004 // We only need to do the outlining if `DeviceID` is set to avoid calling
9005 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9006 // generating the `else` branch of an `if` clause.
9007 //
9008 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9009 // In this case, we execute the host implementation directly.
9010 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9011 }());
9012
9013 OMPBuilder.Builder.restoreIP(AfterIP);
9014 return Error::success();
9015 };
9016
9017 auto &&EmitTargetCallElse =
9018 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9019 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
9020 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9021 // produce any.
9022 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9023 if (RequiresOuterTargetTask) {
9024 // Arguments that are intended to be directly forwarded to an
9025 // emitKernelLaunch call are passed as nullptr, since
9026 // OutlinedFnID=nullptr results in that call not being done.
9027 OpenMPIRBuilder::TargetDataRTArgs EmptyRTArgs;
9028 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9029 /*RTLoc=*/nullptr, AllocaIP,
9030 Dependencies, EmptyRTArgs, HasNoWait);
9031 }
9032 return EmitTargetCallFallbackCB(Builder.saveIP());
9033 }());
9034
9035 Builder.restoreIP(AfterIP);
9036 return Error::success();
9037 };
9038
9039 auto &&EmitTargetCallThen =
9040 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9041 OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
9042 Info.HasNoWait = HasNoWait;
9043 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9044 OpenMPIRBuilder::TargetDataRTArgs RTArgs;
9045 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9046 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9047 /*IsNonContiguous=*/true,
9048 /*ForEndCall=*/false))
9049 return Err;
9050
9051 SmallVector<Value *, 3> NumTeamsC;
9052 for (auto [DefaultVal, RuntimeVal] :
9053 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9054 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9055 : Builder.getInt32(DefaultVal));
9056
9057 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9058 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9059 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9060 if (Clause)
9061 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9062 /*isSigned=*/false);
9063 return Clause;
9064 };
9065 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9066 if (Clause)
9067 Result =
9068 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9069 Result, Clause)
9070 : Clause;
9071 };
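// Worked example for the two helpers above: with a thread_limit of 16 and
// num_threads(8) both given as runtime i32 values, NumThreads starts as 16
// and the combine step emits select(icmp ult(16, 8), 16, 8) == 8, i.e. the
// unsigned minimum; an absent clause leaves the running Result unchanged.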
9072
9073 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9074 // the NUM_THREADS clause is overridden by THREAD_LIMIT.
9075 SmallVector<Value *, 3> NumThreadsC;
9076 Value *MaxThreadsClause =
9077 RuntimeAttrs.TeamsThreadLimit.size() == 1
9078 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9079 : nullptr;
9080
9081 for (auto [TeamsVal, TargetVal] : zip_equal(
9082 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9083 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9084 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9085
9086 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9087 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9088
9089 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9090 }
9091
9092 unsigned NumTargetItems = Info.NumberOfPtrs;
9093 uint32_t SrcLocStrSize;
9094 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9095 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9096 llvm::omp::IdentFlag(0), 0);
9097
9098 Value *TripCount = RuntimeAttrs.LoopTripCount
9099 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9100 Builder.getInt64Ty(),
9101 /*isSigned=*/false)
9102 : Builder.getInt64(0);
9103
9104 // Request zero groupprivate bytes by default.
9105 if (!DynCGroupMem)
9106 DynCGroupMem = Builder.getInt32(0);
9107
9108 KArgs = OpenMPIRBuilder::TargetKernelArgs(
9109 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9110 HasNoWait, DynCGroupMemFallback);
9111
9112 // Assume no error was returned because TaskBodyCB and
9113 // EmitTargetCallFallbackCB don't produce any.
9114 OpenMPIRBuilder::InsertPointTy AfterIP = cantFail([&]() {
9115 // The presence of certain clauses on the target directive requires the
9116 // explicit generation of the target task.
9117 if (RequiresOuterTargetTask)
9118 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9119 RTLoc, AllocaIP, Dependencies,
9120 KArgs.RTArgs, Info.HasNoWait);
9121
9122 return OMPBuilder.emitKernelLaunch(
9123 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9124 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9125 }());
9126
9127 Builder.restoreIP(AfterIP);
9128 return Error::success();
9129 };
9130
9131 // If we don't have an ID for the target region, it means an offload entry
9132 // wasn't created. In this case we just run the host fallback directly and
9133 // ignore any potential 'if' clauses.
9134 if (!OutlinedFnID) {
9135 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9136 return;
9137 }
9138
9139 // If there's no 'if' clause, only generate the kernel launch code path.
9140 if (!IfCond) {
9141 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9142 return;
9143 }
9144
9145 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9146 EmitTargetCallElse, AllocaIP));
9147}
9148
9149OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget(
9150 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9151 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9152 TargetRegionEntryInfo &EntryInfo,
9153 const TargetKernelDefaultAttrs &DefaultAttrs,
9154 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9155 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9156 TargetBodyGenCallbackTy CBFunc,
9157 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
9158 CustomMapperCallbackTy CustomMapperCB,
9159 const SmallVector<DependData> &Dependencies, bool HasNowait,
9160 Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9161
9162 if (!updateToLocation(Loc))
9163 return InsertPointTy();
9164
9165 Builder.restoreIP(CodeGenIP);
9166
9167 Function *OutlinedFn;
9168 Constant *OutlinedFnID = nullptr;
9169 // The target region is outlined into its own function. The LLVM IR for
9170 // the target region itself is generated using the callbacks CBFunc
9171 // and ArgAccessorFuncCB.
9172 if (Error Err = emitTargetOutlinedFunction(
9173 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9174 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9175 return Err;
9176
9177 // If we are not on the target device, then we need to generate code
9178 // to make a remote call (offload) to the previously outlined function
9179 // that represents the target region. Do that now.
9180 if (!Config.isTargetDevice())
9181 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9182 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9183 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9184 DynCGroupMemFallback);
9185 return Builder.saveIP();
9186}
9187
9188std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9189 StringRef FirstSeparator,
9190 StringRef Separator) {
9191 SmallString<128> Buffer;
9192 llvm::raw_svector_ostream OS(Buffer);
9193 StringRef Sep = FirstSeparator;
9194 for (StringRef Part : Parts) {
9195 OS << Sep << Part;
9196 Sep = Separator;
9197 }
9198 return OS.str().str();
9199}
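// For example, getNameWithSeparators({"gomp_critical_user_foo", "var"},
// ".", ".") produces ".gomp_critical_user_foo.var": the first separator is
// prepended before the first part, and each later part is prefixed with the
// second separator (see getOMPCriticalRegionLock below for this exact use).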
9200
9201std::string
9202OpenMPIRBuilder::createPlatformSpecificName(ArrayRef<StringRef> Parts) const {
9203 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9204 Config.separator());
9205}
9206
9207GlobalVariable *OpenMPIRBuilder::getOrCreateInternalVariable(
9208 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9209 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9210 if (Elem.second) {
9211 assert(Elem.second->getValueType() == Ty &&
9212 "OMP internal variable has different type than requested");
9213 } else {
9214 // TODO: investigate the appropriate linkage type used for the global
9215 // variable for possibly changing that to internal or private, or maybe
9216 // create different versions of the function for different OMP internal
9217 // variables.
9218 const DataLayout &DL = M.getDataLayout();
9219 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9220 // default global AS is 1.
9221 // See double-target-call-with-declare-target.f90 and
9222 // declare-target-vars-in-target-region.f90 libomptarget
9223 // tests.
9224 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9225 : M.getTargetTriple().isAMDGPU()
9226 ? 0
9227 : DL.getDefaultGlobalsAddressSpace();
9228 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9229 ? GlobalValue::InternalLinkage
9230 : GlobalValue::CommonLinkage;
9231 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9232 Constant::getNullValue(Ty), Elem.first(),
9233 /*InsertBefore=*/nullptr,
9234 GlobalValue::NotThreadLocal, AddressSpaceVal);
9235 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9236 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9237 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9238 Elem.second = GV;
9239 }
9240
9241 return Elem.second;
9242}
9243
9244Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9245 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9246 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9247 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9248}
9249
9250Value *OpenMPIRBuilder::getSizeInBytes(Value *BasePtr) {
9251 LLVMContext &Ctx = Builder.getContext();
9252 Value *Null =
9253 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9254 Value *SizeGep =
9255 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9256 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9257 return SizePtrToInt;
9258}
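// The null-base GEP above is the classic "sizeof via GEP" idiom; as a
// sketch, it produces IR of the shape
//
//   %gep  = getelementptr ptr, ptr null, i32 1
//   %size = ptrtoint ptr %gep to i64
//
// where the GEP's element type is BasePtr's own pointer type, so with
// opaque pointers the folded result is the target's pointer size in bytes.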
9259
9260GlobalVariable *
9261OpenMPIRBuilder::createOffloadMaptypes(SmallVectorImpl<uint64_t> &Mappings,
9262 std::string VarName) {
9263 llvm::Constant *MaptypesArrayInit =
9264 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9265 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9266 M, MaptypesArrayInit->getType(),
9267 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9268 VarName);
9269 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9270 return MaptypesArrayGlobal;
9271}
9272
9273void OpenMPIRBuilder::createMapperAllocas(const LocationDescription &Loc,
9274 InsertPointTy AllocaIP,
9275 unsigned NumOperands,
9276 struct MapperAllocas &MapperAllocas) {
9277 if (!updateToLocation(Loc))
9278 return;
9279
9280 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9281 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9282 Builder.restoreIP(AllocaIP);
9283 AllocaInst *ArgsBase = Builder.CreateAlloca(
9284 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9285 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9286 ".offload_ptrs");
9287 AllocaInst *ArgSizes = Builder.CreateAlloca(
9288 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9289 Builder.restoreIP(Loc.IP);
9290 MapperAllocas.ArgsBase = ArgsBase;
9291 MapperAllocas.Args = Args;
9292 MapperAllocas.ArgSizes = ArgSizes;
9293}
9294
9295void OpenMPIRBuilder::emitMapperCall(const LocationDescription &Loc,
9296 Function *MapperFunc, Value *SrcLocInfo,
9297 Value *MaptypesArg, Value *MapnamesArg,
9298 struct MapperAllocas &MapperAllocas,
9299 int64_t DeviceID, unsigned NumOperands) {
9300 if (!updateToLocation(Loc))
9301 return;
9302
9303 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9304 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9305 Value *ArgsBaseGEP =
9306 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9307 {Builder.getInt32(0), Builder.getInt32(0)});
9308 Value *ArgsGEP =
9309 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9310 {Builder.getInt32(0), Builder.getInt32(0)});
9311 Value *ArgSizesGEP =
9312 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9313 {Builder.getInt32(0), Builder.getInt32(0)});
9314 Value *NullPtr =
9315 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9316 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9317 Builder.getInt32(NumOperands),
9318 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9319 MaptypesArg, MapnamesArg, NullPtr});
9320}
9321
9322void OpenMPIRBuilder::emitOffloadingArraysArgument(IRBuilderBase &Builder,
9323 TargetDataRTArgs &RTArgs,
9324 TargetDataInfo &Info,
9325 bool ForEndCall) {
9326 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9327 "expected region end call to runtime only when end call is separate");
9328 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9329 auto VoidPtrTy = UnqualPtrTy;
9330 auto VoidPtrPtrTy = UnqualPtrTy;
9331 auto Int64Ty = Type::getInt64Ty(M.getContext());
9332 auto Int64PtrTy = UnqualPtrTy;
9333
9334 if (!Info.NumberOfPtrs) {
9335 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9336 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9337 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9338 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9339 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9340 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9341 return;
9342 }
9343
9344 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9345 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9346 Info.RTArgs.BasePointersArray,
9347 /*Idx0=*/0, /*Idx1=*/0);
9348 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9349 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9350 /*Idx0=*/0,
9351 /*Idx1=*/0);
9352 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9353 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9354 /*Idx0=*/0, /*Idx1=*/0);
9355 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9356 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9357 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9358 : Info.RTArgs.MapTypesArray,
9359 /*Idx0=*/0,
9360 /*Idx1=*/0);
9361
9362 // Only emit the mapper information arrays if debug information is
9363 // requested.
9364 if (!Info.EmitDebug)
9365 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9366 else
9367 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9368 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9369 /*Idx0=*/0,
9370 /*Idx1=*/0);
9371 // If there is no user-defined mapper, set the mapper array to nullptr to
9372 // avoid an unnecessary data privatization
9373 if (!Info.HasMapper)
9374 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9375 else
9376 RTArgs.MappersArray =
9377 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9378}
9379
9380void OpenMPIRBuilder::emitNonContiguousDescriptor(InsertPointTy AllocaIP,
9381 InsertPointTy CodeGenIP,
9382 MapInfosTy &CombinedInfo,
9383 TargetDataInfo &Info) {
9384 MapInfosTy::StructNonContiguousInfo &NonContigInfo =
9385 CombinedInfo.NonContigInfo;
9386
9387 // Build an array of struct descriptor_dim and then assign it to
9388 // offload_args.
9389 //
9390 // struct descriptor_dim {
9391 // uint64_t offset;
9392 // uint64_t count;
9393 // uint64_t stride;
9394 // };
9395 Type *Int64Ty = Builder.getInt64Ty();
9396 StructType *DimTy = StructType::create(
9397 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9398 "struct.descriptor_dim");
9399
9400 enum { OffsetFD = 0, CountFD, StrideFD };
9401 // We need two index variables here since the size of "Dims" is the same
9402 // as the size of Components; however, the size of offset, count, and
9403 // stride equals the number of base declarations that are non-contiguous.
9404 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9405 // Skip emitting ir if dimension size is 1 since it cannot be
9406 // non-contiguous.
9407 if (NonContigInfo.Dims[I] == 1)
9408 continue;
9409 Builder.restoreIP(AllocaIP);
9410 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9411 AllocaInst *DimsAddr =
9412 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9413 Builder.restoreIP(CodeGenIP);
9414 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9415 unsigned RevIdx = EE - II - 1;
9416 Value *DimsLVal = Builder.CreateInBoundsGEP(
9417 DimsAddr->getAllocatedType(), DimsAddr,
9418 {Builder.getInt64(0), Builder.getInt64(II)});
9419 // Offset
9420 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9421 Builder.CreateAlignedStore(
9422 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9423 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9424 // Count
9425 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9426 Builder.CreateAlignedStore(
9427 NonContigInfo.Counts[L][RevIdx], CountLVal,
9428 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9429 // Stride
9430 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9431 Builder.CreateAlignedStore(
9432 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9433 M.getDataLayout().getPrefTypeAlign(StrideLVal->getType()));
9434 }
9435 // args[I] = &dims
9436 Builder.restoreIP(CodeGenIP);
9437 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9438 DimsAddr, Builder.getPtrTy());
9439 Value *P = Builder.CreateConstInBoundsGEP2_32(
9440 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9441 Info.RTArgs.PointersArray, 0, I);
9442 Builder.CreateAlignedStore(
9443 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9444 ++L;
9445 }
9446}
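// Illustrative example (values invented): for a non-contiguous section such
// as arr[0:2][0:3] of an int arr[4][5], one "dims" alloca would receive two
// descriptor_dim entries, stored innermost dimension first:
//
//   dims[0] = { offset 0, count 3, stride 4 }    ; elements within a row
//   dims[1] = { offset 0, count 2, stride 20 }   ; rows of 5 ints
//
// The actual offsets, counts, and strides are supplied by the front end in
// CombinedInfo.NonContigInfo.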
9447
9448void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9449 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9450 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9451 BasicBlock *ExitBB, bool IsInit) {
9452 StringRef Prefix = IsInit ? ".init" : ".del";
9453
9454 // Evaluate if this is an array section.
9455 BasicBlock *BodyBB = BasicBlock::Create(
9456 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9457 Value *IsArray =
9458 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9459 Value *DeleteBit = Builder.CreateAnd(
9460 MapType,
9461 Builder.getInt64(
9462 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9463 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9464 Value *DeleteCond;
9465 Value *Cond;
9466 if (IsInit) {
9467 // base != begin?
9468 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9469 // IsPtrAndObj?
9470 Value *PtrAndObjBit = Builder.CreateAnd(
9471 MapType,
9472 Builder.getInt64(
9473 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9474 OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ)));
9475 PtrAndObjBit = Builder.CreateIsNotNull(PtrAndObjBit);
9476 BaseIsBegin = Builder.CreateAnd(BaseIsBegin, PtrAndObjBit);
9477 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9478 DeleteCond = Builder.CreateIsNull(
9479 DeleteBit,
9480 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9481 } else {
9482 Cond = IsArray;
9483 DeleteCond = Builder.CreateIsNotNull(
9484 DeleteBit,
9485 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9486 }
9487 Cond = Builder.CreateAnd(Cond, DeleteCond);
9488 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9489
9490 emitBlock(BodyBB, MapperFn);
9491 // Get the array size by multiplying element size and element number (i.e., \p
9492 // Size).
9493 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9494 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9495 // memory allocation/deletion purpose only.
9496 Value *MapTypeArg = Builder.CreateAnd(
9497 MapType,
9498 Builder.getInt64(
9499 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9500 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9501 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9502 MapTypeArg = Builder.CreateOr(
9503 MapTypeArg,
9504 Builder.getInt64(
9505 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9506 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9507
9508 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9509 // data structure.
9510 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9511 ArraySize, MapTypeArg, MapName};
9512 createRuntimeFunctionCall(
9513 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9514 OffloadingArgs);
9515}
9516
9517Expected<Function *> OpenMPIRBuilder::emitUserDefinedMapper(
9518 function_ref<MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI,
9519 llvm::Value *BeginArg)>
9520 GenMapInfoCB,
9521 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9522 SmallVector<Type *> Params;
9523 Params.emplace_back(Builder.getPtrTy());
9524 Params.emplace_back(Builder.getPtrTy());
9525 Params.emplace_back(Builder.getPtrTy());
9526 Params.emplace_back(Builder.getInt64Ty());
9527 Params.emplace_back(Builder.getInt64Ty());
9528 Params.emplace_back(Builder.getPtrTy());
9529
9530 auto *FnTy =
9531 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9532
9533 SmallString<64> TyStr;
9534 raw_svector_ostream Out(TyStr);
9535 Function *MapperFn =
9536 Function::Create(FnTy, GlobalValue::InternalLinkage, FuncName, &M);
9537 MapperFn->addFnAttr(Attribute::NoInline);
9538 MapperFn->addFnAttr(Attribute::NoUnwind);
9539 MapperFn->addParamAttr(0, Attribute::NoUndef);
9540 MapperFn->addParamAttr(1, Attribute::NoUndef);
9541 MapperFn->addParamAttr(2, Attribute::NoUndef);
9542 MapperFn->addParamAttr(3, Attribute::NoUndef);
9543 MapperFn->addParamAttr(4, Attribute::NoUndef);
9544 MapperFn->addParamAttr(5, Attribute::NoUndef);
9545
9546 // Start the mapper function code generation.
9547 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9548 auto SavedIP = Builder.saveIP();
9549 Builder.SetInsertPoint(EntryBB);
9550
9551 Value *MapperHandle = MapperFn->getArg(0);
9552 Value *BaseIn = MapperFn->getArg(1);
9553 Value *BeginIn = MapperFn->getArg(2);
9554 Value *Size = MapperFn->getArg(3);
9555 Value *MapType = MapperFn->getArg(4);
9556 Value *MapName = MapperFn->getArg(5);
9557
9558 // Compute the starting and end addresses of array elements.
9559 // Prepare common arguments for array initiation and deletion.
9560 // Convert the size in bytes into the number of array elements.
9561 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9562 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9563 Value *PtrBegin = BeginIn;
9564 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9565
9566 // Emit array initiation if this is an array section and \p MapType indicates
9567 // that memory allocation is required.
9568 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9569 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9570 MapType, MapName, ElementSize, HeadBB,
9571 /*IsInit=*/true);
9572
9573 // Emit a for loop to iterate through the Size elements and map all of them.
9574
9575 // Emit the loop header block.
9576 emitBlock(HeadBB, MapperFn);
9577 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9578 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9579 // Evaluate whether the initial condition is satisfied.
9580 Value *IsEmpty =
9581 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9582 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9583
9584 // Emit the loop body block.
9585 emitBlock(BodyBB, MapperFn);
9586 BasicBlock *LastBB = BodyBB;
9587 PHINode *PtrPHI =
9588 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9589 PtrPHI->addIncoming(PtrBegin, HeadBB);
9590
9591 // Get map clause information. Fill up the arrays with all mapped variables.
9592 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9593 if (!Info)
9594 return Info.takeError();
9595
9596 // Call the runtime API __tgt_mapper_num_components to get the number of
9597 // pre-existing components.
9598 Value *OffloadingArgs[] = {MapperHandle};
9599 Value *PreviousSize = createRuntimeFunctionCall(
9600 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9601 OffloadingArgs);
9602 Value *ShiftedPreviousSize =
9603 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9604
9605 // Fill up the runtime mapper handle for all components.
9606 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9607 Value *CurBaseArg = Info->BasePointers[I];
9608 Value *CurBeginArg = Info->Pointers[I];
9609 Value *CurSizeArg = Info->Sizes[I];
9610 Value *CurNameArg = Info->Names.size()
9611 ? Info->Names[I]
9612 : Constant::getNullValue(Builder.getPtrTy());
9613
9614 // Extract the MEMBER_OF field from the map type.
9615 Value *OriMapType = Builder.getInt64(
9616 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9617 Info->Types[I]));
9618 Value *MemberMapType =
9619 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9620
9621 // Combine the map type inherited from user-defined mapper with that
9622 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9623 // bits of the \a MapType, which is the input argument of the mapper
9624 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9625 // bits of MemberMapType.
9626 // [OpenMP 5.0], 1.2.6. map-type decay.
9627 // | alloc | to | from | tofrom | release | delete
9628 // ----------------------------------------------------------
9629 // alloc | alloc | alloc | alloc | alloc | release | delete
9630 // to | alloc | to | alloc | to | release | delete
9631 // from | alloc | alloc | from | from | release | delete
9632 // tofrom | alloc | to | from | tofrom | release | delete
9633 Value *LeftToFrom = Builder.CreateAnd(
9634 MapType,
9635 Builder.getInt64(
9636 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9637 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9638 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9639 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9640 BasicBlock *AllocElseBB =
9641 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9642 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9643 BasicBlock *ToElseBB =
9644 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9645 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9646 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9647 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9648 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9649 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9650 emitBlock(AllocBB, MapperFn);
9651 Value *AllocMapType = Builder.CreateAnd(
9652 MemberMapType,
9653 Builder.getInt64(
9654 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9655 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9656 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9657 Builder.CreateBr(EndBB);
9658 emitBlock(AllocElseBB, MapperFn);
9659 Value *IsTo = Builder.CreateICmpEQ(
9660 LeftToFrom,
9661 Builder.getInt64(
9662 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9663 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9664 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9665 // In case of to, clear OMP_MAP_FROM.
9666 emitBlock(ToBB, MapperFn);
9667 Value *ToMapType = Builder.CreateAnd(
9668 MemberMapType,
9669 Builder.getInt64(
9670 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9671 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9672 Builder.CreateBr(EndBB);
9673 emitBlock(ToElseBB, MapperFn);
9674 Value *IsFrom = Builder.CreateICmpEQ(
9675 LeftToFrom,
9676 Builder.getInt64(
9677 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9678 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9679 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9680 // In case of from, clear OMP_MAP_TO.
9681 emitBlock(FromBB, MapperFn);
9682 Value *FromMapType = Builder.CreateAnd(
9683 MemberMapType,
9684 Builder.getInt64(
9685 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9686 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9687 // In case of tofrom, do nothing.
9688 emitBlock(EndBB, MapperFn);
9689 LastBB = EndBB;
9690 PHINode *CurMapType =
9691 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9692 CurMapType->addIncoming(AllocMapType, AllocBB);
9693 CurMapType->addIncoming(ToMapType, ToBB);
9694 CurMapType->addIncoming(FromMapType, FromBB);
9695 CurMapType->addIncoming(MemberMapType, ToElseBB);
9696
9697 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9698 CurSizeArg, CurMapType, CurNameArg};
9699
9700 auto ChildMapperFn = CustomMapperCB(I);
9701 if (!ChildMapperFn)
9702 return ChildMapperFn.takeError();
9703 if (*ChildMapperFn) {
9704 // Call the corresponding mapper function.
9705 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9706 ->setDoesNotThrow();
9707 } else {
9708 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9709 // data structure.
9710 createRuntimeFunctionCall(
9711 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9712 OffloadingArgs);
9713 }
9714 }
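// As a concrete instance of the decay table above: if this mapper is
// invoked with MapType == to while a member was declared tofrom,
// LeftToFrom equals OMP_MAP_TO, so the ToBB arm clears OMP_MAP_FROM from
// MemberMapType and the member decays to a `to`-only mapping.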
9715
9716 // Update the pointer to point to the next element that needs to be mapped,
9717 // and check whether we have mapped all elements.
9718 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9719 "omp.arraymap.next");
9720 PtrPHI->addIncoming(PtrNext, LastBB);
9721 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9722 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9723 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9724
9725 emitBlock(ExitBB, MapperFn);
9726 // Emit array deletion if this is an array section and \p MapType indicates
9727 // that deletion is required.
9728 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9729 MapType, MapName, ElementSize, DoneBB,
9730 /*IsInit=*/false);
9731
9732 // Emit the function exit block.
9733 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9734
9735 Builder.CreateRetVoid();
9736 Builder.restoreIP(SavedIP);
9737 return MapperFn;
9738}
9739
9740Error OpenMPIRBuilder::emitOffloadingArrays(
9741 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9742 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9743 bool IsNonContiguous,
9744 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9745
9746 // Reset the array information.
9747 Info.clearArrayInfo();
9748 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9749
9750 if (Info.NumberOfPtrs == 0)
9751 return Error::success();
9752
9753 Builder.restoreIP(AllocaIP);
9754 // Detect if we have any capture size requiring runtime evaluation of the
9755 // size so that a constant array could be eventually used.
9756 ArrayType *PointerArrayType =
9757 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9758
9759 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9760 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9761
9762 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9763 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9764 AllocaInst *MappersArray = Builder.CreateAlloca(
9765 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9766 Info.RTArgs.MappersArray = MappersArray;
9767
9768 // If we don't have any VLA types or other types that require runtime
9769 // evaluation, we can use a constant array for the map sizes, otherwise we
9770 // need to fill up the arrays as we do for the pointers.
9771 Type *Int64Ty = Builder.getInt64Ty();
9772 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9773 ConstantInt::get(Int64Ty, 0));
9774 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
9775 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
9776 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
9777 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
9778 if (IsNonContiguous &&
9779 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9780 CombinedInfo.Types[I] &
9781 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG))
9782 ConstSizes[I] =
9783 ConstantInt::get(Int64Ty, CombinedInfo.NonContigInfo.Dims[I]);
9784 else
9785 ConstSizes[I] = CI;
9786 continue;
9787 }
9788 }
9789 RuntimeSizes.set(I);
9790 }
9791
9792 if (RuntimeSizes.all()) {
9793 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9794 Info.RTArgs.SizesArray = Builder.CreateAlloca(
9795 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9796 restoreIPandDebugLoc(Builder, CodeGenIP);
9797 } else {
9798 auto *SizesArrayInit = ConstantArray::get(
9799 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
9800 std::string Name = createPlatformSpecificName({"offload_sizes"});
9801 auto *SizesArrayGbl =
9802 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
9803 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
9804 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
9805
9806 if (!RuntimeSizes.any()) {
9807 Info.RTArgs.SizesArray = SizesArrayGbl;
9808 } else {
9809 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9810 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
9811 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
9812 AllocaInst *Buffer = Builder.CreateAlloca(
9813 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
9814 Buffer->setAlignment(OffloadSizeAlign);
9815 restoreIPandDebugLoc(Builder, CodeGenIP);
9816 Builder.CreateMemCpy(
9817 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
9818 SizesArrayGbl, OffloadSizeAlign,
9819 Builder.getIntN(
9820 IndexSize,
9821 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
9822
9823 Info.RTArgs.SizesArray = Buffer;
9824 }
9825 restoreIPandDebugLoc(Builder, CodeGenIP);
9826 }
9827
9828 // The map types are always constant so we don't need to generate code to
9829 // fill arrays. Instead, we create an array constant.
9830 SmallVector<uint64_t, 4> Mapping;
9831 for (auto mapFlag : CombinedInfo.Types)
9832 Mapping.push_back(
9833 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9834 mapFlag));
9835 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
9836 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9837 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
9838
9839 // The information types are only built if provided.
9840 if (!CombinedInfo.Names.empty()) {
9841 auto *MapNamesArrayGbl = createOffloadMapnames(
9842 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
9843 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
9844 Info.EmitDebug = true;
9845 } else {
9846 Info.RTArgs.MapNamesArray =
9847 Constant::getNullValue(PointerType::getUnqual(Builder.getContext()));
9848 Info.EmitDebug = false;
9849 }
9850
9851 // If there's a present map type modifier, it must not be applied to the end
9852 // of a region, so generate a separate map type array in that case.
9853 if (Info.separateBeginEndCalls()) {
9854 bool EndMapTypesDiffer = false;
9855 for (uint64_t &Type : Mapping) {
9856 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9857 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
9858 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9859 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
9860 EndMapTypesDiffer = true;
9861 }
9862 }
9863 if (EndMapTypesDiffer) {
9864 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
9865 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
9866 }
9867 }
9868
9869 PointerType *PtrTy = Builder.getPtrTy();
9870 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
9871 Value *BPVal = CombinedInfo.BasePointers[I];
9872 Value *BP = Builder.CreateConstInBoundsGEP2_32(
9873 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
9874 0, I);
9875 Builder.CreateAlignedStore(BPVal, BP,
9876 M.getDataLayout().getPrefTypeAlign(PtrTy));
9877
9878 if (Info.requiresDevicePointerInfo()) {
9879 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
9880 CodeGenIP = Builder.saveIP();
9881 Builder.restoreIP(AllocaIP);
9882 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
9883 Builder.restoreIP(CodeGenIP);
9884 if (DeviceAddrCB)
9885 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
9886 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
9887 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
9888 if (DeviceAddrCB)
9889 DeviceAddrCB(I, BP);
9890 }
9891 }
9892
9893 Value *PVal = CombinedInfo.Pointers[I];
9894 Value *P = Builder.CreateConstInBoundsGEP2_32(
9895 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
9896 I);
9897 // TODO: Check that the alignment is correct.
9898 Builder.CreateAlignedStore(PVal, P,
9899 M.getDataLayout().getPrefTypeAlign(PtrTy));
9900
9901 if (RuntimeSizes.test(I)) {
9902 Value *S = Builder.CreateConstInBoundsGEP2_32(
9903 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9904 /*Idx0=*/0,
9905 /*Idx1=*/I);
9906 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
9907 Int64Ty,
9908 /*isSigned=*/true),
9909 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
9910 }
9911 // Fill up the mapper array.
9912 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
9913 Value *MFunc = ConstantPointerNull::get(PtrTy);
9914
9915 auto CustomMFunc = CustomMapperCB(I);
9916 if (!CustomMFunc)
9917 return CustomMFunc.takeError();
9918 if (*CustomMFunc)
9919 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
9920
9921 Value *MAddr = Builder.CreateInBoundsGEP(
9922 MappersArray->getAllocatedType(), MappersArray,
9923 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
9924 Builder.CreateAlignedStore(
9925 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
9926 }
9927
9928 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
9929 Info.NumberOfPtrs == 0)
9930 return Error::success();
9931 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
9932 return Error::success();
9933}
9934
9935void OpenMPIRBuilder::emitBranch(BasicBlock *Target) {
9936 BasicBlock *CurBB = Builder.GetInsertBlock();
9937
9938 if (!CurBB || CurBB->getTerminator()) {
9939 // If there is no insert point or the previous block is already
9940 // terminated, don't touch it.
9941 } else {
9942 // Otherwise, create a fall-through branch.
9943 Builder.CreateBr(Target);
9944 }
9945
9946 Builder.ClearInsertionPoint();
9947}
9948
9949void OpenMPIRBuilder::emitBlock(BasicBlock *BB, Function *CurFn,
9950 bool IsFinished) {
9951 BasicBlock *CurBB = Builder.GetInsertBlock();
9952
9953 // Fall out of the current block (if necessary).
9954 emitBranch(BB);
9955
9956 if (IsFinished && BB->use_empty()) {
9957 BB->eraseFromParent();
9958 return;
9959 }
9960
9961 // Place the block after the current block, if possible, or else at
9962 // the end of the function.
9963 if (CurBB && CurBB->getParent())
9964 CurFn->insert(std::next(CurBB->getIterator()), BB);
9965 else
9966 CurFn->insert(CurFn->end(), BB);
9967 Builder.SetInsertPoint(BB);
9968}
9969
9970Error OpenMPIRBuilder::emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen,
9971 BodyGenCallbackTy ElseGen,
9972 InsertPointTy AllocaIP) {
9973 // If the condition constant folds and can be elided, try to avoid emitting
9974 // the condition and the dead arm of the if/else.
9975 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
9976 auto CondConstant = CI->getSExtValue();
9977 if (CondConstant)
9978 return ThenGen(AllocaIP, Builder.saveIP());
9979
9980 return ElseGen(AllocaIP, Builder.saveIP());
9981 }
9982
9983 Function *CurFn = Builder.GetInsertBlock()->getParent();
9984
9985 // Otherwise, the condition did not fold, or we couldn't elide it. Just
9986 // emit the conditional branch.
9987 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
9988 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
9989 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
9990 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
9991 // Emit the 'then' code.
9992 emitBlock(ThenBlock, CurFn);
9993 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
9994 return Err;
9995 emitBranch(ContBlock);
9996 // Emit the 'else' code if present.
9997 // There is no need to emit line number for unconditional branch.
9998 emitBlock(ElseBlock, CurFn);
9999 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10000 return Err;
10001 // There is no need to emit line number for unconditional branch.
10002 emitBranch(ContBlock);
10003 // Emit the continuation block for code after the if.
10004 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10005 return Error::success();
10006}
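// A sketch of the control flow emitted for a non-constant condition:
//
//           +--> omp_if.then --+
//   CurBB --|                  +--> omp_if.end
//           +--> omp_if.else --+
//
// For a ConstantInt condition, only the live arm's callback runs and no
// extra blocks are created.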
10007
10008bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10009 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10012 "Unexpected Atomic Ordering.");
10013
10014 bool Flush = false;
10016
10017 switch (AK) {
10018 case Read:
10019 if (AO == AtomicOrdering::Acquire || AO == AtomicOrdering::AcquireRelease ||
10020 AO == AtomicOrdering::SequentiallyConsistent) {
10021 FlushAO = AtomicOrdering::Acquire;
10022 Flush = true;
10023 }
10024 break;
10025 case Write:
10026 case Compare:
10027 case Update:
10028 if (AO == AtomicOrdering::Release || AO == AtomicOrdering::AcquireRelease ||
10029 AO == AtomicOrdering::SequentiallyConsistent) {
10030 FlushAO = AtomicOrdering::Release;
10031 Flush = true;
10032 }
10033 break;
10034 case Capture:
10035 switch (AO) {
10036 case AtomicOrdering::Acquire:
10037 FlushAO = AtomicOrdering::Acquire;
10038 Flush = true;
10039 break;
10040 case AtomicOrdering::Release:
10041 FlushAO = AtomicOrdering::Release;
10042 Flush = true;
10043 break;
10044 case AtomicOrdering::AcquireRelease:
10045 case AtomicOrdering::SequentiallyConsistent:
10046 FlushAO = AtomicOrdering::AcquireRelease;
10047 Flush = true;
10048 break;
10049 default:
10050 // do nothing - leave silently.
10051 break;
10052 }
10053 }
10054
10055 if (Flush) {
10056 // The flush RT call does not yet take a memory_ordering argument; in
10057 // anticipation of that, this resolves which atomic ordering to use, but
10058 // issues the flush call without it.
10059 // TODO: pass `FlushAO` once memory ordering support is added
10060 (void)FlushAO;
10061 emitFlush(Loc);
10062 }
10063
10064 // For AO == AtomicOrdering::Monotonic and all other case combinations,
10065 // do nothing.
10066 return Flush;
10067}
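// For example, `#pragma omp atomic read seq_cst` takes the Read case above:
// FlushAO becomes Acquire and a flush is emitted, whereas a monotonic
// (relaxed) atomic leaves Flush == false and emits no runtime call.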
10068
10069OpenMPIRBuilder::InsertPointTy
10070OpenMPIRBuilder::createAtomicRead(const LocationDescription &Loc,
10071 AtomicOpValue &X, AtomicOpValue &V,
10072 AtomicOrdering AO, InsertPointTy AllocaIP) {
10073 if (!updateToLocation(Loc))
10074 return Loc.IP;
10075
10076 assert(X.Var->getType()->isPointerTy() &&
10077 "OMP Atomic expects a pointer to target memory");
10078 Type *XElemTy = X.ElemTy;
10079 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10080 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10081 "OMP atomic read expected a scalar type");
10082
10083 Value *XRead = nullptr;
10084
10085 if (XElemTy->isIntegerTy()) {
10086 LoadInst *XLD =
10087 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10088 XLD->setAtomic(AO);
10089 XRead = cast<Value>(XLD);
10090 } else if (XElemTy->isStructTy()) {
10091 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10092 // target does not support `atomicrmw` of the size of the struct
10093 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10094 OldVal->setAtomic(AO);
10095 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10096 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10097 OpenMPIRBuilder::AtomicInfo atomicInfo(
10098 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10099 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10100 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10101 XRead = AtomicLoadRes.first;
10102 OldVal->eraseFromParent();
10103 } else {
10104 // We need to perform atomic op as integer
10105 IntegerType *IntCastTy =
10106 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10107 LoadInst *XLoad =
10108 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10109 XLoad->setAtomic(AO);
10110 if (XElemTy->isFloatingPointTy()) {
10111 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10112 } else {
10113 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10114 }
10115 }
10116 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10117 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10118 return Builder.saveIP();
10119}
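// Illustrative IR for the floating-point path above (a sketch of a
// monotonic read of a float at %x into %v):
//
//   %ld  = load atomic i32, ptr %x monotonic, align 4
//   %val = bitcast i32 %ld to float
//   store float %val, ptr %v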
10120
10121OpenMPIRBuilder::InsertPointTy
10122OpenMPIRBuilder::createAtomicWrite(const LocationDescription &Loc,
10123 AtomicOpValue &X, Value *Expr,
10124 AtomicOrdering AO, InsertPointTy AllocaIP) {
10125 if (!updateToLocation(Loc))
10126 return Loc.IP;
10127
10128 assert(X.Var->getType()->isPointerTy() &&
10129 "OMP Atomic expects a pointer to target memory");
10130 Type *XElemTy = X.ElemTy;
10131 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10132 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10133 "OMP atomic write expected a scalar type");
10134
10135 if (XElemTy->isIntegerTy()) {
10136 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10137 XSt->setAtomic(AO);
10138 } else if (XElemTy->isStructTy()) {
10139 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10140 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10141 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10142 OpenMPIRBuilder::AtomicInfo atomicInfo(
10143 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10144 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10145 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10146 OldVal->eraseFromParent();
10147 } else {
10148 // We need to bitcast and perform atomic op as integers
10149 IntegerType *IntCastTy =
10150 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10151 Value *ExprCast =
10152 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10153 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10154 XSt->setAtomic(AO);
10155 }
10156
10157 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10158 return Builder.saveIP();
10159}
10160
10161OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicUpdate(
10162 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
10163 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10164 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10165 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10166 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10167 if (!updateToLocation(Loc))
10168 return Loc.IP;
10169
10170 LLVM_DEBUG({
10171 Type *XTy = X.Var->getType();
10172 assert(XTy->isPointerTy() &&
10173 "OMP Atomic expects a pointer to target memory");
10174 Type *XElemTy = X.ElemTy;
10175 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10176 XElemTy->isPointerTy()) &&
10177 "OMP atomic update expected a scalar type");
10178 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10179 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10180 "OpenMP atomic does not support LT or GT operations");
10181 });
10182
10183 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10184 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10185 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10186 if (!AtomicResult)
10187 return AtomicResult.takeError();
10188 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10189 return Builder.saveIP();
10190}
10191
10192// FIXME: Duplicating AtomicExpand
10193Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10194 AtomicRMWInst::BinOp RMWOp) {
10195 switch (RMWOp) {
10196 case AtomicRMWInst::Add:
10197 return Builder.CreateAdd(Src1, Src2);
10198 case AtomicRMWInst::Sub:
10199 return Builder.CreateSub(Src1, Src2);
10200 case AtomicRMWInst::And:
10201 return Builder.CreateAnd(Src1, Src2);
10202 case AtomicRMWInst::Nand:
10203 return Builder.CreateNot(Builder.CreateAnd(Src1, Src2));
10204 case AtomicRMWInst::Or:
10205 return Builder.CreateOr(Src1, Src2);
10206 case AtomicRMWInst::Xor:
10207 return Builder.CreateXor(Src1, Src2);
10208 case AtomicRMWInst::Xchg:
10209 case AtomicRMWInst::FAdd:
10210 case AtomicRMWInst::FSub:
10211 case AtomicRMWInst::BAD_BINOP:
10212 case AtomicRMWInst::Max:
10213 case AtomicRMWInst::Min:
10214 case AtomicRMWInst::UMax:
10215 case AtomicRMWInst::UMin:
10216 case AtomicRMWInst::FMax:
10217 case AtomicRMWInst::FMin:
10218 case AtomicRMWInst::FMaximum:
10219 case AtomicRMWInst::FMinimum:
10220 case AtomicRMWInst::UIncWrap:
10221 case AtomicRMWInst::UDecWrap:
10222 case AtomicRMWInst::USubCond:
10223 case AtomicRMWInst::USubSat:
10224 llvm_unreachable("Unsupported atomic update operation");
10225 }
10226 llvm_unreachable("Unsupported atomic update operation");
10227}
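// For example, for an update of the form `x = x - expr` the atomicrmw
// returns the value of x before the update, and this helper recomputes the
// captured new value:
//
//   %new = sub i32 %old, %expr
//
// Since `expr - x` cannot be expressed as an atomicrmw sub, only the
// IsXBinopExpr form of Sub takes the fast path in emitAtomicUpdate.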
10228
10229Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10230 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10231 AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10232 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10233 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10234 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10235 // or a complex datatype.
10236 bool emitRMWOp = false;
10237 switch (RMWOp) {
10238 case AtomicRMWInst::Add:
10239 case AtomicRMWInst::And:
10240 case AtomicRMWInst::Nand:
10241 case AtomicRMWInst::Or:
10242 case AtomicRMWInst::Xor:
10243 case AtomicRMWInst::Xchg:
10244 emitRMWOp = XElemTy;
10245 break;
10246 case AtomicRMWInst::Sub:
10247 emitRMWOp = (IsXBinopExpr && XElemTy);
10248 break;
10249 default:
10250 emitRMWOp = false;
10251 }
10252 emitRMWOp &= XElemTy->isIntegerTy();
10253
10254 std::pair<Value *, Value *> Res;
10255 if (emitRMWOp) {
10256 AtomicRMWInst *RMWInst =
10257 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10258 if (T.isAMDGPU()) {
10259 if (IsIgnoreDenormalMode)
10260 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10261 llvm::MDNode::get(Builder.getContext(), {}));
10262 if (!IsFineGrainedMemory)
10263 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10264 llvm::MDNode::get(Builder.getContext(), {}));
10265 if (!IsRemoteMemory)
10266 RMWInst->setMetadata("amdgpu.no.remote.memory",
10267 llvm::MDNode::get(Builder.getContext(), {}));
10268 }
10269 Res.first = RMWInst;
10270 // Res.second is not needed except in case of postfix captures. Generate it
10271 // anyway for consistency with the else part. It will be removed by any DCE
10272 // pass. AtomicRMWInst::Xchg does not have a corresponding instruction.
10273 if (RMWOp == AtomicRMWInst::Xchg)
10274 Res.second = Res.first;
10275 else
10276 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10277 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10278 XElemTy->isStructTy()) {
10279 LoadInst *OldVal =
10280 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10281 OldVal->setAtomic(AO);
10282 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10283 unsigned LoadSize =
10284 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10285
10286 OpenMPIRBuilder::AtomicInfo atomicInfo(
10287 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10288 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10289 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10290 BasicBlock *CurBB = Builder.GetInsertBlock();
10291 Instruction *CurBBTI = CurBB->getTerminator();
10292 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10293 BasicBlock *ExitBB =
10294 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10295 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10296 X->getName() + ".atomic.cont");
10297 ContBB->getTerminator()->eraseFromParent();
10298 Builder.restoreIP(AllocaIP);
10299 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10300 NewAtomicAddr->setName(X->getName() + "x.new.val");
10301 Builder.SetInsertPoint(ContBB);
10302 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10303 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10304 Value *OldExprVal = PHI;
10305 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10306 if (!CBResult)
10307 return CBResult.takeError();
10308 Value *Upd = *CBResult;
10309 Builder.CreateStore(Upd, NewAtomicAddr);
10310 AtomicOrdering Failure =
10311 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10312 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10313 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10314 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10315 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10316 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10317 OldVal->eraseFromParent();
10318 Res.first = OldExprVal;
10319 Res.second = Upd;
10320
10321 if (UnreachableInst *ExitTI =
10322 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10323 CurBBTI->eraseFromParent();
10324 Builder.SetInsertPoint(ExitBB);
10325 } else {
10326 Builder.SetInsertPoint(ExitTI);
10327 }
10328 } else {
10329 IntegerType *IntCastTy =
10330 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10331 LoadInst *OldVal =
10332 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10333 OldVal->setAtomic(AO);
10334 // CurBB
10335 // | /---\
10336 // ContBB |
10337 // | \---/
10338 // ExitBB
10339 BasicBlock *CurBB = Builder.GetInsertBlock();
10340 Instruction *CurBBTI = CurBB->getTerminator();
10341 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10342 BasicBlock *ExitBB =
10343 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10344 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10345 X->getName() + ".atomic.cont");
10346 ContBB->getTerminator()->eraseFromParent();
10347 Builder.restoreIP(AllocaIP);
10348 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10349 NewAtomicAddr->setName(X->getName() + "x.new.val");
10350 Builder.SetInsertPoint(ContBB);
10351 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10352 PHI->addIncoming(OldVal, CurBB);
10353 bool IsIntTy = XElemTy->isIntegerTy();
10354 Value *OldExprVal = PHI;
10355 if (!IsIntTy) {
10356 if (XElemTy->isFloatingPointTy()) {
10357 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10358 X->getName() + ".atomic.fltCast");
10359 } else {
10360 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10361 X->getName() + ".atomic.ptrCast");
10362 }
10363 }
10364
10365 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10366 if (!CBResult)
10367 return CBResult.takeError();
10368 Value *Upd = *CBResult;
10369 Builder.CreateStore(Upd, NewAtomicAddr);
10370 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10371 AtomicOrdering Failure =
10372 llvm::AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10373 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10374 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10375 Result->setVolatile(VolatileX);
10376 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10377 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10378 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10379 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10380
10381 Res.first = OldExprVal;
10382 Res.second = Upd;
10383
10384 // Set the insertion point in the exit block.
10385 if (UnreachableInst *ExitTI =
10386 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10387 CurBBTI->eraseFromParent();
10388 Builder.SetInsertPoint(ExitBB);
10389 } else {
10390 Builder.SetInsertPoint(ExitTI);
10391 }
10392 }
10393
10394 return Res;
10395}
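// Editorial note (not part of the source): for element types that fail the
// atomicrmw eligibility test above, the builder falls back to the
// compare-exchange retry loop just emitted. Assuming a 32-bit float update
// `x += e`, the generated IR is schematically (names illustrative only):
//
//   %old = load atomic i32, ptr %x <ordering>            ; x.atomic.load
//   br label %cont
// cont:                                                  ; x.atomic.cont
//   %phi = phi i32 [ %old, %entry ], [ %prev, %cont ]
//   ; bitcast %phi to float, apply UpdateOp, store result to x.new.val
//   %pair = cmpxchg ptr %x, i32 %phi, i32 %desired <ordering> <failure-ordering>
//   %prev = extractvalue { i32, i1 } %pair, 0
//   %ok = extractvalue { i32, i1 } %pair, 1
//   br i1 %ok, label %exit, label %cont                  ; x.atomic.exit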
10396
10397OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createAtomicCapture(
10398 const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X,
10399 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10400 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10401 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10402 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10403 if (!updateToLocation(Loc))
10404 return Loc.IP;
10405
10406 LLVM_DEBUG({
10407 Type *XTy = X.Var->getType();
10408 assert(XTy->isPointerTy() &&
10409 "OMP Atomic expects a pointer to target memory");
10410 Type *XElemTy = X.ElemTy;
10411 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10412 XElemTy->isPointerTy()) &&
10413 "OMP atomic capture expected a scalar type");
10414 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10415 "OpenMP atomic does not support LT or GT operations");
10416 });
10417
10418 // If UpdateExpr updates 'x' with an `expr` that is not based on 'x',
10419 // 'x' is simply atomically overwritten with 'expr'.
10420 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10421 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10422 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10423 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10424 if (!AtomicResult)
10425 return AtomicResult.takeError();
10426 Value *CapturedVal =
10427 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10428 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10429
10430 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10431 return Builder.saveIP();
10432}
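// Usage sketch (editorial; OMPB, Loc, AllocaIP, XPtr, VPtr and ExprVal are
// assumed to exist in the caller). Lowering
//   #pragma omp atomic capture
//   { v = x; x += e; }
// could look roughly like:
//
//   OpenMPIRBuilder::AtomicOpValue X, V;
//   X.Var = XPtr; X.ElemTy = Builder.getInt32Ty();
//   V.Var = VPtr; V.ElemTy = Builder.getInt32Ty();
//   auto UpdateOp = [&](Value *Old, IRBuilderBase &B) -> Expected<Value *> {
//     return B.CreateAdd(Old, ExprVal); // the non-atomic form of the update
//   };
//   OMPB.createAtomicCapture(Loc, AllocaIP, X, V, ExprVal,
//                            AtomicOrdering::Monotonic, AtomicRMWInst::Add,
//                            UpdateOp, /*UpdateExpr=*/true,
//                            /*IsPostfixUpdate=*/true, /*IsXBinopExpr=*/true,
//                            false, false, false);
//
// With IsPostfixUpdate == true, `v` receives the old value of `x`
// (AtomicResult->first); otherwise it receives the updated value.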
10433
10434OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
10435 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
10436 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
10437 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10438 bool IsFailOnly) {
10439
10440 AtomicOrdering Failure = AtomicCmpXchgInst::getStrongestFailureOrdering(AO);
10441 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10442 IsPostfixUpdate, IsFailOnly, Failure);
10443}
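// Note (editorial): the failure ordering derived above drops the release
// component of the success ordering, e.g. seq_cst -> seq_cst,
// acq_rel -> acquire, release -> monotonic; acquire and monotonic are
// unchanged.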
10444
10445OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createAtomicCompare(
10446 const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V,
10447 AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO,
10448 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10449 bool IsFailOnly, AtomicOrdering Failure) {
10450
10451 if (!updateToLocation(Loc))
10452 return Loc.IP;
10453
10454 assert(X.Var->getType()->isPointerTy() &&
10455 "OMP atomic expects a pointer to target memory");
10456 // compare capture
10457 if (V.Var) {
10458 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10459 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10460 }
10461
10462 bool IsInteger = E->getType()->isIntegerTy();
10463
10464 if (Op == OMPAtomicCompareOp::EQ) {
10465 AtomicCmpXchgInst *Result = nullptr;
10466 if (!IsInteger) {
10467 IntegerType *IntCastTy =
10468 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10469 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10470 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10471 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10472 AO, Failure);
10473 } else {
10474 Result =
10475 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10476 }
10477
10478 if (V.Var) {
10479 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10480 if (!IsInteger)
10481 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10482 assert(OldValue->getType() == V.ElemTy &&
10483 "OldValue and V must be of same type");
10484 if (IsPostfixUpdate) {
10485 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10486 } else {
10487 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10488 if (IsFailOnly) {
10489 // CurBB----
10490 // | |
10491 // v |
10492 // ContBB |
10493 // | |
10494 // v |
10495 // ExitBB <-
10496 //
10497 // where ContBB only contains the store of old value to 'v'.
10498 BasicBlock *CurBB = Builder.GetInsertBlock();
10499 Instruction *CurBBTI = CurBB->getTerminator();
10500 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10501 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10502 CurBBTI, X.Var->getName() + ".atomic.exit");
10503 BasicBlock *ContBB = CurBB->splitBasicBlock(
10504 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10505 ContBB->getTerminator()->eraseFromParent();
10506 CurBB->getTerminator()->eraseFromParent();
10507
10508 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10509
10510 Builder.SetInsertPoint(ContBB);
10511 Builder.CreateStore(OldValue, V.Var);
10512 Builder.CreateBr(ExitBB);
10513
10514 if (UnreachableInst *ExitTI =
10515 dyn_cast<UnreachableInst>(ExitBB->getTerminator())) {
10516 CurBBTI->eraseFromParent();
10517 Builder.SetInsertPoint(ExitBB);
10518 } else {
10519 Builder.SetInsertPoint(ExitTI);
10520 }
10521 } else {
10522 Value *CapturedValue =
10523 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10524 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10525 }
10526 }
10527 }
10528 // The comparison result has to be stored.
10529 if (R.Var) {
10530 assert(R.Var->getType()->isPointerTy() &&
10531 "r.var must be of pointer type");
10532 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10533
10534 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10535 Value *ResultCast = R.IsSigned
10536 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10537 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10538 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10539 }
10540 } else {
10541 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10542 "Op should be either max or min at this point");
10543 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10544
10545 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10546 // Let's take max as example.
10547 // OpenMP form:
10548 // x = x > expr ? expr : x;
10549 // LLVM form:
10550 // *ptr = *ptr > val ? *ptr : val;
10551 // We need to transform to LLVM form.
10552 // x = x <= expr ? x : expr;
10553 AtomicRMWInst::BinOp NewOp;
10554 if (IsXBinopExpr) {
10555 if (IsInteger) {
10556 if (X.IsSigned)
10557 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10558 : AtomicRMWInst::Max;
10559 else
10560 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10561 : AtomicRMWInst::UMax;
10562 } else {
10563 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10564 : AtomicRMWInst::FMax;
10565 }
10566 } else {
10567 if (IsInteger) {
10568 if (X.IsSigned)
10569 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10570 : AtomicRMWInst::Min;
10571 else
10572 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10573 : AtomicRMWInst::UMin;
10574 } else {
10575 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10576 : AtomicRMWInst::FMin;
10577 }
10578 }
10579
10580 AtomicRMWInst *OldValue =
10581 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10582 if (V.Var) {
10583 Value *CapturedValue = nullptr;
10584 if (IsPostfixUpdate) {
10585 CapturedValue = OldValue;
10586 } else {
10587 CmpInst::Predicate Pred;
10588 switch (NewOp) {
10589 case AtomicRMWInst::Max:
10590 Pred = CmpInst::ICMP_SGT;
10591 break;
10592 case AtomicRMWInst::UMax:
10593 Pred = CmpInst::ICMP_UGT;
10594 break;
10595 case AtomicRMWInst::FMax:
10596 Pred = CmpInst::FCMP_OGT;
10597 break;
10598 case AtomicRMWInst::Min:
10599 Pred = CmpInst::ICMP_SLT;
10600 break;
10601 case AtomicRMWInst::UMin:
10602 Pred = CmpInst::ICMP_ULT;
10603 break;
10604 case AtomicRMWInst::FMin:
10605 Pred = CmpInst::FCMP_OLT;
10606 break;
10607 default:
10608 llvm_unreachable("unexpected comparison op");
10609 }
10610 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10611 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10612 }
10613 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10614 }
10615 }
10616
10617 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10618
10619 return Builder.saveIP();
10620}
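// Worked example (editorial): for a signed 32-bit `x`, the OpenMP form
//   x = x > e ? e : x;   // Op == MAX, IsXBinopExpr == true
// keeps the smaller of the two values, so NewOp above is AtomicRMWInst::Min
// and the emitted instruction is schematically:
//
//   %old = atomicrmw min ptr %x, i32 %e <ordering>
//
// This is why the OpenMP "max" spelling maps to an LLVM "min" operation when
// `x` appears on the left-hand side of the comparison.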
10621
10622OpenMPIRBuilder::InsertPointOrErrorTy
10623OpenMPIRBuilder::createTeams(const LocationDescription &Loc,
10624 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10625 Value *NumTeamsUpper, Value *ThreadLimit,
10626 Value *IfExpr) {
10627 if (!updateToLocation(Loc))
10628 return InsertPointTy();
10629
10630 uint32_t SrcLocStrSize;
10631 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10632 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10633 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10634
10635 // The outer allocation basic block is the entry block of the current function.
10636 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10637 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10638 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10639 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10640 }
10641
10642 // The current basic block is split into four basic blocks. After outlining,
10643 // they will be mapped as follows:
10644 // ```
10645 // def current_fn() {
10646 // current_basic_block:
10647 // br label %teams.exit
10648 // teams.exit:
10649 // ; instructions after teams
10650 // }
10651 //
10652 // def outlined_fn() {
10653 // teams.alloca:
10654 // br label %teams.body
10655 // teams.body:
10656 // ; instructions within teams body
10657 // }
10658 // ```
10659 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10660 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10661 BasicBlock *AllocaBB =
10662 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10663
10664 bool SubClausesPresent =
10665 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10666 // Push num_teams
10667 if (!Config.isTargetDevice() && SubClausesPresent) {
10668 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10669 "if lowerbound is non-null, then upperbound must also be non-null "
10670 "for bounds on num_teams");
10671
10672 if (NumTeamsUpper == nullptr)
10673 NumTeamsUpper = Builder.getInt32(0);
10674
10675 if (NumTeamsLower == nullptr)
10676 NumTeamsLower = NumTeamsUpper;
10677
10678 if (IfExpr) {
10679 assert(IfExpr->getType()->isIntegerTy() &&
10680 "argument to if clause must be an integer value");
10681
10682 // upper = ifexpr ? upper : 1
10683 if (IfExpr->getType() != Int1)
10684 IfExpr = Builder.CreateICmpNE(IfExpr,
10685 ConstantInt::get(IfExpr->getType(), 0));
10686 NumTeamsUpper = Builder.CreateSelect(
10687 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10688
10689 // lower = ifexpr ? lower : 1
10690 NumTeamsLower = Builder.CreateSelect(
10691 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10692 }
10693
10694 if (ThreadLimit == nullptr)
10695 ThreadLimit = Builder.getInt32(0);
10696
10697 Value *ThreadNum = getOrCreateThreadID(Ident);
10698 Builder.CreateCall(
10699 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10700 {Ident, ThreadNum, NumTeamsLower, NumTeamsUpper, ThreadLimit});
10701 }
10702 // Generate the body of teams.
10703 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10704 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10705 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10706 return Err;
10707
10708 OutlineInfo OI;
10709 OI.EntryBB = AllocaBB;
10710 OI.ExitBB = ExitBB;
10711 OI.OuterAllocaBB = &OuterAllocaBB;
10712
10713 // Insert fake values for global tid and bound tid.
10714 SmallVector<Instruction *, 8> ToBeDeleted;
10715 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10716 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10717 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10718 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
10719 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10720
10721 auto HostPostOutlineCB = [this, Ident,
10722 ToBeDeleted](Function &OutlinedFn) mutable {
10723 // The stale call instruction will be replaced with a new call instruction
10724 // for runtime call with the outlined function.
10725
10726 assert(OutlinedFn.hasOneUse() &&
10727 "there must be a single user for the outlined function");
10728 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10729 ToBeDeleted.push_back(StaleCI);
10730
10731 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10732 "Outlined function must have two or three arguments only");
10733
10734 bool HasShared = OutlinedFn.arg_size() == 3;
10735
10736 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10737 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10738 if (HasShared)
10739 OutlinedFn.getArg(2)->setName("data");
10740
10741 // Call to the runtime function for teams in the current function.
10742 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10743 "outlined function.");
10744 Builder.SetInsertPoint(StaleCI);
10745 SmallVector<Value *> Args = {
10746 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10747 if (HasShared)
10748 Args.push_back(StaleCI->getArgOperand(2));
10750 Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
10751 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10752 Args);
10753
10754 for (Instruction *I : llvm::reverse(ToBeDeleted))
10755 I->eraseFromParent();
10756 };
10757
10758 if (!Config.isTargetDevice())
10759 OI.PostOutlineCB = HostPostOutlineCB;
10760
10761 addOutlineInfo(std::move(OI));
10762
10763 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10764
10765 return Builder.saveIP();
10766}
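// Editorial sketch of the host lowering produced above for a region like
//   #pragma omp teams num_teams(L:U) thread_limit(T)
// (values and the outlined function name are illustrative):
//
//   call void @__kmpc_push_num_teams_51(ptr @ident, i32 %gtid,
//                                       i32 %L, i32 %U, i32 %T)
//   call void @__kmpc_fork_teams(ptr @ident, i32 <#shared args>,
//                                ptr @teams_outlined_fn, ...)
//
// where @teams_outlined_fn is built by the outliner from the
// teams.alloca/teams.body blocks created here.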
10767
10768OpenMPIRBuilder::InsertPointOrErrorTy
10769OpenMPIRBuilder::createDistribute(const LocationDescription &Loc,
10770 InsertPointTy OuterAllocaIP,
10771 BodyGenCallbackTy BodyGenCB) {
10772 if (!updateToLocation(Loc))
10773 return InsertPointTy();
10774
10775 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
10776
10777 if (OuterAllocaBB == Builder.GetInsertBlock()) {
10778 BasicBlock *BodyBB =
10779 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
10780 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10781 }
10782 BasicBlock *ExitBB =
10783 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
10784 BasicBlock *BodyBB =
10785 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
10786 BasicBlock *AllocaBB =
10787 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
10788
10789 // Generate the body of the distribute region.
10790 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10791 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10792 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10793 return Err;
10794
10795 // When compiling for a target device we use different runtime functions,
10796 // which require a callback.
10797 if (Config.isTargetDevice()) {
10798 OutlineInfo OI;
10799 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
10800 OI.EntryBB = AllocaBB;
10801 OI.ExitBB = ExitBB;
10802
10803 addOutlineInfo(std::move(OI));
10804 }
10805 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
10806
10807 return Builder.saveIP();
10808}
10809
10810GlobalVariable *OpenMPIRBuilder::createOffloadMapnames(
10811 SmallVectorImpl<llvm::Constant *> &Names,
10812 std::string VarName) {
10813 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
10814 llvm::ArrayType::get(llvm::PointerType::getUnqual(M.getContext()),
10815 Names.size()),
10816 Names);
10817 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
10818 M, MapNamesArrayInit->getType(),
10819 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
10820 VarName);
10821 return MapNamesArrayGlobal;
10822}
10823
10824// Create all simple and struct types exposed by the runtime and remember
10825// their llvm::PointerTypes for easy access later.
10826void OpenMPIRBuilder::initializeTypes(Module &M) {
10827 LLVMContext &Ctx = M.getContext();
10828 StructType *T;
10829 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
10830 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
10831#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
10832#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
10833 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
10834 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
10835#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
10836 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
10837 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
10838#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
10839 T = StructType::getTypeByName(Ctx, StructName); \
10840 if (!T) \
10841 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
10842 VarName = T; \
10843 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
10844#include "llvm/Frontend/OpenMP/OMPKinds.def"
10845}
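// Editorial example of one expansion, assuming OMPKinds.def contains the
// usual ident_t entry (the exact field list may differ between versions):
//
//   OMP_STRUCT_TYPE(Ident, "struct.ident_t", false,
//                   Int32, Int32, Int32, Int32, Int8Ptr)
//
// expands, per the macro above, to:
//
//   T = StructType::getTypeByName(Ctx, "struct.ident_t");
//   if (!T)
//     T = StructType::create(Ctx, {Int32, Int32, Int32, Int32, Int8Ptr},
//                            "struct.ident_t", false);
//   Ident = T;
//   IdentPtr = PointerType::get(Ctx, DefaultTargetAS);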
10846
10847void OpenMPIRBuilder::OutlineInfo::collectBlocks(
10848 SmallPtrSetImpl<BasicBlock *> &BlockSet,
10849 SmallVectorImpl<BasicBlock *> &BlockVector) {
10850 SmallVector<BasicBlock *, 32> Worklist;
10851 BlockSet.insert(EntryBB);
10852 BlockSet.insert(ExitBB);
10853
10854 Worklist.push_back(EntryBB);
10855 while (!Worklist.empty()) {
10856 BasicBlock *BB = Worklist.pop_back_val();
10857 BlockVector.push_back(BB);
10858 for (BasicBlock *SuccBB : successors(BB))
10859 if (BlockSet.insert(SuccBB).second)
10860 Worklist.push_back(SuccBB);
10861 }
10862}
10863
10864void OpenMPIRBuilder::createOffloadEntry(Constant *ID, Constant *Addr,
10865 uint64_t Size, int32_t Flags,
10866 GlobalValue::LinkageTypes,
10867 StringRef Name) {
10868 if (!Config.isGPU()) {
10869 llvm::offloading::emitOffloadingEntry(
10870 M, object::OffloadKind::OFK_OpenMP, ID,
10871 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
10872 return;
10873 }
10874 // TODO: Add support for global variables on the device after declare target
10875 // support.
10876 Function *Fn = dyn_cast<Function>(Addr);
10877 if (!Fn)
10878 return;
10879
10880 // Add a function attribute for the kernel.
10881 Fn->addFnAttr("kernel");
10882 if (T.isAMDGCN())
10883 Fn->addFnAttr("uniform-work-group-size", "true");
10884 Fn->addFnAttr(Attribute::MustProgress);
10885}
10886
10887// We only generate metadata for functions that contain target regions.
10888void OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata(
10889 EmitMetadataErrorReportFunctionTy &ErrorFn) {
10890
10891 // If there are no entries, we don't need to do anything.
10892 if (OffloadInfoManager.empty())
10893 return;
10894
10895 LLVMContext &C = M.getContext();
10896 SmallVector<std::pair<const OffloadEntriesInfoManager::OffloadEntryInfo *,
10897 TargetRegionEntryInfo>,
10898 16>
10899 OrderedEntries(OffloadInfoManager.size());
10900
10901 // Auxiliary methods to create metadata values and strings.
10902 auto &&GetMDInt = [this](unsigned V) {
10903 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
10904 };
10905
10906 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
10907
10908 // Create the offloading info metadata node.
10909 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
10910 auto &&TargetRegionMetadataEmitter =
10911 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
10912 const TargetRegionEntryInfo &EntryInfo,
10913 const OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion &E) {
10914 // Generate metadata for target regions. Each entry of this metadata
10915 // contains:
10916 // - Entry 0 -> Kind of this type of metadata (0).
10917 // - Entry 1 -> Device ID of the file where the entry was identified.
10918 // - Entry 2 -> File ID of the file where the entry was identified.
10919 // - Entry 3 -> Mangled name of the function where the entry was
10920 // identified.
10921 // - Entry 4 -> Line in the file where the entry was identified.
10922 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
10923 // - Entry 6 -> Order the entry was created.
10924 // The first element of the metadata node is the kind.
10925 Metadata *Ops[] = {
10926 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
10927 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
10928 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
10929 GetMDInt(E.getOrder())};
10930
10931 // Save this entry in the right position of the ordered entries array.
10932 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
10933
10934 // Add metadata to the named metadata node.
10935 MD->addOperand(MDNode::get(C, Ops));
10936 };
10937
10938 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
10939
10940 // Create a function that emits metadata for each device global variable entry.
10941 auto &&DeviceGlobalVarMetadataEmitter =
10942 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
10943 StringRef MangledName,
10944 const OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar &E) {
10945 // Generate metadata for global variables. Each entry of this metadata
10946 // contains:
10947 // - Entry 0 -> Kind of this type of metadata (1).
10948 // - Entry 1 -> Mangled name of the variable.
10949 // - Entry 2 -> Declare target kind.
10950 // - Entry 3 -> Order the entry was created.
10951 // The first element of the metadata node is the kind.
10952 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
10953 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
10954
10955 // Save this entry in the right position of the ordered entries array.
10956 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
10957 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
10958
10959 // Add metadata to the named metadata node.
10960 MD->addOperand(MDNode::get(C, Ops));
10961 };
10962
10963 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
10964 DeviceGlobalVarMetadataEmitter);
10965
10966 for (const auto &E : OrderedEntries) {
10967 assert(E.first && "All ordered entries must exist!");
10968 if (const auto *CE =
10969 dyn_cast<OffloadEntriesInfoManager::OffloadEntryInfoTargetRegion>(
10970 E.first)) {
10971 if (!CE->getID() || !CE->getAddress()) {
10972 // Do not blame the entry if the parent function is not emitted.
10973 TargetRegionEntryInfo EntryInfo = E.second;
10974 StringRef FnName = EntryInfo.ParentName;
10975 if (!M.getNamedValue(FnName))
10976 continue;
10977 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
10978 continue;
10979 }
10980 createOffloadEntry(CE->getID(), CE->getAddress(),
10981 /*Size=*/0, CE->getFlags(),
10982 GlobalValue::WeakAnyLinkage);
10983 } else if (const auto *CE = dyn_cast<
10984 OffloadEntriesInfoManager::OffloadEntryInfoDeviceGlobalVar>(
10985 E.first)) {
10986 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags =
10987 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
10988 CE->getFlags());
10989 switch (Flags) {
10990 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter:
10991 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo:
10992 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
10993 continue;
10994 if (!CE->getAddress()) {
10995 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
10996 continue;
10997 }
10998 // The variable has no definition - no need to add the entry.
10999 if (CE->getVarSize() == 0)
11000 continue;
11001 break;
11002 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink:
11003 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11004 (!Config.isTargetDevice() && CE->getAddress())) &&
11005 "Declare target link address is set.");
11006 if (Config.isTargetDevice())
11007 continue;
11008 if (!CE->getAddress()) {
11009 ErrorFn(EMIT_MD_GLOBAL_VAR_LINK_ERROR, TargetRegionEntryInfo());
11010 continue;
11011 }
11012 break;
11013 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect:
11014 case OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable:
11015 if (!CE->getAddress()) {
11016 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11017 continue;
11018 }
11019 break;
11020 default:
11021 break;
11022 }
11023
11024 // Hidden or internal symbols on the device are not externally visible.
11025 // We should not attempt to register them by creating an offloading
11026 // entry. Indirect variables are handled separately on the device.
11027 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11028 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11029 (Flags !=
11030 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect &&
11031 Flags != OffloadEntriesInfoManager::
11032 OMPTargetGlobalVarEntryIndirectVTable))
11033 continue;
11034
11035 // Indirect globals need to use a special name that doesn't match the name
11036 // of the associated host global.
11037 if (Flags == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirect ||
11038 Flags ==
11039 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryIndirectVTable)
11040 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11041 Flags, CE->getLinkage(), CE->getVarName());
11042 else
11043 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11044 Flags, CE->getLinkage());
11045
11046 } else {
11047 llvm_unreachable("Unsupported entry kind.");
11048 }
11049 }
11050
11051 // Emit requires directive globals to a special entry so the runtime can
11052 // register them when the device image is loaded.
11053 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11054 // entries should be redesigned to better suit this use-case.
11055 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11056 offloading::emitOffloadingEntry(
11057 M, object::OffloadKind::OFK_OpenMP,
11058 Constant::getNullValue(PointerType::getUnqual(M.getContext())),
11059 ".requires", /*Size=*/0,
11060 OffloadEntriesInfoManager::OMPTargetGlobalRegisterRequires,
11061 Config.getRequiresFlags());
11062}
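// Editorial example of the metadata emitted above for a module with one
// target region and one declare-target variable (numeric values are
// illustrative only):
//
//   !omp_offload.info = !{!0, !1}
//   ; kind 0: device ID, file ID, parent name, line, count, order
//   !0 = !{i32 0, i32 46, i32 72338, !"_Z3foov", i32 12, i32 0, i32 0}
//   ; kind 1: mangled name, declare target kind, order
//   !1 = !{i32 1, !"arr", i32 0, i32 1}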
11063
11064void TargetRegionEntryInfo::getTargetRegionEntryFnName(
11065 SmallVectorImpl<char> &Name, StringRef ParentName, unsigned DeviceID,
11066 unsigned FileID, unsigned Line, unsigned Count) {
11067 raw_svector_ostream OS(Name);
11068 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11069 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11070 if (Count)
11071 OS << "_" << Count;
11072}
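// Example (editorial): for ParentName "foo", DeviceID 0x1d, FileID 0x2f and
// Line 42 this produces "__omp_offloading_1d_2f_foo_l42", assuming
// KernelNamePrefix is "__omp_offloading_". A nonzero Count appends "_<count>"
// to disambiguate multiple regions on the same line.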
11073
11074void OffloadEntriesInfoManager::getTargetRegionEntryFnName(
11075 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11076 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11077 TargetRegionEntryInfo::getTargetRegionEntryFnName(
11078 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11079 EntryInfo.Line, NewCount);
11080}
11081
11082TargetRegionEntryInfo OpenMPIRBuilder::getTargetEntryUniqueInfo(
11083 FileIdentifierInfoCallbackTy CallBack,
11084 vfs::FileSystem &VFS,
11085 StringRef ParentName) {
11086 sys::fs::UniqueID ID(0xdeadf17e, 0);
11087 auto FileIDInfo = CallBack();
11088 uint64_t FileID = 0;
11089 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11090 ID = Status->getUniqueID();
11091 FileID = Status->getUniqueID().getFile();
11092 } else {
11093 // If the inode ID could not be determined, create a hash value from the
11094 // current file name and use that as an ID.
11095 FileID = hash_value(std::get<0>(FileIDInfo));
11096 }
11097
11098 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11099 std::get<1>(FileIDInfo));
11100}
11101
11102unsigned OpenMPIRBuilder::getFlagMemberOffset() {
11103 unsigned Offset = 0;
11104 for (uint64_t Remain =
11105 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11106 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF);
11107 !(Remain & 1); Remain = Remain >> 1)
11108 Offset++;
11109 return Offset;
11110}
11111
11112omp::OpenMPOffloadMappingFlags
11113OpenMPIRBuilder::getMemberOfFlag(unsigned Position) {
11114 // Rotate by getFlagMemberOffset() bits.
11115 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11116 << getFlagMemberOffset());
11117}
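// Worked example (editorial): with the usual encoding, where
// OMP_MAP_MEMBER_OF occupies the upper 16 bits of the 64-bit flag word (its
// lowest set bit is bit 48), getFlagMemberOffset() returns 48 and
// getMemberOfFlag(Position) stores the 1-based index of the parent map entry,
// e.g. getMemberOfFlag(0) == 1ULL << 48.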
11118
11119void OpenMPIRBuilder::setCorrectMemberOfFlag(
11120 omp::OpenMPOffloadMappingFlags &Flags,
11121 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11122 // If the entry is PTR_AND_OBJ but has not been marked with the special
11123 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11124 // marked as MEMBER_OF.
11125 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11126 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_PTR_AND_OBJ) !=
11127 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11128 omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF))
11130 return;
11131
11132 // Entries with ATTACH are not members-of anything. They are handled
11133 // separately by the runtime after other maps have been handled.
11134 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11135 Flags & omp::OpenMPOffloadMappingFlags::OMP_MAP_ATTACH))
11136 return;
11137
11138 // Reset the placeholder value to prepare the flag for the assignment of the
11139 // proper MEMBER_OF value.
11140 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11141 Flags |= MemberOfFlag;
11142}
11143
11144Constant *OpenMPIRBuilder::getAddrOfDeclareTargetVar(
11145 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11146 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11147 bool IsDeclaration, bool IsExternallyVisible,
11148 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11149 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11150 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11151 std::function<Constant *()> GlobalInitializer,
11152 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11153 // TODO: convert this to utilise the IRBuilder Config rather than
11154 // a passed down argument.
11155 if (OpenMPSIMD)
11156 return nullptr;
11157
11158 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink ||
11159 ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
11160 CaptureClause ==
11161 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
11162 Config.hasRequiresUnifiedSharedMemory())) {
11163 SmallString<64> PtrName;
11164 {
11165 raw_svector_ostream OS(PtrName);
11166 OS << MangledName;
11167 if (!IsExternallyVisible)
11168 OS << format("_%x", EntryInfo.FileID);
11169 OS << "_decl_tgt_ref_ptr";
11170 }
11171
11172 Value *Ptr = M.getNamedValue(PtrName);
11173
11174 if (!Ptr) {
11175 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11176 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11177
11178 auto *GV = cast<GlobalVariable>(Ptr);
11179 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11180
11181 if (!Config.isTargetDevice()) {
11182 if (GlobalInitializer)
11183 GV->setInitializer(GlobalInitializer());
11184 else
11185 GV->setInitializer(GlobalValue);
11186 }
11187
11188 registerTargetGlobalVariable(
11189 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11190 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11191 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11192 }
11193
11194 return cast<Constant>(Ptr);
11195 }
11196
11197 return nullptr;
11198}
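// Example of the reference name generated above (editorial): a variable
// mangled as "x" that is not externally visible and lives in a file with
// FileID 0xabc yields "x_abc_decl_tgt_ref_ptr"; an externally visible one
// yields plain "x_decl_tgt_ref_ptr".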
11199
11200void OpenMPIRBuilder::registerTargetGlobalVariable(
11201 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
11202 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
11203 bool IsDeclaration, bool IsExternallyVisible,
11204 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11205 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11206 std::vector<Triple> TargetTriple,
11207 std::function<Constant *()> GlobalInitializer,
11208 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11209 Constant *Addr) {
11210 if (DeviceClause != OffloadEntriesInfoManager::OMPTargetDeviceClauseAny ||
11211 (TargetTriple.empty() && !Config.isTargetDevice()))
11212 return;
11213
11214 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind Flags;
11215 StringRef VarName;
11216 int64_t VarSize;
11217 GlobalValue::LinkageTypes Linkage;
11218
11219 if ((CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo ||
11220 CaptureClause ==
11221 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryEnter) &&
11222 !Config.hasRequiresUnifiedSharedMemory()) {
11223 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
11224 VarName = MangledName;
11225 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11226
11227 if (!IsDeclaration)
11228 VarSize = divideCeil(
11229 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11230 else
11231 VarSize = 0;
11232 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11233
11234 // This is a workaround carried over from Clang which prevents undesired
11235 // optimisation of internal variables.
11236 if (Config.isTargetDevice() &&
11237 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11238 // Do not create a "ref-variable" if the original is not also available
11239 // on the host.
11240 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11241 return;
11242
11243 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11244
11245 if (!M.getNamedValue(RefName)) {
11246 Constant *AddrRef =
11247 getOrCreateInternalVariable(Addr->getType(), RefName);
11248 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11249 GvAddrRef->setConstant(true);
11250 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11251 GvAddrRef->setInitializer(Addr);
11252 GeneratedRefs.push_back(GvAddrRef);
11253 }
11254 }
11255 } else {
11256 if (CaptureClause == OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink)
11257 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryLink;
11258 else
11259 Flags = OffloadEntriesInfoManager::OMPTargetGlobalVarEntryTo;
11260
11261 if (Config.isTargetDevice()) {
11262 VarName = (Addr) ? Addr->getName() : "";
11263 Addr = nullptr;
11264 } else {
11266 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11267 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11268 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11269 VarName = (Addr) ? Addr->getName() : "";
11270 }
11271 VarSize = M.getDataLayout().getPointerSize();
11272 Linkage = GlobalValue::WeakAnyLinkage;
11273 }
11274
11275 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11276 Flags, Linkage);
11277}
11278
11279/// Loads all the offload entries information from the host IR
11280/// metadata.
11281void OpenMPIRBuilder::loadOffloadInfoMetadata(Module &M) {
11282 // If we are in target mode, load the metadata from the host IR. This code has
11283 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11284
11285 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11286 if (!MD)
11287 return;
11288
11289 for (MDNode *MN : MD->operands()) {
11290 auto &&GetMDInt = [MN](unsigned Idx) {
11291 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11292 return cast<ConstantInt>(V->getValue())->getZExtValue();
11293 };
11294
11295 auto &&GetMDString = [MN](unsigned Idx) {
11296 auto *V = cast<MDString>(MN->getOperand(Idx));
11297 return V->getString();
11298 };
11299
11300 switch (GetMDInt(0)) {
11301 default:
11302 llvm_unreachable("Unexpected metadata!");
11303 break;
11304 case OffloadEntriesInfoManager::OffloadEntryInfo::
11305 OffloadingEntryInfoTargetRegion: {
11306 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11307 /*DeviceID=*/GetMDInt(1),
11308 /*FileID=*/GetMDInt(2),
11309 /*Line=*/GetMDInt(4),
11310 /*Count=*/GetMDInt(5));
11311 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11312 /*Order=*/GetMDInt(6));
11313 break;
11314 }
11315 case OffloadEntriesInfoManager::OffloadEntryInfo::
11316 OffloadingEntryInfoDeviceGlobalVar:
11317 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11318 /*MangledName=*/GetMDString(1),
11319 static_cast<OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind>(
11320 /*Flags=*/GetMDInt(2)),
11321 /*Order=*/GetMDInt(3));
11322 break;
11323 }
11324 }
11325}
11326
11327void OpenMPIRBuilder::loadOffloadInfoMetadata(vfs::FileSystem &VFS,
11328 StringRef HostFilePath) {
11329 if (HostFilePath.empty())
11330 return;
11331
11332 auto Buf = VFS.getBufferForFile(HostFilePath);
11333 if (std::error_code Err = Buf.getError()) {
11334 report_fatal_error(("error opening host file from host file path inside of "
11335 "OpenMPIRBuilder: " +
11336 Err.message())
11337 .c_str());
11338 }
11339
11340 LLVMContext Ctx;
11341 auto M = expectedToErrorOrAndEmitErrors(
11342 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11343 if (std::error_code Err = M.getError()) {
11344 report_fatal_error(
11345 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11346 .c_str());
11347 }
11348
11349 loadOffloadInfoMetadata(*M.get());
11350}
11351
11352//===----------------------------------------------------------------------===//
11353// OffloadEntriesInfoManager
11354//===----------------------------------------------------------------------===//
11355
11356bool OffloadEntriesInfoManager::empty() const {
11357 return OffloadEntriesTargetRegion.empty() &&
11358 OffloadEntriesDeviceGlobalVar.empty();
11359}
11360
11361unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11362 const TargetRegionEntryInfo &EntryInfo) const {
11363 auto It = OffloadEntriesTargetRegionCount.find(
11364 getTargetRegionEntryCountKey(EntryInfo));
11365 if (It == OffloadEntriesTargetRegionCount.end())
11366 return 0;
11367 return It->second;
11368}
11369
11370void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
11371 const TargetRegionEntryInfo &EntryInfo) {
11372 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
11373 EntryInfo.Count + 1;
11374}
11375
11376/// Initialize target region entry.
11377void OffloadEntriesInfoManager::initializeTargetRegionEntryInfo(
11378 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11379 OffloadEntriesTargetRegion[EntryInfo] =
11380 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11381 OMPTargetRegionEntryTargetRegion);
11382 ++OffloadingEntriesNum;
11383}
11384
11385void OffloadEntriesInfoManager::registerTargetRegionEntryInfo(
11386 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11387 OMPTargetRegionEntryKind Flags) {
11388 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11389
11390 // Update the EntryInfo with the next available count for this location.
11391 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11392
11393 // If we are emitting code for a target, the entry is already initialized,
11394 // only has to be registered.
11395 if (OMPBuilder->Config.isTargetDevice()) {
11396 // This could happen if the device compilation is invoked standalone.
11397 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11398 return;
11399 }
11400 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11401 Entry.setAddress(Addr);
11402 Entry.setID(ID);
11403 Entry.setFlags(Flags);
11404 } else {
11405 if (Flags == OMPTargetRegionEntryTargetRegion &&
11406 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11407 return;
11408 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11409 "Target region entry already registered!");
11410 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11411 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11412 ++OffloadingEntriesNum;
11413 }
11414 incrementTargetRegionEntryInfoCount(EntryInfo);
11415}
11416
11417bool OffloadEntriesInfoManager::hasTargetRegionEntryInfo(
11418 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11419
11420 // Update the EntryInfo with the next available count for this location.
11421 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11422
11423 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11424 if (It == OffloadEntriesTargetRegion.end()) {
11425 return false;
11426 }
11427 // Fail if this entry is already registered.
11428 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11429 return false;
11430 return true;
11431}
11432
11433void OffloadEntriesInfoManager::actOnTargetRegionEntriesInfo(
11434 const OffloadTargetRegionEntryInfoActTy &Action) {
11435 // Scan all target region entries and perform the provided action.
11436 for (const auto &It : OffloadEntriesTargetRegion) {
11437 Action(It.first, It.second);
11438 }
11439}
11440
11441void OffloadEntriesInfoManager::initializeDeviceGlobalVarEntryInfo(
11442 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11443 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11444 ++OffloadingEntriesNum;
11445}
11446
11447void OffloadEntriesInfoManager::registerDeviceGlobalVarEntryInfo(
11448 StringRef VarName, Constant *Addr, int64_t VarSize,
11449 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage) {
11450 if (OMPBuilder->Config.isTargetDevice()) {
11451 // This could happen if the device compilation is invoked standalone.
11452 if (!hasDeviceGlobalVarEntryInfo(VarName))
11453 return;
11454 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11455 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11456 if (Entry.getVarSize() == 0) {
11457 Entry.setVarSize(VarSize);
11458 Entry.setLinkage(Linkage);
11459 }
11460 return;
11461 }
11462 Entry.setVarSize(VarSize);
11463 Entry.setLinkage(Linkage);
11464 Entry.setAddress(Addr);
11465 } else {
11466 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11467 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11468 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11469 "Entry not initialized!");
11470 if (Entry.getVarSize() == 0) {
11471 Entry.setVarSize(VarSize);
11472 Entry.setLinkage(Linkage);
11473 }
11474 return;
11475 }
11476 if (Flags == OMPTargetGlobalVarEntryIndirect ||
11477 Flags ==
11478 OMPTargetGlobalVarEntryIndirectVTable)
11479 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11480 Addr, VarSize, Flags, Linkage,
11481 VarName.str());
11482 else
11483 OffloadEntriesDeviceGlobalVar.try_emplace(
11484 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
11485 ++OffloadingEntriesNum;
11486 }
11487}
11488
11489void OffloadEntriesInfoManager::actOnDeviceGlobalVarEntriesInfo(
11490 const OffloadDeviceGlobalVarEntryInfoActTy &Action) {
11491 // Scan all device global variable entries and perform the provided action.
11492 for (const auto &E : OffloadEntriesDeviceGlobalVar)
11493 Action(E.getKey(), E.getValue());
11494}
11495
11496//===----------------------------------------------------------------------===//
11497// CanonicalLoopInfo
11498//===----------------------------------------------------------------------===//
11499
11500void CanonicalLoopInfo::collectControlBlocks(
11501 SmallVectorImpl<BasicBlock *> &BBs) {
11502 // We only count those BBs as control blocks for which we do not need to
11503 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
11504 // flow. For consistency, this also means we do not add the Body block, which
11505 // is just the entry to the body code.
11506 BBs.reserve(BBs.size() + 6);
11507 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
11508}
11509
11510BasicBlock *CanonicalLoopInfo::getPreheader() const {
11511 assert(isValid() && "Requires a valid canonical loop");
11512 for (BasicBlock *Pred : predecessors(Header)) {
11513 if (Pred != Latch)
11514 return Pred;
11515 }
11516 llvm_unreachable("Missing preheader");
11517}
11518
11519void CanonicalLoopInfo::setTripCount(Value *TripCount) {
11520 assert(isValid() && "Requires a valid canonical loop");
11521
11522 Instruction *CmpI = &getCond()->front();
11523 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
11524 CmpI->setOperand(1, TripCount);
11525
11526#ifndef NDEBUG
11527 assertOK();
11528#endif
11529}
11530
11531void CanonicalLoopInfo::mapIndVar(
11532 llvm::function_ref<Value *(Instruction *)> Updater) {
11533 assert(isValid() && "Requires a valid canonical loop");
11534
11535 Instruction *OldIV = getIndVar();
11536
11537 // Record all uses excluding those introduced by the updater. Uses by the
11538 // CanonicalLoopInfo itself to keep track of the number of iterations are
11539 // excluded.
11540 SmallVector<Use *> ReplacableUses;
11541 for (Use &U : OldIV->uses()) {
11542 auto *User = dyn_cast<Instruction>(U.getUser());
11543 if (!User)
11544 continue;
11545 if (User->getParent() == getCond())
11546 continue;
11547 if (User->getParent() == getLatch())
11548 continue;
11549 ReplacableUses.push_back(&U);
11550 }
11551
11552 // Run the updater that may introduce new uses
11553 Value *NewIV = Updater(OldIV);
11554
11555 // Replace the old uses with the value returned by the updater.
11556 for (Use *U : ReplacableUses)
11557 U->set(NewIV);
11558
11559#ifndef NDEBUG
11560 assertOK();
11561#endif
11562}
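// Usage sketch (editorial; LB and Step are assumed to be values available in
// the preheader): transformations use mapIndVar to rebase the canonical
// 0..TripCount induction variable onto a user-visible IV such as lb + iv*step:
//
//   CLI->mapIndVar([&](Instruction *IV) -> Value * {
//     Builder.SetInsertPoint(CLI->getBody(),
//                            CLI->getBody()->getFirstInsertionPt());
//     return Builder.CreateAdd(LB, Builder.CreateMul(IV, Step));
//   });
//
// Uses of the old IV in the condition and latch blocks are deliberately kept,
// as the loop control itself must stay on the canonical counter.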
11563
11564void CanonicalLoopInfo::assertOK() const {
11565#ifndef NDEBUG
11566 // No constraints if this object currently does not describe a loop.
11567 if (!isValid())
11568 return;
11569
11570 BasicBlock *Preheader = getPreheader();
11571 BasicBlock *Body = getBody();
11572 BasicBlock *After = getAfter();
11573
11574 // Verify standard control-flow we use for OpenMP loops.
11575 assert(Preheader);
11576 assert(isa<BranchInst>(Preheader->getTerminator()) &&
11577 "Preheader must terminate with unconditional branch");
11578 assert(Preheader->getSingleSuccessor() == Header &&
11579 "Preheader must jump to header");
11580
11581 assert(Header);
11582 assert(isa<BranchInst>(Header->getTerminator()) &&
11583 "Header must terminate with unconditional branch");
11584 assert(Header->getSingleSuccessor() == Cond &&
11585 "Header must jump to exiting block");
11586
11587 assert(Cond);
11588 assert(Cond->getSinglePredecessor() == Header &&
11589 "Exiting block only reachable from header");
11590
11591 assert(isa<BranchInst>(Cond->getTerminator()) &&
11592 "Exiting block must terminate with conditional branch");
11593 assert(size(successors(Cond)) == 2 &&
11594 "Exiting block must have two successors");
11595 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
11596 "Exiting block's first successor must jump to the body");
11597 assert(cast<BranchInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
11598 "Exiting block's second successor must exit the loop");
11599
11600 assert(Body);
11601 assert(Body->getSinglePredecessor() == Cond &&
11602 "Body only reachable from exiting block");
11603 assert(!isa<PHINode>(Body->front()));
11604
11605 assert(Latch);
11606 assert(isa<BranchInst>(Latch->getTerminator()) &&
11607 "Latch must terminate with unconditional branch");
11608 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
11609 // TODO: To support simple redirecting of the end of the body code that has
11610 // multiple predecessors, introduce another auxiliary basic block like preheader and after.
11611 assert(Latch->getSinglePredecessor() != nullptr);
11612 assert(!isa<PHINode>(Latch->front()));
11613
11614 assert(Exit);
11615 assert(isa<BranchInst>(Exit->getTerminator()) &&
11616 "Exit block must terminate with unconditional branch");
11617 assert(Exit->getSingleSuccessor() == After &&
11618 "Exit block must jump to after block");
11619
11620 assert(After);
11621 assert(After->getSinglePredecessor() == Exit &&
11622 "After block only reachable from exit block");
11623 assert(After->empty() || !isa<PHINode>(After->front()));
11624
11625 Instruction *IndVar = getIndVar();
11626 assert(IndVar && "Canonical induction variable not found?");
11627 assert(isa<IntegerType>(IndVar->getType()) &&
11628 "Induction variable must be an integer");
11629 assert(cast<PHINode>(IndVar)->getParent() == Header &&
11630 "Induction variable must be a PHI in the loop header");
11631 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
11632 assert(
11633 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
11634 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
11635
11636 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
11637 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
11638 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
11639 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
11640 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
11641 ->isOne());
11642
11643 Value *TripCount = getTripCount();
11644 assert(TripCount && "Loop trip count not found?");
11645 assert(IndVar->getType() == TripCount->getType() &&
11646 "Trip count and induction variable must have the same type");
11647
11648 auto *CmpI = cast<CmpInst>(&Cond->front());
11649 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
11650 "Exit condition must be an unsigned less-than comparison");
11651 assert(CmpI->getOperand(0) == IndVar &&
11652 "Exit condition must compare the induction variable");
11653 assert(CmpI->getOperand(1) == TripCount &&
11654 "Exit condition must compare with the trip count");
11655#endif
11656}
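// Schematic of the control flow verified above (editorial):
//
//   preheader -> header -> cond --+--> body -> ... -> latch -> header
//                                 |
//                                 +--> exit -> after
//
// The induction variable is a PHI in header (0 from preheader, +1 from latch)
// and cond compares it against the trip count with icmp ult.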
11657
11658void CanonicalLoopInfo::invalidate() {
11659 Header = nullptr;
11660 Cond = nullptr;
11661 Latch = nullptr;
11662 Exit = nullptr;
11663}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
Analysis containing CSE Info
Definition CSEInfo.cpp:27
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
unsigned unsigned DefaultVal
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
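A minimal sketch of emitting one of these modifications through IRBuilder; Builder, Ptr, and Inc are assumed to be an existing IRBuilder<>, a pointer operand, and an i32 increment.
  // Emits 'atomicrmw add' and yields the previous value stored at Ptr.
  llvm::Value *Old = Builder.CreateAtomicRMW(
      llvm::AtomicRMWInst::Add, Ptr, Inc, llvm::MaybeAlign(),
      llvm::AtomicOrdering::Monotonic);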
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:361
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:472
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:459
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:475
bool empty() const
Definition BasicBlock.h:481
const Instruction & back() const
Definition BasicBlock.h:484
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="", bool Before=false)
Split the basic block into two basic blocks at the specified instruction.
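A minimal sketch of the split idiom, assuming a block BB and an instruction SplitPt inside it; the original block is terminated with an unconditional branch to the new tail.
  // Everything from SplitPt onward moves into the returned block.
  llvm::BasicBlock *Tail = BB->splitBasicBlock(SplitPt->getIterator(), "split.tail");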
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:482
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:477
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:386
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:233
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:662
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
Conditional or Unconditional Branch instruction.
static BranchInst * Create(BasicBlock *IfTrue, InsertPosition InsertBefore=nullptr)
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:536
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true)
This method constructs a CDS and initializes it with a text string.
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:720
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:568
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:283
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:164
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:640
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:166
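A minimal sketch of creating a function with these two factories, assuming an existing LLVMContext Ctx and Module M; the name "helper" is illustrative.
  llvm::FunctionType *FTy = llvm::FunctionType::get(
      llvm::Type::getVoidTy(Ctx), {llvm::Type::getInt32Ty(Ctx)}, /*isVarArg=*/false);
  llvm::Function *F = llvm::Function::Create(
      FTy, llvm::GlobalValue::InternalLinkage, "helper", &M);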
const BasicBlock & getEntryBlock() const
Definition Function.h:807
Argument * arg_iterator
Definition Function.h:72
bool empty() const
Definition Function.h:857
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:209
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:447
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:765
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:352
const Function & getFunction() const
Definition Function.h:164
iterator begin()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:866
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:355
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:668
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:753
size_t arg_size() const
Definition Function.h:899
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:214
iterator end()
Definition Function.h:853
void setCallingConv(CallingConv::ID CC)
Definition Function.h:274
Argument * getArg(unsigned i) const
Definition Function.h:884
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:602
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
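A minimal sketch of the save/restore idiom, assuming Builder and a block OtherBB; it emits elsewhere and then returns to the original position.
  llvm::IRBuilderBase::InsertPoint IP = Builder.saveIP();
  Builder.SetInsertPoint(OtherBB);
  // ... create instructions in OtherBB ...
  Builder.restoreIP(IP);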
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2794
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:318
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:969
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1078
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1577
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1440
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1569
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:608
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1757
iterator_range< op_iterator > operands()
Definition Metadata.h:1853
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a declare target 'to'.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
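A minimal sketch of obtaining an ident_t* for a runtime call, assuming an existing OpenMPIRBuilder named OMPBuilder.
  uint32_t SrcLocStrSize;
  llvm::Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
  llvm::Constant *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);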
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up its attributes. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state; this will put structure types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
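A minimal sketch of an atomic write, assuming OMPBuilder, a location Loc, an alloca insertion point AllocaIP, the variable address XAddr with element type ElemTy, and a value Expr.
  llvm::OpenMPIRBuilder::AtomicOpValue X{XAddr, ElemTy, /*IsSigned=*/false,
                                         /*IsVolatile=*/false};
  Builder.restoreIP(OMPBuilder.createAtomicWrite(
      Loc, X, Expr, llvm::AtomicOrdering::Monotonic, AllocaIP));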
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
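A minimal sketch of the callback-driven usage, assuming OMPBuilder, Loc, and AllocaIP; the callbacks are reduced to no-ops and perform no real privatization.
  auto BodyGenCB = [&](llvm::OpenMPIRBuilder::InsertPointTy AllocaIP,
                       llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP) -> llvm::Error {
    // Emit the parallel region body at CodeGenIP.
    return llvm::Error::success();
  };
  auto PrivCB = [&](llvm::OpenMPIRBuilder::InsertPointTy,
                    llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP, llvm::Value &,
                    llvm::Value &Inner, llvm::Value *&ReplVal)
      -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy {
    ReplVal = &Inner; // No privatization in this sketch.
    return CodeGenIP;
  };
  auto FiniCB = [&](llvm::OpenMPIRBuilder::InsertPointTy) -> llvm::Error {
    return llvm::Error::success();
  };
  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
      Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
      /*NumThreads=*/nullptr, llvm::omp::ProcBindKind::OMP_PROC_BIND_default,
      /*IsCancellable=*/false);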
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the alloca instructions used in calls to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
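A minimal sketch, assuming OMPBuilder, a DebugLoc DL, and a CanonicalLoopInfo Loop.
  llvm::CanonicalLoopInfo *Unrolled = nullptr;
  OMPBuilder.unrollLoopPartial(DL, Loop, /*Factor=*/4, &Unrolled);
  // Unrolled now describes the remaining outer loop, e.g. for worksharing.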
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
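A minimal sketch of tiling a perfect two-deep nest, assuming OMPBuilder, DL, and CanonicalLoopInfos Outer and Inner whose induction type is i32.
  llvm::Value *TS = Builder.getInt32(32);
  std::vector<llvm::CanonicalLoopInfo *> Tiled =
      OMPBuilder.tileLoops(DL, {Outer, Inner}, {TS, TS});
  // Tiled holds the two floor loops followed by the two tile loops.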
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive splits and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if a variable with the given name already exists) or creates an internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
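A minimal sketch of an explicit barrier, assuming OMPBuilder and Loc inside a function that can propagate llvm::Error.
  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.createBarrier(Loc, llvm::omp::Directive::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);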
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
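A minimal sketch, assuming OMPBuilder, DL, AllocaIP, and a finished CanonicalLoopInfo CLI; everything past NeedsBarrier uses the defaults, i.e. a static schedule.
  llvm::OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
      OMPBuilder.applyWorkshareLoop(DL, CLI, AllocaIP, /*NeedsBarrier=*/true);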
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr, or X = Expr BinOp X. For complex operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
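A minimal sketch, assuming OMPBuilder, Loc, and an integer trip count TripCount; the callback receives the body insertion point and the induction variable for each logical iteration.
  auto BodyGenCB = [&](llvm::OpenMPIRBuilder::InsertPointTy CodeGenIP,
                       llvm::Value *IV) -> llvm::Error {
    Builder.restoreIP(CodeGenIP);
    // ... emit the loop body using IV ...
    return llvm::Error::success();
  };
  llvm::Expected<llvm::CanonicalLoopInfo *> CLI =
      OMPBuilder.createCanonicalLoop(Loc, BodyGenCB, TripCount);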
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions whose constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write the bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Allocates a ScanInfo object and returns a pointer to it.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
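A minimal sketch of a loop-carried PHI, assuming Builder is positioned at the top of the header, plus blocks Preheader and Latch and a next-iteration value Next.
  llvm::PHINode *IV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues=*/2, "iv");
  IV->addIncoming(Builder.getInt32(0), Preheader);
  IV->addIncoming(Next, Latch);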
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop after the scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition StringMap.h:260
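A short sketch (illustrative, not from this file) of the lookup behavior described above, where a missing key yields a default-constructed value:
#include "llvm/ADT/StringMap.h"
#include <cassert>
void stringMapDemo() {
  llvm::StringMap<int> Counts;
  Counts["omp_parallel"] = 2;
  assert(Counts.lookup("omp_parallel") == 2);
  assert(Counts.lookup("missing") == 0); // default-constructed int
}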
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:225
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:143
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:146
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:453
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:273
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:618
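A small sketch (illustrative, not from this file) exercising the StringRef accessors listed above; StringRef never owns or copies the underlying bytes:
#include "llvm/ADT/StringRef.h"
#include <cassert>
void stringRefDemo() {
  llvm::StringRef Name("omp_outlined.1");
  assert(!Name.empty() && Name.size() == 14);
  assert(Name.ends_with(".1"));
  assert(Name.drop_back(2) == "omp_outlined"); // no allocation, just a view
  assert(Name.count('o') == 2);
}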
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:413
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:619
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
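A hypothetical helper (illustrative names; the case blocks are assumed to already exist in the current function) showing how a switch is built with IRBuilder::CreateSwitch and populated via addCase:
#include "llvm/IR/IRBuilder.h"
// Dispatch on a 32-bit discriminator value.
static void emitDispatch(llvm::IRBuilder<> &B, llvm::Value *Disc,
                         llvm::BasicBlock *DefaultBB,
                         llvm::BasicBlock *Case0, llvm::BasicBlock *Case1) {
  llvm::SwitchInst *SI = B.CreateSwitch(Disc, DefaultBB, /*NumCases=*/2);
  SI->addCase(B.getInt32(0), Case0); // Disc == 0 -> %case0
  SI->addCase(B.getInt32(1), Case1); // Disc == 1 -> %case1
}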
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1071
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1133
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1149
Twine - A lightweight data structure for efficiently representing the concatenation of temporary values as strings.
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
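A tiny sketch (illustrative, not from this file): Twine defers concatenation, so the result must be materialized with str() before the temporaries go out of scope:
#include "llvm/ADT/Twine.h"
#include <string>
std::string makeBlockName(unsigned Idx) {
  // Builds e.g. "omp.region.7" without intermediate allocations.
  return (llvm::Twine("omp.region.") + llvm::Twine(Idx)).str();
}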
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:267
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:280
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:261
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:293
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:184
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:240
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:139
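A brief sketch (illustrative, not from this file) of the Type factory methods and predicates listed above:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cassert>
void typeDemo() {
  llvm::LLVMContext Ctx;
  llvm::Type *I32 = llvm::Type::getInt32Ty(Ctx);
  assert(I32->isIntegerTy() && I32->getIntegerBitWidth() == 32);
  llvm::PointerType *Ptr = llvm::PointerType::getUnqual(I32);
  assert(Ptr->isPointerTy()); // default address space (zero)
  assert(llvm::Type::getVoidTy(Ctx)->isVoidTy());
}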
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:135
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:151
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:238
Value * getOperand(unsigned i) const
Definition User.h:233
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:963
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI void replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldReplace returns true for the given Use; otherwise leave the use alone.
Definition Value.cpp:561
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can have multiple uses of this value).
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:708
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to a SmallVector or SmallString.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:86
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped.
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is a member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp.h).
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:829
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
hash_code hash_value(const FixedPointSemantics &Val)
auto size(R &&Range, std::enable_if_t< std::is_base_of< std::random_access_iterator_tag, typename std::iterator_traits< decltype(Range.begin())>::iterator_category >::value, void > *=nullptr)
Get the size of a range.
Definition STLExtras.h:1667
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:839
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular function.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
Definition STLExtras.h:2530
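A minimal sketch (illustrative, not from this file) of enumerate with structured bindings:
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
void enumerateDemo() {
  llvm::SmallVector<llvm::StringRef, 3> Names = {"entry", "body", "exit"};
  for (auto [Idx, Name] : llvm::enumerate(Names))
    llvm::outs() << Idx << ": " << Name << "\n"; // 0: entry ... 2: exit
}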
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:367
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
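A short sketch (illustrative, not from this file) contrasting dyn_cast (checked, may return null) with isa (type test) and cast (unchecked, asserts on mismatch):
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"
// Return the pointer operand if V is a load or a store, else nullptr.
llvm::Value *getMemOperand(llvm::Value *V) {
  if (auto *LI = llvm::dyn_cast<llvm::LoadInst>(V)) // checked downcast
    return LI->getPointerOperand();
  if (llvm::isa<llvm::StoreInst>(V))                // type test only
    return llvm::cast<llvm::StoreInst>(V)->getPointerOperand();
  return nullptr;
}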
auto successors(const MachineBasicBlock *BB)
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case of optionals) value is accepted.
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2184
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
Definition STLExtras.h:632
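A classic use (illustrative, not from this file): erasing instructions while iterating a block, which would invalidate a plain range-for:
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
// Erase every store in BB; the early-inc adaptor advances the iterator
// before the current instruction is removed from under it.
void dropStores(llvm::BasicBlock &BB) {
  for (llvm::Instruction &I : llvm::make_early_inc_range(BB))
    if (llvm::isa<llvm::StoreInst>(I))
      I.eraseFromParent();
}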
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expression users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:406
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1751
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:167
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
LLVM_ABI bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound)
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ LLVM_MARK_AS_BITMASK_ENUM
Definition ModRef.h:37
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
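A tiny sketch (illustrative, not from this file) of divideCeil together with the Log2_32 helper listed earlier in this index:
#include "llvm/Support/MathExtras.h"
#include <cassert>
void mathDemo() {
  assert(llvm::divideCeil(10, 4) == 3); // ceil(10 / 4)
  assert(llvm::Log2_32(32) == 5);       // exact power of two
  assert(llvm::Log2_32(33) == 5);       // floor of log base 2
}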
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user-specified parameters.
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="", bool Before=false)
Split the specified block at the specified instruction.
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
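A one-liner sketch (illustrative, not from this file) of seq's half-open interval:
#include "llvm/ADT/Sequence.h"
#include "llvm/Support/raw_ostream.h"
void seqDemo() {
  for (int I : llvm::seq(0, 4)) // yields 0, 1, 2, 3; End is excluded
    llvm::outs() << I << " ";
}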
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0, the radix is auto-detected.
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack relevant information while generating atomic Ops.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename, line, column, ...).
This structure contains combined information generated for mappable clauses, including base pointers, pointers, sizes, map types, and user-defined mappers.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there are no separate map types for the region end.
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entire region.
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel attributes and populate associated static structures.
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with which the kernel must be launched.
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loop).
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set to UINT_MAX to disable).
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin), device RTL, and clang.