LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
65
66#include <cstdint>
67#include <optional>
68
69#define DEBUG_TYPE "openmp-ir-builder"
70
71using namespace llvm;
72using namespace omp;
73
// Command-line switch (hidden): when enabled, runtime-call declarations
// created by this builder receive attributes describing their "as-if"
// behavior optimistically. Defaults to off.
74static cl::opt<bool>
75 OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
76 cl::desc("Use optimistic attributes describing "
77 "'as-if' properties of runtime calls."),
78 cl::init(false));
79
// NOTE(review): the declaration line for this cl::opt (doxygen line 80,
// presumably "static cl::opt<double> UnrollThresholdFactor(") is missing
// from this excerpt — verify against upstream OMPIRBuilder.cpp.
// Scales the unroll threshold to account for later simplification passes.
81 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
82 cl::desc("Factor for the unroll threshold to account for code "
83 "simplifications still taking place"),
84 cl::init(1.5));
85
86#ifndef NDEBUG
87/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
88/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
89/// an InsertPoint stores the instruction before something is inserted. For
90/// instance, if both point to the same instruction, two IRBuilders alternating
91/// creating instruction will cause the instructions to be interleaved.
// NOTE(review): the function signature (doxygen lines 92-93) is missing from
// this excerpt; per the comment above, this NDEBUG-only helper reports
// whether two insert points alias — verify the signature upstream.
94 if (!IP1.isSet() || !IP2.isSet())
95 return false;
// Two set insert points are ambiguous iff they name the same position in
// the same block.
96 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
97}
98
// NOTE(review): the function signature (doxygen line 99) is missing from
// this excerpt; the body validates an OMPScheduleType value — verify the
// signature against upstream OMPIRBuilder.cpp.
100 // Valid ordered/unordered and base algorithm combinations.
// Mask off monotonicity bits first; they are validated separately below.
101 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
102 case OMPScheduleType::UnorderedStaticChunked:
103 case OMPScheduleType::UnorderedStatic:
104 case OMPScheduleType::UnorderedDynamicChunked:
105 case OMPScheduleType::UnorderedGuidedChunked:
106 case OMPScheduleType::UnorderedRuntime:
107 case OMPScheduleType::UnorderedAuto:
108 case OMPScheduleType::UnorderedTrapezoidal:
109 case OMPScheduleType::UnorderedGreedy:
110 case OMPScheduleType::UnorderedBalanced:
111 case OMPScheduleType::UnorderedGuidedIterativeChunked:
112 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
113 case OMPScheduleType::UnorderedSteal:
114 case OMPScheduleType::UnorderedStaticBalancedChunked:
115 case OMPScheduleType::UnorderedGuidedSimd:
116 case OMPScheduleType::UnorderedRuntimeSimd:
117 case OMPScheduleType::OrderedStaticChunked:
118 case OMPScheduleType::OrderedStatic:
119 case OMPScheduleType::OrderedDynamicChunked:
120 case OMPScheduleType::OrderedGuidedChunked:
121 case OMPScheduleType::OrderedRuntime:
122 case OMPScheduleType::OrderedAuto:
// "OrderdTrapezoidal" matches the enumerator's (misspelled) name; do not
// "fix" the spelling here without renaming the enum itself.
123 case OMPScheduleType::OrderdTrapezoidal:
124 case OMPScheduleType::NomergeUnorderedStaticChunked:
125 case OMPScheduleType::NomergeUnorderedStatic:
126 case OMPScheduleType::NomergeUnorderedDynamicChunked:
127 case OMPScheduleType::NomergeUnorderedGuidedChunked:
128 case OMPScheduleType::NomergeUnorderedRuntime:
129 case OMPScheduleType::NomergeUnorderedAuto:
130 case OMPScheduleType::NomergeUnorderedTrapezoidal:
131 case OMPScheduleType::NomergeUnorderedGreedy:
132 case OMPScheduleType::NomergeUnorderedBalanced:
133 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
134 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
135 case OMPScheduleType::NomergeUnorderedSteal:
136 case OMPScheduleType::NomergeOrderedStaticChunked:
137 case OMPScheduleType::NomergeOrderedStatic:
138 case OMPScheduleType::NomergeOrderedDynamicChunked:
139 case OMPScheduleType::NomergeOrderedGuidedChunked:
140 case OMPScheduleType::NomergeOrderedRuntime:
141 case OMPScheduleType::NomergeOrderedAuto:
142 case OMPScheduleType::NomergeOrderedTrapezoidal:
143 case OMPScheduleType::OrderedDistributeChunked:
144 case OMPScheduleType::OrderedDistribute:
145 break;
146 default:
// Any other base/ordering combination is not a valid schedule type.
147 return false;
148 }
149
150 // Must not set both monotonicity modifiers at the same time.
151 OMPScheduleType MonotonicityFlags =
152 SchedType & OMPScheduleType::MonotonicityMask;
// Both bits set equals the full mask: contradictory, hence invalid.
153 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
154 return false;
155
156 return true;
157}
158#endif
159
160/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
161/// debug location to the last instruction in the specified basic block if the
162/// insert point points to the end of the block.
// NOTE(review): the function signature (doxygen lines 163-164) is missing
// from this excerpt; per the comment above, this wraps
// IRBuilderBase::restoreIP — verify the signature upstream.
165 Builder.restoreIP(IP);
166 llvm::BasicBlock *BB = Builder.GetInsertBlock();
167 llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
// When the insert point is at the end of a non-empty block, inherit the
// last instruction's debug location so newly built IR is attributed to it.
168 if (!BB->empty() && I == BB->end())
169 Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
170}
171
172static bool hasGridValue(const Triple &T) {
173 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
174}
175
// Selects the per-target OpenMP grid-value table for \p T; for AMDGPU the
// choice depends on the kernel's wavefront-size target feature.
// NOTE(review): the return statements for the AMDGPU, NVPTX and SPIR-V
// branches (doxygen lines 181-182, 185, 187) are missing from this excerpt
// — this fragment is incomplete; verify against upstream OMPIRBuilder.cpp.
176static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
177 if (T.isAMDGPU()) {
178 StringRef Features =
179 Kernel->getFnAttribute("target-features").getValueAsString();
180 if (Features.count("+wavefrontsize64"))
183 }
184 if (T.isNVPTX())
186 if (T.isSPIRV())
// Reached only for targets hasGridValue() would reject.
188 llvm_unreachable("No grid value available for this architecture!");
189}
190
191/// Determine which scheduling algorithm to use, determined from schedule clause
192/// arguments.
193static OMPScheduleType
194getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
195 bool HasSimdModifier, bool HasDistScheduleChunks) {
196 // Currently, the default schedule is static.
197 switch (ClauseKind) {
198 case OMP_SCHEDULE_Default:
199 case OMP_SCHEDULE_Static:
// A chunk expression selects the chunked static variant.
200 return HasChunks ? OMPScheduleType::BaseStaticChunked
201 : OMPScheduleType::BaseStatic;
202 case OMP_SCHEDULE_Dynamic:
203 return OMPScheduleType::BaseDynamicChunked;
204 case OMP_SCHEDULE_Guided:
// The simd modifier selects the dedicated guided-simd variant.
205 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
206 : OMPScheduleType::BaseGuidedChunked;
207 case OMP_SCHEDULE_Auto:
// NOTE(review): the return for OMP_SCHEDULE_Auto (doxygen line 208,
// presumably "return OMPScheduleType::BaseAuto;") is missing from this
// excerpt; as shown the case would fall through — verify upstream.
209 case OMP_SCHEDULE_Runtime:
210 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
211 : OMPScheduleType::BaseRuntime;
212 case OMP_SCHEDULE_Distribute:
213 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
214 : OMPScheduleType::BaseDistribute;
215 }
216 llvm_unreachable("unhandled schedule clause argument");
217}
218
219/// Adds ordering modifier flags to schedule type.
220static OMPScheduleType
// NOTE(review): the line carrying the function name and first parameter
// (doxygen line 221) is missing from this excerpt — verify upstream.
222 bool HasOrderedClause) {
223 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
224 OMPScheduleType::None &&
225 "Must not have ordering nor monotonicity flags already set");
226
227 OMPScheduleType OrderingModifier = HasOrderedClause
228 ? OMPScheduleType::ModifierOrdered
229 : OMPScheduleType::ModifierUnordered;
230 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
231
232 // Unsupported combinations
// The simd variants have no ordered form; fall back to the closest
// non-simd ordered schedule instead of emitting an invalid combination.
233 if (OrderingScheduleType ==
234 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
235 return OMPScheduleType::OrderedGuidedChunked;
236 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
237 OMPScheduleType::ModifierOrdered))
238 return OMPScheduleType::OrderedRuntime;
239
240 return OrderingScheduleType;
241}
242
243/// Adds monotonicity modifier flags to schedule type.
244static OMPScheduleType
// NOTE(review): the line carrying the function name and first parameter
// (doxygen line 245) is missing from this excerpt — verify upstream.
246 bool HasSimdModifier, bool HasMonotonic,
247 bool HasNonmonotonic, bool HasOrderedClause) {
248 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
249 OMPScheduleType::None &&
250 "Must not have monotonicity flags already set");
251 assert((!HasMonotonic || !HasNonmonotonic) &&
252 "Monotonic and Nonmonotonic are contradicting each other");
253
254 if (HasMonotonic) {
255 return ScheduleType | OMPScheduleType::ModifierMonotonic;
256 } else if (HasNonmonotonic) {
257 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
258 } else {
259 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
260 // If the static schedule kind is specified or if the ordered clause is
261 // specified, and if the nonmonotonic modifier is not specified, the
262 // effect is as if the monotonic modifier is specified. Otherwise, unless
263 // the monotonic modifier is specified, the effect is as if the
264 // nonmonotonic modifier is specified.
265 OMPScheduleType BaseScheduleType =
266 ScheduleType & ~OMPScheduleType::ModifierMask;
267 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
268 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
269 HasOrderedClause) {
270 // The monotonic is used by default in openmp runtime library, so no need
271 // to set it.
272 return ScheduleType;
273 } else {
274 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
275 }
276 }
277}
278
279/// Determine the schedule type using schedule and ordering clause arguments.
// Combines the three helpers above: base algorithm, then ordering, then
// monotonicity flags.
280static OMPScheduleType
281computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
282 bool HasSimdModifier, bool HasMonotonicModifier,
283 bool HasNonmonotonicModifier, bool HasOrderedClause,
284 bool HasDistScheduleChunks) {
// NOTE(review): the declaration lines for BaseSchedule (doxygen line 285)
// and Result (line 289), and the validity assert (line 293), are missing
// from this excerpt — verify against upstream OMPIRBuilder.cpp.
286 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
287 OMPScheduleType OrderedSchedule =
288 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
290 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
291 HasNonmonotonicModifier, HasOrderedClause);
292
294 return Result;
295}
296
297/// Make \p Source branch to \p Target.
298///
299/// Handles two situations:
300/// * \p Source already has an unconditional branch.
301/// * \p Source is a degenerate block (no terminator because the BB is
302/// the current head of the IR construction).
// NOTE(review): the function signature (doxygen line 303) is missing from
// this excerpt; per the comment above, this makes Source branch to Target
// — verify the signature upstream.
304 if (Instruction *Term = Source->getTerminatorOrNull()) {
// Case 1: retarget an existing unconditional branch, unhooking Source
// from the old successor's PHI bookkeeping first.
305 auto *Br = cast<UncondBrInst>(Term);
306 BasicBlock *Succ = Br->getSuccessor();
307 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
308 Br->setSuccessor(Target);
309 return;
310 }
311
// Case 2: degenerate (unterminated) block — append a fresh branch.
312 auto *NewBr = UncondBrInst::Create(Target, Source);
313 NewBr->setDebugLoc(DL);
314}
315
// NOTE(review): the first signature line (doxygen line 316, presumably
// "void llvm::spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,")
// is missing from this excerpt — verify upstream.
317 bool CreateBranch, DebugLoc DL) {
318 assert(New->getFirstInsertionPt() == New->begin() &&
319 "Target BB must not have PHI nodes");
320
321 // Move instructions to new block.
322 BasicBlock *Old = IP.getBlock();
323 // If the `Old` block is empty then there are no instructions to move. But in
324 // the new debug scheme, it could have trailing debug records which will be
325 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
326 // reasons:
327 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
328 // 2. Even if `New` is not empty, the rationale to move those records to `New`
329 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
330 // assumes that `Old` is optimized out and is going away. This is not the case
331 // here. The `Old` block is still being used e.g. a branch instruction is
332 // added to it later in this function.
333 // So we call `BasicBlock::splice` only when `Old` is not empty.
334 if (!Old->empty())
335 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
336
337 if (CreateBranch) {
338 auto *NewBr = UncondBrInst::Create(New, Old);
339 NewBr->setDebugLoc(DL);
340 }
341}
342
343void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
344 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
345 BasicBlock *Old = Builder.GetInsertBlock();
346
347 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
348 if (CreateBranch)
349 Builder.SetInsertPoint(Old->getTerminator());
350 else
351 Builder.SetInsertPoint(Old);
352
353 // SetInsertPoint also updates the Builder's debug location, but we want to
354 // keep the one the Builder was configured to use.
355 Builder.SetCurrentDebugLocation(DebugLoc);
356}
357
// NOTE(review): the first signature line (doxygen line 358) and the
// "BasicBlock::Create(" call line (line 361) are missing from this excerpt
// — verify against upstream OMPIRBuilder.cpp.
359 DebugLoc DL, llvm::Twine Name) {
360 BasicBlock *Old = IP.getBlock();
// New block inherits Old's name unless an explicit name was given, and is
// placed immediately after Old in the function.
362 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
363 Old->getParent(), Old->getNextNode());
364 spliceBB(IP, New, CreateBranch, DL);
// PHIs in Old's former successors must now reference New instead.
365 New->replaceSuccessorsPhiUsesWith(Old, New);
366 return New;
367}
368
369BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
370 llvm::Twine Name) {
371 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
372 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
373 if (CreateBranch)
374 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
375 else
376 Builder.SetInsertPoint(Builder.GetInsertBlock());
377 // SetInsertPoint also updates the Builder's debug location, but we want to
378 // keep the one the Builder was configured to use.
379 Builder.SetCurrentDebugLocation(DebugLoc);
380 return New;
381}
382
383BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
384 llvm::Twine Name) {
385 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
386 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
387 if (CreateBranch)
388 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
389 else
390 Builder.SetInsertPoint(Builder.GetInsertBlock());
391 // SetInsertPoint also updates the Builder's debug location, but we want to
392 // keep the one the Builder was configured to use.
393 Builder.SetCurrentDebugLocation(DebugLoc);
394 return New;
395}
396
// NOTE(review): the first signature line (doxygen line 397, presumably
// "BasicBlock *llvm::splitBBWithSuffix(IRBuilderBase &Builder, bool
// CreateBranch,") is missing from this excerpt — verify upstream.
// Convenience wrapper: splits at the insertion point, naming the new block
// after the current one plus \p Suffix.
398 llvm::Twine Suffix) {
399 BasicBlock *Old = Builder.GetInsertBlock();
400 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
401}
402
403// This function creates a fake integer value and a fake use for the integer
404// value. It returns the fake value created. This is useful in modeling the
405// extra arguments to the outlined functions.
// NOTE(review): parts of the signature (doxygen lines 406 and 408 — the
// function name/first parameter and the ToBeDeleted parameter) are missing
// from this excerpt — verify against upstream OMPIRBuilder.cpp.
407 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
409 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
410 const Twine &Name = "", bool AsPtr = true,
411 bool Is64Bit = false) {
// The fake value is allocated at the outer alloca point ...
412 Builder.restoreIP(OuterAllocaIP);
413 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
414 Instruction *FakeVal;
415 AllocaInst *FakeValAddr =
416 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
// Everything created here is registered for later deletion by the caller.
417 ToBeDeleted.push_back(FakeValAddr);
418
419 if (AsPtr) {
// Hand out the address itself ...
420 FakeVal = FakeValAddr;
421 } else {
// ... or a loaded value, depending on AsPtr.
422 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
423 ToBeDeleted.push_back(FakeVal);
424 }
425
426 // Generate a fake use of this value
// ... while the use is emitted at the inner alloca point, so the value is
// live across the region being outlined.
427 Builder.restoreIP(InnerAllocaIP);
428 Instruction *UseFakeVal;
429 if (AsPtr) {
430 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
431 } else {
432 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
433 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
434 }
435 ToBeDeleted.push_back(UseFakeVal);
436 return FakeVal;
437}
438
439//===----------------------------------------------------------------------===//
440// OpenMPIRBuilderConfig
441//===----------------------------------------------------------------------===//
442
443namespace {
445/// Values for bit flags for marking which requires clauses have been used.
// These are bit flags and are combined with |/& (enabled by the
// LLVM_MARK_AS_BITMASK_ENUM below); RequiresFlags in the Config accessors
// later in this file stores a combination of them.
446enum OpenMPOffloadingRequiresDirFlags {
447 /// flag undefined.
448 OMP_REQ_UNDEFINED = 0x000,
449 /// no requires directive present.
450 OMP_REQ_NONE = 0x001,
451 /// reverse_offload clause.
452 OMP_REQ_REVERSE_OFFLOAD = 0x002,
453 /// unified_address clause.
454 OMP_REQ_UNIFIED_ADDRESS = 0x004,
455 /// unified_shared_memory clause.
456 OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
457 /// dynamic_allocators clause.
458 OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
459 LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
460};
461
462} // anonymous namespace
463
// NOTE(review): the constructor signature lines (doxygen lines 464 and
// 467-468) and part of the member-initializer list (lines 471-472) are
// missing from this excerpt — verify against upstream OMPIRBuilder.cpp.
// Default constructor: no requires clauses recorded yet.
465 : RequiresFlags(OMP_REQ_UNDEFINED) {}
466
// Full constructor: translates the individual requires-clause booleans
// into the corresponding bit flags.
469 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
470 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
473 RequiresFlags(OMP_REQ_UNDEFINED) {
474 if (HasRequiresReverseOffload)
475 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
476 if (HasRequiresUnifiedAddress)
477 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
478 if (HasRequiresUnifiedSharedMemory)
479 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
480 if (HasRequiresDynamicAllocators)
481 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
482}
483
// NOTE(review): this region holds the RequiresFlags getters/setters of
// OpenMPIRBuilderConfig, but every function's signature line (doxygen
// lines 484, 488, 492, 496, 500, 505, 512, 519, 526) is missing from this
// excerpt — only the bodies remain. Verify names/signatures upstream.
// Getter: reverse_offload bit.
485 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
486}
487
// Getter: unified_address bit.
488 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
489}
490
// Getter: unified_shared_memory bit.
492 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
493}
494
// Getter: dynamic_allocators bit.
496 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
497}
498
// Returns the raw flag word, or OMP_REQ_NONE when nothing was recorded.
500 return hasRequiresFlags() ? RequiresFlags
501 : static_cast<int64_t>(OMP_REQ_NONE);
502}
503
// Setter: reverse_offload bit.
505 if (Value)
506 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
507 else
508 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
509}
510
// Setter: unified_address bit.
512 if (Value)
513 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
514 else
515 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
516}
517
// Setter: unified_shared_memory bit.
519 if (Value)
520 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
521 else
522 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
523}
524
// Setter: dynamic_allocators bit.
526 if (Value)
527 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
528 else
529 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
530}
532
533//===----------------------------------------------------------------------===//
534// OpenMPIRBuilder
535//===----------------------------------------------------------------------===//
536
// NOTE(review): the signature lines (doxygen 537-538) and the definition of
// `Version` (line 540, used in the ArgsVector below) are missing from this
// excerpt — verify against upstream OMPIRBuilder.cpp.
// Packs the target-kernel launch parameters into the flat argument vector
// expected by the __tgt_target_kernel family of runtime calls.
539 SmallVector<Value *> &ArgsVector) {
541 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
542 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
// Teams/threads are passed as 3-element arrays; unused dims stay zero.
543 constexpr size_t MaxDim = 3;
544 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
545
546 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
547
// Flags word: bit 0 = nowait, bits 2+ = dyn-cgroup-mem fallback mode.
548 Value *DynCGroupMemFallbackFlag =
549 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
550 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
551 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
552
553 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
554
555 Value *NumTeams3D =
556 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
557 Value *NumThreads3D =
558 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
// Fill the remaining dimensions (up to MaxDim) when provided.
559 for (unsigned I :
560 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
561 NumTeams3D =
562 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
563 for (unsigned I :
564 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
565 NumThreads3D =
566 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
567
// Order must match the runtime's kernel-args struct layout.
568 ArgsVector = {Version,
569 PointerNum,
570 KernelArgs.RTArgs.BasePointersArray,
571 KernelArgs.RTArgs.PointersArray,
572 KernelArgs.RTArgs.SizesArray,
573 KernelArgs.RTArgs.MapTypesArray,
574 KernelArgs.RTArgs.MapNamesArray,
575 KernelArgs.RTArgs.MappersArray,
576 KernelArgs.NumIterations,
577 Flags,
578 NumTeams3D,
579 NumThreads3D,
580 KernelArgs.DynCGroupMem};
581}
582
// NOTE(review): the function signature (doxygen line 583) and the ArgAttrs
// vector declaration (line 590) are missing from this excerpt — verify
// against upstream OMPIRBuilder.cpp. The body attaches the per-runtime-
// function attribute sets from OMPKinds.def to the declaration Fn.
584 LLVMContext &Ctx = Fn.getContext();
585
586 // Get the function's current attributes.
587 auto Attrs = Fn.getAttributes();
588 auto FnAttrs = Attrs.getFnAttrs();
589 auto RetAttrs = Attrs.getRetAttrs();
591 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
592 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
593
594 // Add AS to FnAS while taking special care with integer extensions.
595 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
596 bool Param = true) -> void {
597 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
598 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
599 if (HasSignExt || HasZeroExt) {
600 assert(AS.getNumAttributes() == 1 &&
601 "Currently not handling extension attr combined with others.");
// Ask the target whether i32 params/returns need an explicit ext attr.
602 if (Param) {
603 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
604 FnAS = FnAS.addAttribute(Ctx, AK);
605 } else if (auto AK =
606 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
607 FnAS = FnAS.addAttribute(Ctx, AK);
608 } else {
609 FnAS = FnAS.addAttributes(Ctx, AS);
610 }
611 };
612
// Materialize the attribute sets named in OMPKinds.def as locals.
613#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
614#include "llvm/Frontend/OpenMP/OMPKinds.def"
615
616 // Add attributes to the function declaration.
617 switch (FnID) {
618#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
619 case Enum: \
620 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
621 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
622 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
623 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
624 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
625 break;
626#include "llvm/Frontend/OpenMP/OMPKinds.def"
627 default:
628 // Attributes are optional.
629 break;
630 }
631}
632
// NOTE(review): the function signature (doxygen lines 633-634) and part of
// the MDNode::get(...) call (line 671) are missing from this excerpt —
// verify against upstream OMPIRBuilder.cpp. Looks up or declares the
// OpenMP runtime function identified by FnID in module M.
635 FunctionType *FnTy = nullptr;
636 Function *Fn = nullptr;
637
638 // Try to find the declation in the module first.
639 switch (FnID) {
640#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
641 case Enum: \
642 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
643 IsVarArg); \
644 Fn = M.getFunction(Str); \
645 break;
646#include "llvm/Frontend/OpenMP/OMPKinds.def"
647 }
648
649 if (!Fn) {
650 // Create a new declaration if we need one.
651 switch (FnID) {
652#define OMP_RTL(Enum, Str, ...) \
653 case Enum: \
654 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
655 break;
656#include "llvm/Frontend/OpenMP/OMPKinds.def"
657 }
658 Fn->setCallingConv(Config.getRuntimeCC());
659 // Add information if the runtime function takes a callback function
660 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
661 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
662 LLVMContext &Ctx = Fn->getContext();
663 MDBuilder MDB(Ctx);
664 // Annotate the callback behavior of the runtime function:
665 // - The callback callee is argument number 2 (microtask).
666 // - The first two arguments of the callback callee are unknown (-1).
667 // - All variadic arguments to the runtime function are passed to the
668 // callback callee.
669 Fn->addMetadata(
670 LLVMContext::MD_callback,
672 2, {-1, -1}, /* VarArgsArePassed */ true)}));
673 }
674 }
675
676 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
677 << " with type " << *Fn->getFunctionType() << "\n");
678 addAttributes(FnID, *Fn);
679
680 } else {
681 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
682 << " with type " << *Fn->getFunctionType() << "\n");
683 }
684
685 assert(Fn && "Failed to create OpenMP runtime function");
686
687 return {FnTy, Fn};
688}
689
// NOTE(review): the signature (doxygen lines 690-691) and line 694 are
// missing from this excerpt; the body lazily creates a ".fini" block and
// runs FiniCB in it — verify name and signature upstream.
692 if (!FiniBB) {
693 Function *ParentFunc = Builder.GetInsertBlock()->getParent();
695 FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
696 Builder.SetInsertPoint(FiniBB);
697 // FiniCB adds the branch to the exit stub.
698 if (Error Err = FiniCB(Builder.saveIP()))
699 return Err;
700 }
701 return FiniBB;
702}
703
// NOTE(review): the first signature line (doxygen line 704) is missing from
// this excerpt — verify name/signature upstream. The body merges the
// finalization block FiniBB into OtherFiniBB.
705 BasicBlock *OtherFiniBB) {
706 // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
707 if (!FiniBB) {
708 FiniBB = OtherFiniBB;
709
710 Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
711 if (Error Err = FiniCB(Builder.saveIP()))
712 return Err;
713
714 return Error::success();
715 }
716
717 // Move instructions from FiniBB to the start of OtherFiniBB.
// Keep FiniBB's terminator (if any) out of the splice range.
718 auto EndIt = FiniBB->end();
719 if (FiniBB->size() >= 1)
720 if (auto Prev = std::prev(EndIt); Prev->isTerminator())
721 EndIt = Prev;
722 OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
723 EndIt);
724
// FiniBB is now redundant; forward its uses and delete it.
725 FiniBB->replaceAllUsesWith(OtherFiniBB);
726 FiniBB->eraseFromParent();
727 FiniBB = OtherFiniBB;
728 return Error::success();
729}
730
// NOTE(review): the signature and the getOrCreateRuntimeFunction call
// (doxygen lines 731-732) are missing from this excerpt — verify upstream.
// Returns the runtime function as a plain Function*.
733 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
734 assert(Fn && "Failed to create OpenMP runtime function pointer");
735 return Fn;
736}
737
// NOTE(review): the signature lines (doxygen 738-739) are missing from this
// excerpt — verify upstream. Emits a call to an OpenMP runtime function,
// stamping it with the configured runtime calling convention.
740 StringRef Name) {
741 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
742 Call->setCallingConv(Config.getRuntimeCC());
743 return Call;
744}
745
// One-time setup: initializes the OpenMP type definitions for module M
// (see initializeTypes).
746void OpenMPIRBuilder::initialize() { initializeTypes(M); }
747
// NOTE(review): the signature (doxygen lines 748-749) and two statements
// inside the loop (lines 759 and 761 — presumably the dyn_cast to
// AllocaInst and the constant-size check) are missing from this excerpt —
// verify against upstream OMPIRBuilder.cpp.
// Moves qualifying allocas from non-entry blocks up to the entry block.
750 BasicBlock &EntryBlock = Function->getEntryBlock();
751 BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();
752
753 // Loop over blocks looking for constant allocas, skipping the entry block
754 // as any allocas there are already in the desired location.
755 for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
756 Block++) {
757 for (auto Inst = Block->getReverseIterator()->begin();
758 Inst != Block->getReverseIterator()->end();) {
// Advance the iterator before moving the instruction it refers to.
760 Inst++;
762 continue;
763 AllocaInst->moveBeforePreserving(MoveLocInst);
764 } else {
765 Inst++;
766 }
767 }
768 }
769}
770
// NOTE(review): the signature and AllocasToMove declaration (doxygen lines
// 771-772), the lambda's return (line 778), the dyn_cast inside the loop
// (line 782) and the move statement (line 790) are missing from this
// excerpt — verify against upstream OMPIRBuilder.cpp.
773
774 auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
775 // TODO: For now, we support simple static allocations, we might need to
776 // move non-static ones as well. However, this will need further analysis to
777 // move the length arguments as well.
779 };
780
// Collect qualifying allocas first, then move them, to avoid mutating the
// block while iterating it.
781 for (llvm::Instruction &Inst : Block)
783 if (ShouldHoistAlloca(*AllocaInst))
784 AllocasToMove.push_back(AllocaInst);
785
786 auto InsertPoint =
787 Block.getParent()->getEntryBlock().getTerminator()->getIterator();
788
789 for (llvm::Instruction *AllocaInst : AllocasToMove)
791}
792
// NOTE(review): the signature (doxygen line 793) and the statement under
// the if (line 797) are missing from this excerpt — only the iteration
// over blocks that properly post-dominate the entry block is visible.
// Verify intent and signature against upstream OMPIRBuilder.cpp.
794 PostDominatorTree PostDomTree(*Func);
795 for (llvm::BasicBlock &BB : *Func)
796 if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
798}
799
// NOTE(review): this is OpenMPIRBuilder's finalization pass over pending
// OutlineInfos, but the signature (doxygen line 800) and the Blocks vector
// declaration (line 802) are missing from this excerpt — verify upstream.
801 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
803 SmallVector<OutlineInfo, 16> DeferredOutlines;
804 for (OutlineInfo &OI : OutlineInfos) {
805 // Skip functions that have not finalized yet; may happen with nested
806 // function generation.
807 if (Fn && OI.getFunction() != Fn) {
808 DeferredOutlines.push_back(OI);
809 continue;
810 }
811
812 ParallelRegionBlockSet.clear();
813 Blocks.clear();
814 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
815
816 Function *OuterFn = OI.getFunction();
817 CodeExtractorAnalysisCache CEAC(*OuterFn);
818 // If we generate code for the target device, we need to allocate
819 // struct for aggregate params in the device default alloca address space.
820 // OpenMP runtime requires that the params of the extracted functions are
821 // passed as zero address space pointers. This flag ensures that
822 // CodeExtractor generates correct code for extracted functions
823 // which are used by OpenMP runtime.
824 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
825 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
826 /* AggregateArgs */ true,
827 /* BlockFrequencyInfo */ nullptr,
828 /* BranchProbabilityInfo */ nullptr,
829 /* AssumptionCache */ nullptr,
830 /* AllowVarArgs */ true,
831 /* AllowAlloca */ true,
832 /* AllocaBlock*/ OI.OuterAllocaBB,
833 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
834
835 LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
836 LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
837 << " Exit: " << OI.ExitBB->getName() << "\n");
838 assert(Extractor.isEligible() &&
839 "Expected OpenMP outlining to be possible!");
840
841 for (auto *V : OI.ExcludeArgsFromAggregate)
842 Extractor.excludeArgFromAggregate(V);
843
844 Function *OutlinedFn =
845 Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);
846
847 // Forward target-cpu, target-features attributes to the outlined function.
848 auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
849 if (TargetCpuAttr.isStringAttribute())
850 OutlinedFn->addFnAttr(TargetCpuAttr);
851
852 auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
853 if (TargetFeaturesAttr.isStringAttribute())
854 OutlinedFn->addFnAttr(TargetFeaturesAttr);
855
856 LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
857 LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
858 assert(OutlinedFn->getReturnType()->isVoidTy() &&
859 "OpenMP outlined functions should not return a value!");
860
861 // For compability with the clang CG we move the outlined function after the
862 // one with the parallel region.
863 OutlinedFn->removeFromParent();
864 M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
865
866 // Remove the artificial entry introduced by the extractor right away, we
867 // made our own entry block after all.
868 {
869 BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
870 assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
871 assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
872 // Move instructions from the to-be-deleted ArtificialEntry to the entry
873 // basic block of the parallel region. CodeExtractor generates
874 // instructions to unwrap the aggregate argument and may sink
875 // allocas/bitcasts for values that are solely used in the outlined region
876 // and do not escape.
877 assert(!ArtificialEntry.empty() &&
878 "Expected instructions to add in the outlined region entry");
879 for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
880 End = ArtificialEntry.rend();
881 It != End;) {
882 Instruction &I = *It;
883 It++;
884
885 if (I.isTerminator()) {
886 // Absorb any debug value that terminator may have
887 if (Instruction *TI = OI.EntryBB->getTerminatorOrNull())
888 TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
889 continue;
890 }
891
892 I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
893 }
894
895 OI.EntryBB->moveBefore(&ArtificialEntry);
896 ArtificialEntry.eraseFromParent();
897 }
898 assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
899 assert(OutlinedFn && OutlinedFn->hasNUses(1));
900
901 // Run a user callback, e.g. to add attributes.
902 if (OI.PostOutlineCB)
903 OI.PostOutlineCB(*OutlinedFn);
904
// NOTE(review): the statement under this if (doxygen line 906) is missing
// from this excerpt — verify upstream.
905 if (OI.FixUpNonEntryAllocas)
907 }
908
909 // Remove work items that have been completed.
910 OutlineInfos = std::move(DeferredOutlines);
911
912 // The createTarget functions embeds user written code into
913 // the target region which may inject allocas which need to
914 // be moved to the entry block of our target or risk malformed
915 // optimisations by later passes, this is only relevant for
916 // the device pass which appears to be a little more delicate
917 // when it comes to optimisations (however, we do not block on
918 // that here, it's up to the inserter to the list to do so).
919 // This notbaly has to occur after the OutlinedInfo candidates
920 // have been extracted so we have an end product that will not
921 // be implicitly adversely affected by any raises unless
922 // intentionally appended to the list.
923 // NOTE: This only does so for ConstantData, it could be extended
924 // to ConstantExpr's with further effort, however, they should
925 // largely be folded when they get here. Extending it to runtime
926 // defined/read+writeable allocation sizes would be non-trivial
927 // (need to factor in movement of any stores to variables the
928 // allocation size depends on, as well as the usual loads,
929 // otherwise it'll yield the wrong result after movement) and
930 // likely be more suitable as an LLVM optimisation pass.
// NOTE(review): the loop this comment describes (doxygen lines 931-932) is
// missing from this excerpt — verify upstream.
933
934 EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
935 [](EmitMetadataErrorKind Kind,
936 const TargetRegionEntryInfo &EntryInfo) -> void {
937 errs() << "Error of kind: " << Kind
938 << " when emitting offload entries and metadata during "
939 "OMPIRBuilder finalization \n";
940 };
941
// NOTE(review): the statement under this if (doxygen line 943) is missing
// from this excerpt — verify upstream.
942 if (!OffloadInfoManager.empty())
944
945 if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
946 std::vector<WeakTrackingVH> LLVMCompilerUsed = {
947 M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
948 emitUsed("llvm.compiler.used", LLVMCompilerUsed);
949 }
950
951 IsFinalized = true;
952}
953
954bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
955
// NOTE(review): the enclosing signature line is elided in this view —
// presumably the OpenMPIRBuilder destructor (confirm against upstream).
// Every registered outlining request must have been consumed by finalize()
// before the builder is destroyed.
957 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
958}
959
// NOTE(review): signature line elided in this view — body creates a constant
// i32 global named `Name` initialized to `Value` and returns it. The weak_odr
// linkage lets multiple TUs emit the same flag without a duplicate-definition
// error (all copies must be identical).
961 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
962 auto *GV =
963 new GlobalVariable(M, I32Ty,
964 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
965 ConstantInt::get(I32Ty, Value), Name);
// Hidden visibility keeps the flag from being exported from the DSO.
966 GV->setVisibility(GlobalValue::HiddenVisibility);
967
968 return GV;
969}
970
// NOTE(review): signature and two interior lines (976, 979 — the UsedArray
// declaration and the per-element cast assignment) are elided in this view.
// Emits an appending-linkage `Name` array (e.g. "llvm.compiler.used") in the
// "llvm.metadata" section so the listed globals are kept alive by the linker.
972 if (List.empty())
973 return;
974
975 // Convert List to what ConstantArray needs.
977 UsedArray.resize(List.size());
978 for (unsigned I = 0, E = List.size(); I != E; ++I)
980 cast<Constant>(&*List[I]), Builder.getPtrTy());
981
982 if (UsedArray.empty())
983 return;
984 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
985
986 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
987 ConstantArray::get(ATy, UsedArray), Name);
988
989 GV->setSection("llvm.metadata");
990}
991
// NOTE(review): leading signature line(s) elided — presumably
// OpenMPIRBuilder::emitKernelExecutionMode (confirm against upstream).
// Emits the "<KernelName>_exec_mode" i8 constant global the device runtime
// reads to learn the kernel's execution mode (generic vs. SPMD).
994 OMPTgtExecModeFlags Mode) {
995 auto *Int8Ty = Builder.getInt8Ty();
996 auto *GVMode = new GlobalVariable(
997 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
998 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
// Protected visibility: the symbol stays visible to the runtime but cannot be
// pre-empted by another DSO's definition.
999 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1000 return GVMode;
1001}
1002
// NOTE(review): first signature line elided — this is
// OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr, ...). Returns a
// cached (or newly created) pointer to a constant `ident_t` struct describing
// a source location + flags, as expected by the kmpc runtime entry points.
1004 uint32_t SrcLocStrSize,
1005 IdentFlag LocFlags,
1006 unsigned Reserve2Flags) {
1007 // Enable "C-mode".
1008 LocFlags |= OMP_IDENT_FLAG_KMPC;
1009
// Cache key packs the flag bits and the reserve-2 field into one integer so
// distinct (flags, reserved) combinations get distinct ident globals.
1010 Constant *&Ident =
1011 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1012 if (!Ident) {
1013 Constant *I32Null = ConstantInt::getNullValue(Int32);
1014 Constant *IdentData[] = {I32Null,
1015 ConstantInt::get(Int32, uint32_t(LocFlags)),
1016 ConstantInt::get(Int32, Reserve2Flags),
1017 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1018
// If the ident struct's string field lives in a different address space than
// the source-location string (e.g. on GPU targets), insert an addrspacecast.
// NOTE(review): the comparison's RHS line (1021) is elided in this view.
1019 size_t SrcLocStrArgIdx = 4;
1020 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1022 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1023 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1024 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1025 Constant *Initializer =
1026 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1027
1028 // Look for existing encoding of the location + flags, not needed but
1029 // minimizes the difference to the existing solution while we transition.
1030 for (GlobalVariable &GV : M.globals())
1031 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1032 if (GV.getInitializer() == Initializer)
1033 Ident = &GV;
1034
// No reusable global found: emit a private, unnamed_addr, 8-byte-aligned one
// in the default globals address space.
1035 if (!Ident) {
1036 auto *GV = new GlobalVariable(
1037 M, OpenMPIRBuilder::Ident,
1038 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1040 M.getDataLayout().getDefaultGlobalsAddressSpace());
1041 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1042 GV->setAlignment(Align(8));
1043 Ident = GV;
1044 }
1045 }
1046
// Callers always receive an `IdentPtr`-typed pointer regardless of the
// address space the global was created in.
1047 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1048}
1049
// NOTE(review): first signature line elided — StringRef overload of
// OpenMPIRBuilder::getOrCreateSrcLocStr. Returns a cached (or new) global
// string constant holding `LocStr`; writes its length to SrcLocStrSize.
1051 uint32_t &SrcLocStrSize) {
1052 SrcLocStrSize = LocStr.size();
1053 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1054 if (!SrcLocStr) {
1055 Constant *Initializer =
1056 ConstantDataArray::getString(M.getContext(), LocStr);
1057
1058 // Look for existing encoding of the location, not needed but minimizes the
1059 // difference to the existing solution while we transition.
1060 for (GlobalVariable &GV : M.globals())
1061 if (GV.isConstant() && GV.hasInitializer() &&
1062 GV.getInitializer() == Initializer)
1063 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1064
1065 SrcLocStr = Builder.CreateGlobalString(
1066 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1067 &M);
1068 }
1069 return SrcLocStr;
1070}
1071
// NOTE(review): first signature line elided — the (FunctionName, FileName,
// Line, Column) overload. Builds the kmpc location string in its canonical
// ";file;function;line;column;;" form and defers to the StringRef overload.
1073 StringRef FileName,
1074 unsigned Line, unsigned Column,
1075 uint32_t &SrcLocStrSize) {
1076 SmallString<128> Buffer;
1077 Buffer.push_back(';');
1078 Buffer.append(FileName);
1079 Buffer.push_back(';');
1080 Buffer.append(FunctionName);
1081 Buffer.push_back(';');
1082 Buffer.append(std::to_string(Line));
1083 Buffer.push_back(';');
1084 Buffer.append(std::to_string(Column));
1085 Buffer.push_back(';');
1086 Buffer.push_back(';');
1087 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1088}
1089
// Fallback location string used when no debug info is available; matches the
// ";file;function;line;column;;" layout with "unknown"/0 placeholders.
// NOTE(review): the second signature line (1091) is elided in this view.
1090Constant *
1092 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1093 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1094}
1095
// NOTE(review): first signature line elided — the DebugLoc overload. Derives
// file/function/line/column from the attached DILocation, falling back to the
// default "unknown" string when there is no debug location.
1097 uint32_t &SrcLocStrSize,
1098 Function *F) {
1099 DILocation *DIL = DL.get();
1100 if (!DIL)
1101 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
// Prefer the DIFile's embedded source as the "file name" when present;
// otherwise use the module name.
1102 StringRef FileName = M.getName();
1103 if (DIFile *DIF = DIL->getFile())
1104 if (std::optional<StringRef> Source = DIF->getSource())
1105 FileName = *Source;
// The subprogram name can be empty (e.g. artificial scopes); fall back to the
// IR function name if a function was provided.
1106 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1107 if (Function.empty() && F)
1108 Function = F->getName();
1109 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1110 DIL->getColumn(), SrcLocStrSize);
1111}
1112
// NOTE(review): first signature line elided — the LocationDescription
// overload. Convenience wrapper: routes through the DebugLoc overload using
// the function that owns the insertion point.
1114 uint32_t &SrcLocStrSize) {
1115 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1116 Loc.IP.getBlock()->getParent());
1117}
1118
// NOTE(review): the signature and the start of the call expression (lines
// 1119-1120) are elided — presumably OpenMPIRBuilder::getOrCreateThreadID,
// which emits a call to __kmpc_global_thread_num(Ident) named
// "omp_global_thread_num" and returns its result.
1121 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1122 "omp_global_thread_num");
1123}
1124
// NOTE(review): leading signature lines elided — this is
// OpenMPIRBuilder::createBarrier. Emits __kmpc_cancel_barrier or
// __kmpc_barrier at Loc, optionally followed by a cancellation check.
1127 bool ForceSimpleCall, bool CheckCancelFlag) {
1128 if (!updateToLocation(Loc))
1129 return Loc.IP;
1130
1131 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1132 // __kmpc_barrier(loc, thread_id);
1133
// Tag the ident with the directive kind that implied the barrier so tools
// (and the runtime) can distinguish implicit from explicit barriers.
1134 IdentFlag BarrierLocFlags;
1135 switch (Kind) {
1136 case OMPD_for:
1137 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1138 break;
1139 case OMPD_sections:
1140 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1141 break;
1142 case OMPD_single:
1143 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1144 break;
1145 case OMPD_barrier:
1146 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1147 break;
1148 default:
1149 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1150 break;
1151 }
1152
// Note: the thread-id lookup deliberately uses an ident WITHOUT the barrier
// flags, while the barrier call itself gets the flagged ident.
1153 uint32_t SrcLocStrSize;
1154 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1155 Value *Args[] = {
1156 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1157 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1158
1159 // If we are in a cancellable parallel region, barriers are cancellation
1160 // points.
1161 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1162 bool UseCancelBarrier =
1163 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1164
// NOTE(review): line 1165 (the statement producing `Result` from the runtime
// call below) is elided in this view.
1166 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1167 ? OMPRTL___kmpc_cancel_barrier
1168 : OMPRTL___kmpc_barrier),
1169 Args);
1170
// A cancel barrier returns a flag; branch to finalization if it indicates
// the region was cancelled.
1171 if (UseCancelBarrier && CheckCancelFlag)
1172 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1173 return Err;
1174
1175 return Builder.saveIP();
1176}
1177
// NOTE(review): leading signature lines elided — this is
// OpenMPIRBuilder::createCancel. Emits an (optionally `if`-guarded) call to
// __kmpc_cancel followed by the shared cancellation-check control flow.
1180 Value *IfCondition,
1181 omp::Directive CanceledDirective) {
1182 if (!updateToLocation(Loc))
1183 return Loc.IP;
1184
1185 // LLVM utilities like blocks with terminators.
1186 auto *UI = Builder.CreateUnreachable();
1187
1188 Instruction *ThenTI = UI, *ElseTI = nullptr;
1189 if (IfCondition) {
1190 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1191
1192 // Even if the if condition evaluates to false, this should count as a
1193 // cancellation point
1194 Builder.SetInsertPoint(ElseTI);
1195 auto ElseIP = Builder.saveIP();
1196
// NOTE(review): line 1197 (the statement producing `IPOrErr` — presumably a
// createCancellationPoint call on the else branch) is elided in this view.
1198 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1199 if (!IPOrErr)
1200 return IPOrErr;
1201 }
1202
1203 Builder.SetInsertPoint(ThenTI);
1204
// Map the directive to the runtime's cancel-kind constant via OMPKinds.def.
1205 Value *CancelKind = nullptr;
1206 switch (CanceledDirective) {
1207#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1208 case DirectiveEnum: \
1209 CancelKind = Builder.getInt32(Value); \
1210 break;
1211#include "llvm/Frontend/OpenMP/OMPKinds.def"
1212 default:
1213 llvm_unreachable("Unknown cancel kind!");
1214 }
1215
1216 uint32_t SrcLocStrSize;
1217 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1218 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1219 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
// NOTE(review): line 1220 (the statement producing `Result` from the
// __kmpc_cancel call below) is elided in this view.
1221 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1222
1223 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1224 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1225 return Err;
1226
1227 // Update the insertion point and remove the terminator we introduced.
1228 Builder.SetInsertPoint(UI->getParent());
1229 UI->eraseFromParent();
1230
1231 return Builder.saveIP();
1232}
1233
// NOTE(review): leading signature lines elided — this is
// OpenMPIRBuilder::createCancellationPoint. Same shape as createCancel but
// unconditional and calling __kmpc_cancellationpoint instead.
1236 omp::Directive CanceledDirective) {
1237 if (!updateToLocation(Loc))
1238 return Loc.IP;
1239
1240 // LLVM utilities like blocks with terminators.
1241 auto *UI = Builder.CreateUnreachable();
1242 Builder.SetInsertPoint(UI);
1243
// Map the directive to the runtime's cancel-kind constant via OMPKinds.def.
1244 Value *CancelKind = nullptr;
1245 switch (CanceledDirective) {
1246#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1247 case DirectiveEnum: \
1248 CancelKind = Builder.getInt32(Value); \
1249 break;
1250#include "llvm/Frontend/OpenMP/OMPKinds.def"
1251 default:
1252 llvm_unreachable("Unknown cancel kind!");
1253 }
1254
1255 uint32_t SrcLocStrSize;
1256 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1257 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1258 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
// NOTE(review): line 1259 (the statement producing `Result` from the
// __kmpc_cancellationpoint call below) is elided in this view.
1260 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1261
1262 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1263 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1264 return Err;
1265
1266 // Update the insertion point and remove the terminator we introduced.
1267 Builder.SetInsertPoint(UI->getParent());
1268 UI->eraseFromParent();
1269
1270 return Builder.saveIP();
1271}
1272
// NOTE(review): the first signature line and two interior lines (1283, 1296 —
// presumably restoring the insertion point and the __tgt_target_kernel call
// producing `Return`) are elided in this view. Packs KernelArgs into an
// on-stack struct and emits the __tgt_target_kernel launch call.
1274 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1275 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1276 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1277 if (!updateToLocation(Loc))
1278 return Loc.IP;
1279
// Allocate the kernel-args struct at the designated alloca insertion point
// so it lives in the function entry, not mid-body.
1280 Builder.restoreIP(AllocaIP);
1281 auto *KernelArgsPtr =
1282 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1284
// Store each argument into its struct field with the preferred alignment for
// its type.
1285 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1286 llvm::Value *Arg =
1287 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1288 Builder.CreateAlignedStore(
1289 KernelArgs[I], Arg,
1290 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1291 }
1292
1293 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1294 NumThreads, HostPtr, KernelArgsPtr};
1295
1297 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1298 OffloadingArgs);
1299
1300 return Builder.saveIP();
1301}
1302
// NOTE(review): first signature line elided — this is
// OpenMPIRBuilder::emitKernelLaunch. Emits the offloading launch via
// emitTargetKernel and, when the runtime reports failure (non-zero return),
// branches to a fallback block that runs the host version of the region.
1304 const LocationDescription &Loc, Value *OutlinedFnID,
1305 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1306 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1307
1308 if (!updateToLocation(Loc))
1309 return Loc.IP;
1310
1311 // On top of the arrays that were filled up, the target offloading call
1312 // takes as arguments the device id as well as the host pointer. The host
1313 // pointer is used by the runtime library to identify the current target
1314 // region, so it only has to be unique and not necessarily point to
1315 // anything. It could be the pointer to the outlined function that
1316 // implements the target region, but we aren't using that so that the
1317 // compiler doesn't need to keep that, and could therefore inline the host
1318 // function if proven worthwhile during optimization.
1319
1320 // From this point on, we need to have an ID of the target region defined.
1321 assert(OutlinedFnID && "Invalid outlined function ID!");
1322 (void)OutlinedFnID;
1323
1324 // Return value of the runtime offloading call.
1325 Value *Return = nullptr;
1326
1327 // Arguments for the target kernel.
1328 SmallVector<Value *> ArgsVector;
1329 getKernelArgsVector(Args, Builder, ArgsVector);
1330
1331 // The target region is an outlined function launched by the runtime
1332 // via calls to __tgt_target_kernel().
1333 //
1334 // Note that on the host and CPU targets, the runtime implementation of
1335 // these calls simply call the outlined function without forking threads.
1336 // The outlined functions themselves have runtime calls to
1337 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1338 // the compiler in emitTeamsCall() and emitParallelCall().
1339 //
1340 // In contrast, on the NVPTX target, the implementation of
1341 // __tgt_target_teams() launches a GPU kernel with the requested number
1342 // of teams and threads so no additional calls to the runtime are required.
1343 // Check the error code and execute the host version if required.
1344 Builder.restoreIP(emitTargetKernel(
1345 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1346 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1347
// Split control flow on the runtime's return code: non-zero means the device
// launch failed and the host fallback must run.
1348 BasicBlock *OffloadFailedBlock =
1349 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1350 BasicBlock *OffloadContBlock =
1351 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1352 Value *Failed = Builder.CreateIsNotNull(Return);
1353 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1354
// The fallback body is produced by the caller-supplied callback; any error it
// reports is propagated unchanged.
1355 auto CurFn = Builder.GetInsertBlock()->getParent();
1356 emitBlock(OffloadFailedBlock, CurFn);
1357 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1358 if (!AfterIP)
1359 return AfterIP.takeError();
1360 Builder.restoreIP(*AfterIP);
1361 emitBranch(OffloadContBlock);
1362 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1363 return Builder.saveIP();
1364}
1365
// NOTE(review): first signature line elided — this is
// OpenMPIRBuilder::emitCancelationCheckImpl. Branches on the runtime's cancel
// flag: zero continues normally, non-zero jumps to the finalization block of
// the innermost finalization-stack entry.
1367 Value *CancelFlag, omp::Directive CanceledDirective) {
1368 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1369 "Unexpected cancellation!");
1370
1371 // For a cancel barrier we create two new blocks.
1372 BasicBlock *BB = Builder.GetInsertBlock();
1373 BasicBlock *NonCancellationBlock;
1374 if (Builder.GetInsertPoint() == BB->end()) {
1375 // TODO: This branch will not be needed once we moved to the
1376 // OpenMPIRBuilder codegen completely.
1377 NonCancellationBlock = BasicBlock::Create(
1378 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1379 } else {
// NOTE(review): line 1381 is elided here — presumably removing the branch
// SplitBlock created so we can insert our own conditional branch below.
1380 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1382 Builder.SetInsertPoint(BB);
1383 }
1384 BasicBlock *CancellationBlock = BasicBlock::Create(
1385 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1386
1387 // Jump to them based on the return value.
1388 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1389 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1390 /* TODO weight */ nullptr, nullptr);
1391
1392 // From the cancellation block we finalize all variables and go to the
1393 // post finalization block that is known to the FiniCB callback.
1394 auto &FI = FinalizationStack.back();
1395 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1396 if (!FiniBBOrErr)
1397 return FiniBBOrErr.takeError();
1398 Builder.SetInsertPoint(CancellationBlock);
1399 Builder.CreateBr(*FiniBBOrErr);
1400
1401 // The continuation block is where code generation continues.
1402 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1403 return Error::success();
1404}
1405
1406// Callback used to create OpenMP runtime calls to support the
1407// omp parallel clause for the device.
1408// We need this callback to replace the call to OutlinedFn in OuterFn
1409// with a call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60).
// NOTE(review): line 1410 — the `static void targetParallelCallback(` line —
// is elided in this view.
1411 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1412 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1413 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1414 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1415 // Add some known attributes.
1416 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1417 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1418 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1419 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1420 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1421 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1422
1423 assert(OutlinedFn.arg_size() >= 2 &&
1424 "Expected at least tid and bounded tid as arguments");
1425 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1426
// The extractor left exactly one call site to the outlined function; that is
// the point we rewrite into the runtime launch.
1427 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1428 assert(CI && "Expected call instruction to outlined function");
1429 CI->getParent()->setName("omp_parallel");
1430
1431 Builder.SetInsertPoint(CI);
1432 Type *PtrTy = OMPIRBuilder->VoidPtr;
1433 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1434
1435 // Add alloca for kernel args
1436 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1437 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1438 AllocaInst *ArgsAlloca =
1439 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1440 Value *Args = ArgsAlloca;
1441 // Add address space cast if array for storing arguments is not allocated
1442 // in address space 0
1443 if (ArgsAlloca->getAddressSpace())
1444 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1445 Builder.restoreIP(CurrentIP);
1446
1447 // Store captured vars which are used by kmpc_parallel_60
// Captured values start at call-argument index 2, after tid & bounded tid.
1448 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1449 Value *V = *(CI->arg_begin() + 2 + Idx);
1450 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1451 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1452 Builder.CreateStore(V, StoreAddress);
1453 }
1454
1455 Value *Cond =
1456 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1457 : Builder.getInt32(1);
1458
1459 // Build kmpc_parallel_60 call
// -1 means "unspecified" for both the thread count and proc-bind arguments.
1460 Value *Parallel60CallArgs[] = {
1461 /* identifier*/ Ident,
1462 /* global thread num*/ ThreadID,
1463 /* if expression */ Cond,
1464 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1465 /* Proc bind */ Builder.getInt32(-1),
1466 /* outlined function */ &OutlinedFn,
1467 /* wrapper function */ NullPtrValue,
1468 /* arguments of the outlined function*/ Args,
1469 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1470 /* strict for number of threads */ Builder.getInt32(0)};
1471
1472 FunctionCallee RTLFn =
1473 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1474
1475 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1476
1477 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1478 << *Builder.GetInsertBlock()->getParent() << "\n");
1479
1480 // Initialize the local TID stack location with the argument value.
1481 Builder.SetInsertPoint(PrivTID);
1482 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1483 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1484 PrivTIDAddr);
1485
1486 // Remove redundant call to the outlined function.
1487 CI->eraseFromParent();
1488
// Clean up modeling-only instructions recorded by the caller.
1489 for (Instruction *I : ToBeDeleted) {
1490 I->eraseFromParent();
1491 }
1492}
1493
1494// Callback used to create OpenMP runtime calls to support the
1495// omp parallel clause for the host.
1496// We need this callback to replace the call to OutlinedFn in OuterFn
1497// with a call to the OpenMP host runtime function (__kmpc_fork_call[_if]).
1498static void
// NOTE(review): line 1499 — the rest of the signature
// (`hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn,`
// presumably) — is elided in this view.
1500 Function *OuterFn, Value *Ident, Value *IfCondition,
1501 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1502 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1503 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1504 FunctionCallee RTLFn;
1505 if (IfCondition) {
1506 RTLFn =
1507 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1508 } else {
1509 RTLFn =
1510 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1511 }
1512 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1513 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1514 LLVMContext &Ctx = F->getContext();
1515 MDBuilder MDB(Ctx);
1516 // Annotate the callback behavior of the __kmpc_fork_call:
1517 // - The callback callee is argument number 2 (microtask).
1518 // - The first two arguments of the callback callee are unknown (-1).
1519 // - All variadic arguments to the __kmpc_fork_call are passed to the
1520 // callback callee.
// NOTE(review): line 1522 (the MDNode/createCallbackEncoding expression this
// call wraps) is elided in this view.
1521 F->addMetadata(LLVMContext::MD_callback,
1523 2, {-1, -1},
1524 /* VarArgsArePassed */ true)}));
1525 }
1526 }
1527 // Add some known attributes.
1528 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1529 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1530 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1531
1532 assert(OutlinedFn.arg_size() >= 2 &&
1533 "Expected at least tid and bounded tid as arguments");
1534 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1535
// The extractor left exactly one call site to the outlined function; rewrite
// it into the fork call.
1536 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1537 CI->getParent()->setName("omp_parallel");
1538 Builder.SetInsertPoint(CI);
1539
1540 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1541 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1542 &OutlinedFn};
1543
1544 SmallVector<Value *, 16> RealArgs;
1545 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1546 if (IfCondition) {
1547 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1548 RealArgs.push_back(Cond);
1549 }
// Forward the captured variables (call-arg index 2 onward) to the fork call.
1550 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1551
1552 // __kmpc_fork_call_if always expects a void ptr as the last argument
1553 // If there are no arguments, pass a null pointer.
1554 auto PtrTy = OMPIRBuilder->VoidPtr;
1555 if (IfCondition && NumCapturedVars == 0) {
1556 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1557 RealArgs.push_back(NullPtrValue);
1558 }
1559
1560 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1561
1562 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1563 << *Builder.GetInsertBlock()->getParent() << "\n");
1564
1565 // Initialize the local TID stack location with the argument value.
1566 Builder.SetInsertPoint(PrivTID);
1567 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1568 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1569 PrivTIDAddr);
1570
1571 // Remove redundant call to the outlined function.
1572 CI->eraseFromParent();
1573
// Clean up modeling-only instructions recorded by the caller.
1574 for (Instruction *I : ToBeDeleted) {
1575 I->eraseFromParent();
1576 }
1577}
1578
1580 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1581 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1582 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1583 omp::ProcBindKind ProcBind, bool IsCancellable) {
1584 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1585
1586 if (!updateToLocation(Loc))
1587 return Loc.IP;
1588
1589 uint32_t SrcLocStrSize;
1590 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1591 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1592 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1593 (ProcBind != OMP_PROC_BIND_default);
1594 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1595 // If we generate code for the target device, we need to allocate
1596 // struct for aggregate params in the device default alloca address space.
1597 // OpenMP runtime requires that the params of the extracted functions are
1598 // passed as zero address space pointers. This flag ensures that extracted
1599 // function arguments are declared in zero address space
1600 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1601
1602 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1603 // only if we compile for host side.
1604 if (NumThreads && !Config.isTargetDevice()) {
1605 Value *Args[] = {
1606 Ident, ThreadID,
1607 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1609 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1610 }
1611
1612 if (ProcBind != OMP_PROC_BIND_default) {
1613 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1614 Value *Args[] = {
1615 Ident, ThreadID,
1616 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1618 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1619 }
1620
1621 BasicBlock *InsertBB = Builder.GetInsertBlock();
1622 Function *OuterFn = InsertBB->getParent();
1623
1624 // Save the outer alloca block because the insertion iterator may get
1625 // invalidated and we still need this later.
1626 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1627
1628 // Vector to remember instructions we used only during the modeling but which
1629 // we want to delete at the end.
1631
1632 // Change the location to the outer alloca insertion point to create and
1633 // initialize the allocas we pass into the parallel region.
1634 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1635 Builder.restoreIP(NewOuter);
1636 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1637 AllocaInst *ZeroAddrAlloca =
1638 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1639 Instruction *TIDAddr = TIDAddrAlloca;
1640 Instruction *ZeroAddr = ZeroAddrAlloca;
1641 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1642 // Add additional casts to enforce pointers in zero address space
1643 TIDAddr = new AddrSpaceCastInst(
1644 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1645 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1646 ToBeDeleted.push_back(TIDAddr);
1647 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1648 PointerType ::get(M.getContext(), 0),
1649 "zero.addr.ascast");
1650 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1651 ToBeDeleted.push_back(ZeroAddr);
1652 }
1653
1654 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1655 // associated arguments in the outlined function, so we delete them later.
1656 ToBeDeleted.push_back(TIDAddrAlloca);
1657 ToBeDeleted.push_back(ZeroAddrAlloca);
1658
1659 // Create an artificial insertion point that will also ensure the blocks we
1660 // are about to split are not degenerated.
1661 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1662
1663 BasicBlock *EntryBB = UI->getParent();
1664 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1665 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1666 BasicBlock *PRegPreFiniBB =
1667 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1668 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1669
1670 auto FiniCBWrapper = [&](InsertPointTy IP) {
1671 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1672 // target to the region exit block.
1673 if (IP.getBlock()->end() == IP.getPoint()) {
1675 Builder.restoreIP(IP);
1676 Instruction *I = Builder.CreateBr(PRegExitBB);
1677 IP = InsertPointTy(I->getParent(), I->getIterator());
1678 }
1679 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1680 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1681 "Unexpected insertion point for finalization call!");
1682 return FiniCB(IP);
1683 };
1684
1685 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1686
1687 // Generate the privatization allocas in the block that will become the entry
1688 // of the outlined function.
1689 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1690 InsertPointTy InnerAllocaIP = Builder.saveIP();
1691
1692 AllocaInst *PrivTIDAddr =
1693 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1694 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1695
1696 // Add some fake uses for OpenMP provided arguments.
1697 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1698 Instruction *ZeroAddrUse =
1699 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1700 ToBeDeleted.push_back(ZeroAddrUse);
1701
1702 // EntryBB
1703 // |
1704 // V
1705 // PRegionEntryBB <- Privatization allocas are placed here.
1706 // |
1707 // V
 1708 // PRegionBodyBB <- BodyGen is invoked here.
1709 // |
1710 // V
1711 // PRegPreFiniBB <- The block we will start finalization from.
1712 // |
1713 // V
1714 // PRegionExitBB <- A common exit to simplify block collection.
1715 //
1716
1717 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1718
1719 // Let the caller create the body.
1720 assert(BodyGenCB && "Expected body generation callback!");
1721 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1722 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1723 return Err;
1724
1725 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1726
1727 OutlineInfo OI;
1728 if (Config.isTargetDevice()) {
1729 // Generate OpenMP target specific runtime call
1730 OI.PostOutlineCB = [=, ToBeDeletedVec =
1731 std::move(ToBeDeleted)](Function &OutlinedFn) {
1732 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1733 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1734 ThreadID, ToBeDeletedVec);
1735 };
1736 OI.FixUpNonEntryAllocas = true;
1737 } else {
1738 // Generate OpenMP host runtime call
1739 OI.PostOutlineCB = [=, ToBeDeletedVec =
1740 std::move(ToBeDeleted)](Function &OutlinedFn) {
1741 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1742 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1743 };
1744 OI.FixUpNonEntryAllocas = true;
1745 }
1746
1747 OI.OuterAllocaBB = OuterAllocaBlock;
1748 OI.EntryBB = PRegEntryBB;
1749 OI.ExitBB = PRegExitBB;
1750
1751 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1753 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1754
1755 CodeExtractorAnalysisCache CEAC(*OuterFn);
1756 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1757 /* AggregateArgs */ false,
1758 /* BlockFrequencyInfo */ nullptr,
1759 /* BranchProbabilityInfo */ nullptr,
1760 /* AssumptionCache */ nullptr,
1761 /* AllowVarArgs */ true,
1762 /* AllowAlloca */ true,
1763 /* AllocationBlock */ OuterAllocaBlock,
1764 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1765
1766 // Find inputs to, outputs from the code region.
1767 BasicBlock *CommonExit = nullptr;
1768 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1769 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1770
1771 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1772 /*CollectGlobalInputs=*/true);
1773
1774 Inputs.remove_if([&](Value *I) {
1776 return GV->getValueType() == OpenMPIRBuilder::Ident;
1777
1778 return false;
1779 });
1780
1781 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1782
1783 FunctionCallee TIDRTLFn =
1784 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1785
1786 auto PrivHelper = [&](Value &V) -> Error {
1787 if (&V == TIDAddr || &V == ZeroAddr) {
1789 return Error::success();
1790 }
1791
1793 for (Use &U : V.uses())
1794 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1795 if (ParallelRegionBlockSet.count(UserI->getParent()))
1796 Uses.insert(&U);
1797
1798 // __kmpc_fork_call expects extra arguments as pointers. If the input
1799 // already has a pointer type, everything is fine. Otherwise, store the
1800 // value onto stack and load it back inside the to-be-outlined region. This
1801 // will ensure only the pointer will be passed to the function.
1802 // FIXME: if there are more than 15 trailing arguments, they must be
1803 // additionally packed in a struct.
1804 Value *Inner = &V;
1805 if (!V.getType()->isPointerTy()) {
1807 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1808
1809 Builder.restoreIP(OuterAllocaIP);
1810 Value *Ptr =
1811 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1812
1813 // Store to stack at end of the block that currently branches to the entry
1814 // block of the to-be-outlined region.
1815 Builder.SetInsertPoint(InsertBB,
1816 InsertBB->getTerminator()->getIterator());
1817 Builder.CreateStore(&V, Ptr);
1818
1819 // Load back next to allocations in the to-be-outlined region.
1820 Builder.restoreIP(InnerAllocaIP);
1821 Inner = Builder.CreateLoad(V.getType(), Ptr);
1822 }
1823
1824 Value *ReplacementValue = nullptr;
1825 CallInst *CI = dyn_cast<CallInst>(&V);
1826 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1827 ReplacementValue = PrivTID;
1828 } else {
1829 InsertPointOrErrorTy AfterIP =
1830 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1831 if (!AfterIP)
1832 return AfterIP.takeError();
1833 Builder.restoreIP(*AfterIP);
1834 InnerAllocaIP = {
1835 InnerAllocaIP.getBlock(),
1836 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1837
1838 assert(ReplacementValue &&
1839 "Expected copy/create callback to set replacement value!");
1840 if (ReplacementValue == &V)
1841 return Error::success();
1842 }
1843
1844 for (Use *UPtr : Uses)
1845 UPtr->set(ReplacementValue);
1846
1847 return Error::success();
1848 };
1849
1850 // Reset the inner alloca insertion as it will be used for loading the values
1851 // wrapped into pointers before passing them into the to-be-outlined region.
1852 // Configure it to insert immediately after the fake use of zero address so
1853 // that they are available in the generated body and so that the
1854 // OpenMP-related values (thread ID and zero address pointers) remain leading
1855 // in the argument list.
1856 InnerAllocaIP = IRBuilder<>::InsertPoint(
1857 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1858
1859 // Reset the outer alloca insertion point to the entry of the relevant block
1860 // in case it was invalidated.
1861 OuterAllocaIP = IRBuilder<>::InsertPoint(
1862 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1863
1864 for (Value *Input : Inputs) {
1865 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1866 if (Error Err = PrivHelper(*Input))
1867 return Err;
1868 }
1869 LLVM_DEBUG({
1870 for (Value *Output : Outputs)
1871 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1872 });
1873 assert(Outputs.empty() &&
1874 "OpenMP outlining should not produce live-out values!");
1875
1876 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1877 LLVM_DEBUG({
1878 for (auto *BB : Blocks)
1879 dbgs() << " PBR: " << BB->getName() << "\n";
1880 });
1881
1882 // Adjust the finalization stack, verify the adjustment, and call the
1883 // finalize function a last time to finalize values between the pre-fini
1884 // block and the exit block if we left the parallel "the normal way".
1885 auto FiniInfo = FinalizationStack.pop_back_val();
1886 (void)FiniInfo;
1887 assert(FiniInfo.DK == OMPD_parallel &&
1888 "Unexpected finalization stack state!");
1889
1890 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1891
1892 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1893 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1894 if (!FiniBBOrErr)
1895 return FiniBBOrErr.takeError();
1896 {
1898 Builder.restoreIP(PreFiniIP);
1899 Builder.CreateBr(*FiniBBOrErr);
1900 // There's currently a branch to omp.par.exit. Delete it. We will get there
1901 // via the fini block
1902 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1903 Term->eraseFromParent();
1904 }
1905
1906 // Register the outlined info.
1907 addOutlineInfo(std::move(OI));
1908
1909 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1910 UI->eraseFromParent();
1911
1912 return AfterIP;
1913}
1914
// OpenMPIRBuilder::emitFlush -- emits the runtime call implementing the OpenMP
// 'flush' construct at the current builder position.
// NOTE(review): the signature line (original line 1915) is missing from this
// extracted listing; only the body is shown below.
 1916 // Build call void __kmpc_flush(ident_t *loc)
// Materialize the source-location string and wrap it in an ident_t; that
// location descriptor is the only argument __kmpc_flush takes.
 1917 uint32_t SrcLocStrSize;
 1918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1919 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
 1920
// NOTE(review): original line 1921 (the call expression that passes Args to the
// OMPRTL___kmpc_flush runtime function) is missing from this listing.
 1922 Args);
 1923}
1924
// OpenMPIRBuilder::createFlush -- public entry point for the OpenMP 'flush'
// directive. NOTE(review): the signature line (original line 1925) is missing
// from this extracted listing.
// Bail out on an invalid location; updateToLocation also positions the
// builder at the directive's insertion point when it succeeds.
 1926 if (!updateToLocation(Loc))
 1927 return;
// Delegate emission of the actual runtime call.
 1928 emitFlush(Loc);
 1929}
1930
// Emits the runtime call implementing the OpenMP 'taskwait' construct.
// NOTE(review): the signature line (original line 1931) is missing from this
// extracted listing; only the body is shown below.
 1932 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
 1933 // global_tid);
// Build the ident_t location descriptor and the calling thread's global id --
// the two arguments __kmpc_omp_taskwait expects.
 1934 uint32_t SrcLocStrSize;
 1935 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1936 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 1937 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
 1938
 1939 // Ignore return result until untied tasks are supported.
// NOTE(review): original line 1940 (the head of the call expression taking the
// __kmpc_omp_taskwait function pointer) is missing from this listing.
 1941 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
 1942}
1943
1949
// Emits the runtime call implementing the OpenMP 'taskyield' construct.
// NOTE(review): the signature line (original line 1950) is missing from this
// extracted listing; only the body is shown below.
 1951 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
 1952 uint32_t SrcLocStrSize;
 1953 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 1954 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Third runtime argument is the (currently unused) end-part flag; always 0.
 1955 Constant *I32Null = ConstantInt::getNullValue(Int32);
 1956 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
 1957
// NOTE(review): original line 1958 (the head of the call expression taking the
// __kmpc_omp_taskyield function pointer) is missing from this listing.
 1959 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
 1960}
1961
1967
// OpenMPIRBuilder::emitTaskDependency -- fills in one kmp_depend_info entry
// (pointed to by Entry) from a DependData record: the dependence's base
// address, its store size, and its dependence kind.
// NOTE(review): the first signature line (original line 1968, declaring the
// Builder and Entry parameters) is missing from this extracted listing.
 1969 const DependData &Dep) {
 1970 // Store the pointer to the variable
// The runtime's base_addr field is an integer, so the pointer is converted
// with ptrtoint to the platform size type.
 1971 Value *Addr = Builder.CreateStructGEP(
 1972 DependInfo, Entry,
 1973 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
 1974 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, SizeTy);
 1975 Builder.CreateStore(DepValPtr, Addr);
 1976 // Store the size of the variable
// Size is taken from the DataLayout store size of the dependence's value type.
 1977 Value *Size = Builder.CreateStructGEP(
 1978 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Len));
 1979 Builder.CreateStore(
 1980 ConstantInt::get(SizeTy,
 1981 M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
 1982 Size);
 1983 // Store the dependency kind
// Flags is an i8 holding the numeric dependence kind (in/out/inout/...).
 1984 Value *Flags = Builder.CreateStructGEP(
 1985 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Flags));
 1986 Builder.CreateStore(ConstantInt::get(Builder.getInt8Ty(),
 1987 static_cast<unsigned int>(Dep.DepKind)),
 1988 Flags);
 1989}
1990
// Processes the dependencies in Dependencies and does the following:
// - Allocates space on the stack for an array of DependInfo objects.
// - Populates each DependInfo object with the relevant information of
//   the corresponding dependence (via emitTaskDependency).
// - The array allocation is inserted in the entry block of the current
//   function; returns the alloca, or nullptr when there are no dependencies.
// NOTE(review): the signature lines (original lines 1996/1998, a static helper
// taking an OpenMPIRBuilder and a list of DependData) are missing from this
// extracted listing.
 1997 OpenMPIRBuilder &OMPBuilder,
 1999 // Early return if we have no dependencies to process
 2000 if (Dependencies.empty())
 2001 return nullptr;
 2002
 2003 // Given a vector of DependData objects, in this function we create an
 2004 // array on the stack that holds kmp_depend_info objects corresponding
 2005 // to each dependency. This is then passed to the OpenMP runtime.
 2006 // For example, if there are 'n' dependencies then the following pseudo
 2007 // code is generated. Assume the first dependence is on a variable 'a'
 2008 //
 2009 // \code{c}
 2010 // DepArray = alloc(n x sizeof(kmp_depend_info);
 2011 // idx = 0;
 2012 // DepArray[idx].base_addr = ptrtoint(&a);
 2013 // DepArray[idx].len = 8;
 2014 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
 2015 // ++idx;
 2016 // DepArray[idx].base_addr = ...;
 2017 // \endcode
 2018
 2019 IRBuilderBase &Builder = OMPBuilder.Builder;
 2020 Type *DependInfo = OMPBuilder.DependInfo;
 2021
// Temporarily move the builder so the array alloca lands in the function's
// entry block, then restore the caller's insertion point.
 2022 Value *DepArray = nullptr;
 2023 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
 2024 Builder.SetInsertPoint(
// NOTE(review): original line 2025 (the insert-point argument -- the entry
// block position passed to SetInsertPoint) is missing from this listing.
 2026
 2027 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
 2028 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
 2029
 2030 Builder.restoreIP(OldIP);
 2031
// Populate each array slot with one dependence record.
 2032 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
 2033 Value *Base =
 2034 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
 2035 OMPBuilder.emitTaskDependency(Builder, Base, Dep);
 2036 }
 2037 return DepArray;
 2038}
2039
 2040/// Create the task duplication function passed to kmpc_taskloop.
///
/// \param PrivatesTy    Struct type holding the task's private data; appended
///                      after the kmp_task_t header in the task allocation.
/// \param PrivatesIndex Field index inside \p PrivatesTy of the context
///                      pointer handed to the duplication callback.
/// \param DupCB         User callback that emits the per-task copy code; may
///                      be null, in which case no function body is generated.
/// \returns the new internal-linkage "omp_taskloop_dup" function (or an error
///          from the callback).
 2041Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
 2042 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
 2043 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
 2044 if (!DupCB)
// NOTE(review): original line 2045 (the return expression for the
// no-callback case, completed by the line below) is missing from this
// extracted listing.
 2046 PointerType::get(Builder.getContext(), ProgramAddressSpace));
 2047
 2048 // From OpenMP Runtime p_task_dup_t:
 2049 // Routine optionally generated by the compiler for setting the lastprivate
 2050 // flag and calling needed constructors for private/firstprivate objects (used
 2051 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
 2052 // lastprivate flag.
 2053 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
 2054
 2055 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
 2056
// Signature mirrors p_task_dup_t: (dest task, src task, lastprivate flag).
 2057 FunctionType *DupFuncTy = FunctionType::get(
 2058 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
 2059 /*isVarArg=*/false);
 2060
 2061 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
 2062 "omp_taskloop_dup", M);
 2063 Value *DestTaskArg = DupFunction->getArg(0);
 2064 Value *SrcTaskArg = DupFunction->getArg(1);
 2065 Value *LastprivateFlagArg = DupFunction->getArg(2);
 2066 DestTaskArg->setName("dest_task");
 2067 SrcTaskArg->setName("src_task");
 2068 LastprivateFlagArg->setName("lastprivate_flag");
 2069
// Guard restores the caller's insertion point when this function returns.
 2070 IRBuilderBase::InsertPointGuard Guard(Builder);
 2071 Builder.SetInsertPoint(
 2072 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
 2073
// Given a kmp_task_t* argument, compute the address of the context-pointer
// field: skip the kmp_task_t header to reach the privates struct, then index
// to PrivatesIndex within it.
 2074 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
 2075 Type *TaskWithPrivatesTy =
 2076 StructType::get(Builder.getContext(), {Task, PrivatesTy});
 2077 Value *TaskPrivates = Builder.CreateGEP(
 2078 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
 2079 Value *ContextPtr = Builder.CreateGEP(
 2080 PrivatesTy, TaskPrivates,
 2081 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
 2082 return ContextPtr;
 2083 };
 2084
 2085 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
 2086 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
 2087
 2088 DestTaskContextPtr->setName("destPtr");
 2089 SrcTaskContextPtr->setName("srcPtr");
 2090
// Let the user callback emit the duplication body; allocas go at the top of
// the entry block, code generation continues at the current position.
 2091 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
 2092 DupFunction->getEntryBlock().begin());
 2093 InsertPointTy CodeGenIP = Builder.saveIP();
 2094 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
 2095 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
 2096 if (!AfterIPOrError)
 2097 return AfterIPOrError.takeError();
 2098 Builder.restoreIP(*AfterIPOrError);
 2099
 2100 Builder.CreateRetVoid();
 2101
 2102 return DupFunction;
 2103}
2104
// Generates IR for the OpenMP 'taskloop' construct: splits out the loop body
// for outlining, creates the task-duplication function, and registers a
// post-outline callback that replaces the extracted call with
// __kmpc_omp_task_alloc / __kmpc_taskloop runtime calls (wrapped in an
// implicit taskgroup unless NoGroup is set) and rewires the outlined loop to
// use the per-task bounds handed in by the runtime.
 2105OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
 2106 const LocationDescription &Loc, InsertPointTy AllocaIP,
 2107 BodyGenCallbackTy BodyGenCB,
 2108 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
 2109 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
 2110 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
 2111 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
 2112 Value *TaskContextStructPtrVal) {
 2113
 2114 if (!updateToLocation(Loc))
 2115 return InsertPointTy();
 2116
 2117 uint32_t SrcLocStrSize;
 2118 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 2119 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 2120
// Split the current block into alloca/body/exit blocks; alloca..exit will be
// outlined into the task entry function.
 2121 BasicBlock *TaskloopExitBB =
 2122 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
 2123 BasicBlock *TaskloopBodyBB =
 2124 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
 2125 BasicBlock *TaskloopAllocaBB =
 2126 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
 2127
 2128 InsertPointTy TaskloopAllocaIP =
 2129 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
 2130 InsertPointTy TaskloopBodyIP =
 2131 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
 2132
 2133 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
 2134 return Err;
 2135
// The canonical loop is produced by the body callback; fetch it afterwards.
 2136 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
 2137 if (!result) {
 2138 return result.takeError();
 2139 }
 2140
 2141 llvm::CanonicalLoopInfo *CLI = result.get();
 2142 OutlineInfo OI;
 2143 OI.EntryBB = TaskloopAllocaBB;
 2144 OI.OuterAllocaBB = AllocaIP.getBlock();
 2145 OI.ExitBB = TaskloopExitBB;
 2146
 2147 // Add the thread ID argument.
 2148 SmallVector<Instruction *> ToBeDeleted;
 2149 // dummy instruction to be used as a fake argument
 2150 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
 2151 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
// Fake lb/ub/step values stand in for the real bounds so the extractor puts
// them into the shareds aggregate; they are replaced in the post-outline
// callback below and the placeholder instructions are deleted.
 2152 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
 2153 TaskloopAllocaIP, "lb", false, true);
 2154 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
 2155 TaskloopAllocaIP, "ub", false, true);
 2156 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
 2157 TaskloopAllocaIP, "step", false, true);
 2158 // For Taskloop, we want to force the bounds being the first 3 inputs in the
 2159 // aggregate struct
 2160 OI.Inputs.insert(FakeLB);
 2161 OI.Inputs.insert(FakeUB);
 2162 OI.Inputs.insert(FakeStep);
 2163 if (TaskContextStructPtrVal)
 2164 OI.Inputs.insert(TaskContextStructPtrVal);
 2165 assert(((TaskContextStructPtrVal && DupCB) ||
 2166 (!TaskContextStructPtrVal && !DupCB)) &&
 2167 "Task context struct ptr and duplication callback must be both set "
 2168 "or both null");
 2169
 2170 // It isn't safe to run the duplication bodygen callback inside the post
 2171 // outlining callback so this has to be run now before we know the real task
 2172 // shareds structure type.
 2173 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
 2174 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
// FakeSharedsTy models the expected shareds layout: {lb, ub, step, context*}.
 2175 Type *FakeSharedsTy = StructType::get(
 2176 Builder.getContext(),
 2177 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
 2178 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
 2179 FakeSharedsTy,
 2180 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
 2181 if (!TaskDupFnOrErr) {
 2182 return TaskDupFnOrErr.takeError();
 2183 }
 2184 Value *TaskDupFn = *TaskDupFnOrErr;
 2185
// Runs after the CodeExtractor has outlined the region: replaces the stale
// call to the outlined function with the taskloop runtime protocol.
 2186 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
 2187 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
 2188 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
 2189 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
 2190 NumOfCollapseLoops](Function &OutlinedFn) mutable {
 2191 // Replace the Stale CI by appropriate RTL function call.
 2192 assert(OutlinedFn.hasOneUse() &&
 2193 "there must be a single user for the outlined function");
 2194 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
 2195
 2196 /* Create the casting for the Bounds Values that can be used when outlining
 2197 * to replace the uses of the fakes with real values */
 2198 BasicBlock *CodeReplBB = StaleCI->getParent();
 2199 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
 2200 Value *CastedLBVal =
 2201 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
 2202 Value *CastedUBVal =
 2203 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
 2204 Value *CastedStepVal =
 2205 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
 2206
 2207 Builder.SetInsertPoint(StaleCI);
 2208
 2209 // Gather the arguments for emitting the runtime call for
 2210 // @__kmpc_omp_task_alloc
 2211 Function *TaskAllocFn =
 2212 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
 2213
 2214 Value *ThreadID = getOrCreateThreadID(Ident);
 2215
// Without a 'nogroup' clause the taskloop is wrapped in an implicit
// taskgroup; the matching __kmpc_end_taskgroup is emitted further below.
 2216 if (!NoGroup) {
 2217 // Emit runtime call for @__kmpc_taskgroup
 2218 Function *TaskgroupFn =
 2219 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
 2220 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
 2221 }
 2222
 2223 // `flags` Argument Configuration
 2224 // Task is tied if (Flags & 1) == 1.
 2225 // Task is untied if (Flags & 1) == 0.
 2226 // Task is final if (Flags & 2) == 2.
 2227 // Task is not final if (Flags & 2) == 0.
 2228 // Task is mergeable if (Flags & 4) == 4.
 2229 // Task is not mergeable if (Flags & 4) == 0.
 2230 // Task is priority if (Flags & 32) == 32.
 2231 // Task is not priority if (Flags & 32) == 0.
 2232 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
 2233 if (Final)
 2234 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
 2235 if (Mergeable)
 2236 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
 2237 if (Priority)
 2238 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
 2239
 2240 Value *TaskSize = Builder.getInt64(
 2241 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
 2242
 2243 AllocaInst *ArgStructAlloca =
// NOTE(review): original line 2244 (the initializer locating the extractor's
// argument-struct alloca from StaleCI's operand) is missing from this
// extracted listing.
 2245 assert(ArgStructAlloca &&
 2246 "Unable to find the alloca instruction corresponding to arguments "
 2247 "for extracted function");
 2248 std::optional<TypeSize> ArgAllocSize =
 2249 ArgStructAlloca->getAllocationSize(M.getDataLayout());
 2250 assert(ArgAllocSize &&
 2251 "Unable to determine size of arguments for extracted function");
 2252 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
 2253
 2254 // Emit the @__kmpc_omp_task_alloc runtime call
 2255 // The runtime call returns a pointer to an area where the task captured
 2256 // variables must be copied before the task is run (TaskData)
 2257 CallInst *TaskData = Builder.CreateCall(
 2258 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
 2259 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
 2260 /*task_func=*/&OutlinedFn});
 2261
// Copy the captured variables into the runtime-provided shareds area.
 2262 Value *Shareds = StaleCI->getArgOperand(1);
 2263 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
 2264 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
 2265 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
 2266 SharedsSize);
 2267 // Get the pointer to loop lb, ub, step from task ptr
 2268 // and set up the lowerbound,upperbound and step values
 2269 llvm::Value *Lb = Builder.CreateGEP(
 2270 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
 2271
 2272 llvm::Value *Ub = Builder.CreateGEP(
 2273 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
 2274
 2275 llvm::Value *Step = Builder.CreateGEP(
 2276 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
 2277 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
 2278
 2279 // set up the arguments for emitting kmpc_taskloop runtime call
 2280 // setting values for ifval, nogroup, sched, grainsize, task_dup
 2281 Value *IfCondVal =
 2282 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
 2283 : Builder.getInt32(1);
 2284 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
 2285 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
 2286 Value *NoGroupVal = Builder.getInt32(1);
 2287 Value *SchedVal = Builder.getInt32(Sched);
 2288 Value *GrainSizeVal =
 2289 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
 2290 : Builder.getInt64(0);
 2291 Value *TaskDup = TaskDupFn;
 2292
 2293 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
 2294 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
 2295
 2296 // taskloop runtime call
 2297 Function *TaskloopFn =
 2298 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
 2299 Builder.CreateCall(TaskloopFn, Args);
 2300
 2301 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
 2302 // nogroup is not defined
 2303 if (!NoGroup) {
 2304 Function *EndTaskgroupFn =
 2305 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
 2306 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
 2307 }
 2308
 2309 StaleCI->eraseFromParent();
 2310
 2311 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
 2312
// Inside the outlined function, arg 1 is kmp_task_t*; load the shareds
// pointer once and redirect all other uses of the argument to it.
 2313 LoadInst *SharedsOutlined =
 2314 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
 2315 OutlinedFn.getArg(1)->replaceUsesWithIf(
 2316 SharedsOutlined,
 2317 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
 2318
 2319 Value *IV = CLI->getIndVar();
 2320 Type *IVTy = IV->getType();
 2321 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
 2322
 2323 // When outlining, CodeExtractor will create GEP's to the LowerBound and
 2324 // UpperBound. These GEP's can be reused for loading the tasks respective
 2325 // bounds.
// Scan the alloca block for the GEPs into shareds fields 0/1/2 (lb/ub/step)
// and the loads from them.
 2326 Value *TaskLB = nullptr;
 2327 Value *TaskUB = nullptr;
 2328 Value *TaskStep = nullptr;
 2329 Value *LoadTaskLB = nullptr;
 2330 Value *LoadTaskUB = nullptr;
 2331 Value *LoadTaskStep = nullptr;
 2332 for (Instruction &I : *TaskloopAllocaBB) {
 2333 if (I.getOpcode() == Instruction::GetElementPtr) {
 2334 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
 2335 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
 2336 switch (CI->getZExtValue()) {
 2337 case 0:
 2338 TaskLB = &I;
 2339 break;
 2340 case 1:
 2341 TaskUB = &I;
 2342 break;
 2343 case 2:
 2344 TaskStep = &I;
 2345 break;
 2346 }
 2347 }
 2348 } else if (I.getOpcode() == Instruction::Load) {
 2349 LoadInst &Load = cast<LoadInst>(I);
 2350 if (Load.getPointerOperand() == TaskLB) {
 2351 assert(TaskLB != nullptr && "Expected value for TaskLB");
 2352 LoadTaskLB = &I;
 2353 } else if (Load.getPointerOperand() == TaskUB) {
 2354 assert(TaskUB != nullptr && "Expected value for TaskUB");
 2355 LoadTaskUB = &I;
 2356 } else if (Load.getPointerOperand() == TaskStep) {
 2357 assert(TaskStep != nullptr && "Expected value for TaskStep");
 2358 LoadTaskStep = &I;
 2359 }
 2360 }
 2361 }
 2362
 2363 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
 2364
 2365 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
 2366 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
 2367 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
// Recompute the trip count from the per-task bounds: (ub - lb)/step + 1.
 2368 Value *TripCountMinusOne = Builder.CreateSDiv(
 2369 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
 2370 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
 2371 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
 2372 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
 2373 // set the trip count in the CLI
 2374 CLI->setTripCount(CastedTripCount);
 2375
 2376 Builder.SetInsertPoint(CLI->getBody(),
 2377 CLI->getBody()->getFirstInsertionPt());
 2378
 2379 if (NumOfCollapseLoops > 1) {
 2380 llvm::SmallVector<User *> UsersToReplace;
 2381 // When using the collapse clause, the bounds of the loop have to be
 2382 // adjusted to properly represent the iterator of the outer loop.
 2383 Value *IVPlusTaskLB = Builder.CreateAdd(
 2384 CLI->getIndVar(),
 2385 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
 2386 // To ensure every Use is correctly captured, we first want to record
 2387 // which users to replace the value in, and then replace the value.
 2388 for (auto IVUse = CLI->getIndVar()->uses().begin();
 2389 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
 2390 User *IVUser = IVUse->getUser();
 2391 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
 2392 if (Op->getOpcode() == Instruction::URem ||
 2393 Op->getOpcode() == Instruction::UDiv) {
 2394 UsersToReplace.push_back(IVUser);
 2395 }
 2396 }
 2397 }
 2398 for (User *User : UsersToReplace) {
 2399 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
 2400 }
 2401 } else {
 2402 // The canonical loop is generated with a fixed lower bound. We need to
 2403 // update the index calculation code to use the task's lower bound. The
 2404 // generated code looks like this:
 2405 // %omp_loop.iv = phi ...
 2406 // ...
 2407 // %tmp = mul [type] %omp_loop.iv, step
 2408 // %user_index = add [type] tmp, lb
 2409 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
 2410 // of the normalised induction variable:
 2411 // 1. This one: converting the normalised IV to the user IV
 2412 // 2. The increment (add)
 2413 // 3. The comparison against the trip count (icmp)
 2414 // (1) is the only use that is a mul followed by an add so this cannot
 2415 // match other IR.
 2416 assert(CLI->getIndVar()->getNumUses() == 3 &&
 2417 "Canonical loop should have exactly three uses of the ind var");
 2418 for (User *IVUser : CLI->getIndVar()->users()) {
 2419 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
 2420 if (Mul->getOpcode() == Instruction::Mul) {
 2421 for (User *MulUser : Mul->users()) {
 2422 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
 2423 if (Add->getOpcode() == Instruction::Add) {
 2424 Add->setOperand(1, CastedTaskLB);
 2425 }
 2426 }
 2427 }
 2428 }
 2429 }
 2430 }
 2431 }
 2432
// Swap the placeholder bounds for the real (casted) clause values, then drop
// the placeholder instructions in reverse creation order.
 2433 FakeLB->replaceAllUsesWith(CastedLBVal);
 2434 FakeUB->replaceAllUsesWith(CastedUBVal);
 2435 FakeStep->replaceAllUsesWith(CastedStepVal);
 2436 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
 2437 I->eraseFromParent();
 2438 }
 2439 };
 2440
 2441 addOutlineInfo(std::move(OI));
 2442 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
 2443 return Builder.saveIP();
 2444}
2445
// NOTE(review): the signature lines (original lines 2446-2447) of this helper
// are missing from this extracted listing. The body builds a literal struct
// type {intptr, intptr, i32} from the module's pointer width -- presumably a
// runtime record layout such as the task-affinity info entry used by
// __kmpc_omp_reg_task_with_affinity below; confirm against the full source.
 2448 M.getContext(), M.getDataLayout().getPointerSizeInBits());
 2449 return llvm::StructType::get(IntPtrTy, IntPtrTy,
 2450 llvm::Type::getInt32Ty(M.getContext()));
 2451}
2452
2454 const LocationDescription &Loc, InsertPointTy AllocaIP,
2455 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2456 const DependenciesInfo &Dependencies, const AffinityData &Affinities,
2457 bool Mergeable, Value *EventHandle, Value *Priority) {
2458
2459 if (!updateToLocation(Loc))
2460 return InsertPointTy();
2461
2462 uint32_t SrcLocStrSize;
2463 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2464 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2465 // The current basic block is split into four basic blocks. After outlining,
2466 // they will be mapped as follows:
2467 // ```
2468 // def current_fn() {
2469 // current_basic_block:
2470 // br label %task.exit
2471 // task.exit:
2472 // ; instructions after task
2473 // }
2474 // def outlined_fn() {
2475 // task.alloca:
2476 // br label %task.body
2477 // task.body:
2478 // ret void
2479 // }
2480 // ```
2481 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2482 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2483 BasicBlock *TaskAllocaBB =
2484 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2485
2486 InsertPointTy TaskAllocaIP =
2487 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2488 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2489 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2490 return Err;
2491
2492 OutlineInfo OI;
2493 OI.EntryBB = TaskAllocaBB;
2494 OI.OuterAllocaBB = AllocaIP.getBlock();
2495 OI.ExitBB = TaskExitBB;
2496
2497 // Add the thread ID argument.
2500 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2501
2502 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2503 Affinities, Mergeable, Priority, EventHandle,
2504 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
2505 // Replace the Stale CI by appropriate RTL function call.
2506 assert(OutlinedFn.hasOneUse() &&
2507 "there must be a single user for the outlined function");
2508 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2509
2510 // HasShareds is true if any variables are captured in the outlined region,
2511 // false otherwise.
2512 bool HasShareds = StaleCI->arg_size() > 1;
2513 Builder.SetInsertPoint(StaleCI);
2514
2515 // Gather the arguments for emitting the runtime call for
2516 // @__kmpc_omp_task_alloc
2517 Function *TaskAllocFn =
2518 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2519
2520 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2521 // call.
2522 Value *ThreadID = getOrCreateThreadID(Ident);
2523
2524 // Argument - `flags`
2525 // Task is tied iff (Flags & 1) == 1.
2526 // Task is untied iff (Flags & 1) == 0.
2527 // Task is final iff (Flags & 2) == 2.
2528 // Task is not final iff (Flags & 2) == 0.
2529 // Task is mergeable iff (Flags & 4) == 4.
2530 // Task is not mergeable iff (Flags & 4) == 0.
2531 // Task is priority iff (Flags & 32) == 32.
2532 // Task is not priority iff (Flags & 32) == 0.
2533 // TODO: Handle the other flags.
2534 Value *Flags = Builder.getInt32(Tied);
2535 if (Final) {
2536 Value *FinalFlag =
2537 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2538 Flags = Builder.CreateOr(FinalFlag, Flags);
2539 }
2540
2541 if (Mergeable)
2542 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2543 if (Priority)
2544 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2545
2546 // Argument - `sizeof_kmp_task_t` (TaskSize)
2547 // Tasksize refers to the size in bytes of kmp_task_t data structure
2548 // including private vars accessed in task.
2549 // TODO: add kmp_task_t_with_privates (privates)
2550 Value *TaskSize = Builder.getInt64(
2551 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2552
2553 // Argument - `sizeof_shareds` (SharedsSize)
2554 // SharedsSize refers to the shareds array size in the kmp_task_t data
2555 // structure.
2556 Value *SharedsSize = Builder.getInt64(0);
2557 if (HasShareds) {
2558 AllocaInst *ArgStructAlloca =
2560 assert(ArgStructAlloca &&
2561 "Unable to find the alloca instruction corresponding to arguments "
2562 "for extracted function");
2563 std::optional<TypeSize> ArgAllocSize =
2564 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2565 assert(ArgAllocSize &&
2566 "Unable to determine size of arguments for extracted function");
2567 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2568 }
2569 // Emit the @__kmpc_omp_task_alloc runtime call
2570 // The runtime call returns a pointer to an area where the task captured
2571 // variables must be copied before the task is run (TaskData)
2573 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2574 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2575 /*task_func=*/&OutlinedFn});
2576
2577 if (Affinities.Count && Affinities.Info) {
2579 OMPRTL___kmpc_omp_reg_task_with_affinity);
2580
2581 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2582 Affinities.Count, Affinities.Info});
2583 }
2584
2585 // Emit detach clause initialization.
2586 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2587 // task_descriptor);
2588 if (EventHandle) {
2590 OMPRTL___kmpc_task_allow_completion_event);
2591 llvm::Value *EventVal =
2592 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2593 llvm::Value *EventHandleAddr =
2594 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2595 Builder.getPtrTy(0));
2596 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2597 Builder.CreateStore(EventVal, EventHandleAddr);
2598 }
2599 // Copy the arguments for outlined function
2600 if (HasShareds) {
2601 Value *Shareds = StaleCI->getArgOperand(1);
2602 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2603 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2604 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2605 SharedsSize);
2606 }
2607
2608 if (Priority) {
2609 //
2610 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2611 // we populate the priority information into the "kmp_task_t" here
2612 //
2613 // The struct "kmp_task_t" definition is available in kmp.h
2614 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2615 // data2 is used for priority
2616 //
2617 Type *Int32Ty = Builder.getInt32Ty();
2618 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2619 // kmp_task_t* => { ptr }
2620 Type *TaskPtr = StructType::get(VoidPtr);
2621 Value *TaskGEP =
2622 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2623 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2624 Type *TaskStructType = StructType::get(
2625 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2626 Value *PriorityData = Builder.CreateInBoundsGEP(
2627 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2628 // kmp_cmplrdata_t => { ptr, ptr }
2629 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2630 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2631 PriorityData, {Zero, Zero});
2632 Builder.CreateStore(Priority, CmplrData);
2633 }
2634
2635 Value *DepArray = nullptr;
2636 Value *NumDeps = nullptr;
2637 if (Dependencies.DepArray) {
2638 DepArray = Dependencies.DepArray;
2639 NumDeps = Dependencies.NumDeps;
2640 } else if (!Dependencies.Deps.empty()) {
2641 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
2642 NumDeps = Builder.getInt32(Dependencies.Deps.size());
2643 }
2644
2645 // In the presence of the `if` clause, the following IR is generated:
2646 // ...
2647 // %data = call @__kmpc_omp_task_alloc(...)
2648 // br i1 %if_condition, label %then, label %else
2649 // then:
2650 // call @__kmpc_omp_task(...)
2651 // br label %exit
2652 // else:
2653 // ;; Wait for resolution of dependencies, if any, before
2654 // ;; beginning the task
2655 // call @__kmpc_omp_wait_deps(...)
2656 // call @__kmpc_omp_task_begin_if0(...)
2657 // call @outlined_fn(...)
2658 // call @__kmpc_omp_task_complete_if0(...)
2659 // br label %exit
2660 // exit:
2661 // ...
2662 if (IfCondition) {
2663 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2664 // terminator.
2665 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2666 Instruction *IfTerminator =
2667 Builder.GetInsertPoint()->getParent()->getTerminator();
2668 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2669 Builder.SetInsertPoint(IfTerminator);
2670 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2671 &ElseTI);
2672 Builder.SetInsertPoint(ElseTI);
2673
2674 if (DepArray) {
2675 Function *TaskWaitFn =
2676 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2678 TaskWaitFn,
2679 {Ident, ThreadID, NumDeps, DepArray,
2680 ConstantInt::get(Builder.getInt32Ty(), 0),
2682 }
2683 Function *TaskBeginFn =
2684 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2685 Function *TaskCompleteFn =
2686 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2687 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2688 CallInst *CI = nullptr;
2689 if (HasShareds)
2690 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2691 else
2692 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2693 CI->setDebugLoc(StaleCI->getDebugLoc());
2694 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2695 Builder.SetInsertPoint(ThenTI);
2696 }
2697
2698 if (DepArray) {
2699 Function *TaskFn =
2700 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2702 TaskFn,
2703 {Ident, ThreadID, TaskData, NumDeps, DepArray,
2704 ConstantInt::get(Builder.getInt32Ty(), 0),
2706
2707 } else {
2708 // Emit the @__kmpc_omp_task runtime call to spawn the task
2709 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2710 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2711 }
2712
2713 StaleCI->eraseFromParent();
2714
2715 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2716 if (HasShareds) {
2717 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2718 OutlinedFn.getArg(1)->replaceUsesWithIf(
2719 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2720 }
2721
2722 for (Instruction *I : llvm::reverse(ToBeDeleted))
2723 I->eraseFromParent();
2724 };
2725
2726 addOutlineInfo(std::move(OI));
2727 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2728
2729 return Builder.saveIP();
2730}
2731
// Emit an OpenMP `taskgroup` region: a __kmpc_taskgroup runtime call before
// the user-supplied body and a matching __kmpc_end_taskgroup call after it.
// Returns the insertion point after the region (or the body's Error).
// NOTE(review): this listing is missing the immediately preceding source
// line(s) that carry the function's return type and qualified name
// (presumably OpenMPIRBuilder::createTaskgroup) — confirm against the full
// file.
2734 InsertPointTy AllocaIP,
2735 BodyGenCallbackTy BodyGenCB) {
2736 if (!updateToLocation(Loc))
2737 return InsertPointTy();
2738
2739 uint32_t SrcLocStrSize;
2740 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2741 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2742 Value *ThreadID = getOrCreateThreadID(Ident);
2743
2744 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2745 Function *TaskgroupFn =
2746 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2747 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2748
// Split off the exit block first so the end-of-taskgroup call has a stable
// insertion point regardless of what the body callback emits.
2749 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2750 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2751 return Err;
2752
2753 Builder.SetInsertPoint(TaskgroupExitBB);
2754 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2755 Function *EndTaskgroupFn =
2756 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2757 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2758
2759 return Builder.saveIP();
2760}
2761
// Lower a `sections` construct. Each section body becomes one case of a
// switch keyed on a canonical loop's induction variable; that loop is then
// lowered as a static workshare loop so the runtime distributes section
// indices across threads. The finalization callback pushed here is popped
// and merged into the loop-finalization block at the end.
// NOTE(review): this listing is missing the function header line(s)
// (internal 2762/2764, presumably the return type, qualified name, and the
// SectionCBs parameter) — confirm against the full file.
2763 const LocationDescription &Loc, InsertPointTy AllocaIP,
2765 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2766 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2767
2768 if (!updateToLocation(Loc))
2769 return Loc.IP;
2770
// Register the finalization so nested cancellation points can find it; it
// is popped below once the workshare loop has been materialized.
2771 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2772
2773 // Each section is emitted as a switch case
2774 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2775 // -> OMP.createSection() which generates the IR for each section
2776 // Iterate through all sections and emit a switch construct:
2777 // switch (IV) {
2778 // case 0:
2779 // <SectionStmt[0]>;
2780 // break;
2781 // ...
2782 // case <NumSection> - 1:
2783 // <SectionStmt[<NumSection> - 1]>;
2784 // break;
2785 // }
2786 // ...
2787 // section_loop.after:
2788 // <FiniCB>;
2789 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2790 Builder.restoreIP(CodeGenIP);
// NOTE(review): the declaration receiving this splitBBWithSuffix result
// (internal line 2791, presumably `BasicBlock *Continue =`) is missing from
// this listing.
2792 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2793 Function *CurFn = Continue->getParent();
2794 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2795
2796 unsigned CaseNumber = 0;
2797 for (auto SectionCB : SectionCBs) {
// NOTE(review): the head of this BasicBlock::Create call (internal line
// 2798, presumably `BasicBlock *CaseBB = BasicBlock::Create(`) is missing
// from this listing.
2799 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2800 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2801 Builder.SetInsertPoint(CaseBB);
// Each case body is generated in front of an unconditional branch back to
// the continue block, so the case always rejoins the loop.
2802 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
2803 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2804 CaseEndBr->getIterator()}))
2805 return Err;
2806 CaseNumber++;
2807 }
2808 // remove the existing terminator from body BB since there can be no
2809 // terminators after switch/case
2810 return Error::success();
2811 };
2812 // Loop body ends here
2813 // LowerBound, UpperBound, and STride for createCanonicalLoop
2814 Type *I32Ty = Type::getInt32Ty(M.getContext());
2815 Value *LB = ConstantInt::get(I32Ty, 0);
2816 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2817 Value *ST = ConstantInt::get(I32Ty, 1);
// NOTE(review): the head of this createCanonicalLoop call (internal line
// 2818, presumably `Expected<CanonicalLoopInfo *> LoopInfo =
// createCanonicalLoop(`) is missing from this listing.
2819 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2820 if (!LoopInfo)
2821 return LoopInfo.takeError();
2822
2823 InsertPointOrErrorTy WsloopIP =
2824 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2825 WorksharingLoopType::ForStaticLoop, !IsNowait);
2826 if (!WsloopIP)
2827 return WsloopIP.takeError();
2828 InsertPointTy AfterIP = *WsloopIP;
2829
2830 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2831 assert(LoopFini && "Bad structure of static workshare loop finalization");
2832
2833 // Apply the finalization callback in LoopAfterBB
2834 auto FiniInfo = FinalizationStack.pop_back_val();
2835 assert(FiniInfo.DK == OMPD_sections &&
2836 "Unexpected finalization stack state!");
2837 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2838 return Err;
2839
2840 return AfterIP;
2841}
2842
// Emit a single `section` inside a `sections` region as an inlined OpenMP
// region. The finalization callback is wrapped: when the insertion point is
// at the very end of a block (no terminator — the cancellation path), a
// branch to the region's exit block is created before invoking FiniCB.
// NOTE(review): this listing is missing the function header line(s)
// (internal 2843–2844, presumably the return type and qualified name
// OpenMPIRBuilder::createSection plus the Loc parameter) — confirm against
// the full file.
2845 BodyGenCallbackTy BodyGenCB,
2846 FinalizeCallbackTy FiniCB) {
2847 if (!updateToLocation(Loc))
2848 return Loc.IP;
2849
2850 auto FiniCBWrapper = [&](InsertPointTy IP) {
// Fast path: the block already has instructions after IP, so FiniCB can run
// at the given point unchanged.
2851 if (IP.getBlock()->end() != IP.getPoint())
2852 return FiniCB(IP);
2853 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2854 // will fail because that function requires the Finalization Basic Block to
2855 // have a terminator, which is already removed by EmitOMPRegionBody.
2856 // IP is currently at cancelation block.
2857 // We need to backtrack to the condition block to fetch
2858 // the exit block and create a branch from cancelation
2859 // to exit block.
// NOTE(review): internal line 2860 is missing from this listing — likely a
// continuation of the comment above; confirm against the full file.
2861 Builder.restoreIP(IP);
2862 auto *CaseBB = Loc.IP.getBlock();
2863 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2864 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2865 Instruction *I = Builder.CreateBr(ExitBB);
2866 IP = InsertPointTy(I->getParent(), I->getIterator());
2867 return FiniCB(IP);
2868 };
2869
2870 Directive OMPD = Directive::OMPD_sections;
2871 // Since we are using Finalization Callback here, HasFinalize
2872 // and IsCancellable have to be true
2873 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2874 /*Conditional*/ false, /*hasFinalize*/ true,
2875 /*IsCancellable*/ true);
2876}
2877
2883
// Return the hardware thread id within the current block via the
// __kmpc_get_hardware_thread_id_in_block device-runtime entry point.
// NOTE(review): this listing is missing the interior line(s) (internal
// 2885–2886, presumably `return createRuntimeFunctionCall(
// getOrCreateRuntimeFunction(M,`) — confirm against the full file.
2884Value *OpenMPIRBuilder::getGPUThreadID() {
2887 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2888 {});
2889}
2890
// Return the warp size via the __kmpc_get_warp_size device-runtime entry
// point.
// NOTE(review): this listing is missing the interior line (internal 2892,
// presumably the head of the createRuntimeFunctionCall/return expression) —
// confirm against the full file.
2891Value *OpenMPIRBuilder::getGPUWarpSize() {
2893 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2894}
2895
2896Value *OpenMPIRBuilder::getNVPTXWarpID() {
2897 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2898 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2899}
2900
2901Value *OpenMPIRBuilder::getNVPTXLaneID() {
2902 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2903 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2904 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2905 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2906 "nvptx_lane_id");
2907}
2908
2909Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2910 Type *ToType) {
2911 Type *FromType = From->getType();
2912 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2913 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2914 assert(FromSize > 0 && "From size must be greater than zero");
2915 assert(ToSize > 0 && "To size must be greater than zero");
2916 if (FromType == ToType)
2917 return From;
2918 if (FromSize == ToSize)
2919 return Builder.CreateBitCast(From, ToType);
2920 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2921 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2922 InsertPointTy SaveIP = Builder.saveIP();
2923 Builder.restoreIP(AllocaIP);
2924 Value *CastItem = Builder.CreateAlloca(ToType);
2925 Builder.restoreIP(SaveIP);
2926
2927 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2928 CastItem, Builder.getPtrTy(0));
2929 Builder.CreateStore(From, ValCastItem);
2930 return Builder.CreateLoad(ToType, CastItem);
2931}
2932
// Shuffle `Element` in from a remote lane via __kmpc_shuffle_int32/int64:
// the value is first widened to a 32- or 64-bit integer (chosen from its
// store size), shuffled with the given lane `Offset` and warp size, then
// cast back via castValueToType.
// NOTE(review): this listing is missing the declaration head of
// `ShuffleFunc` (internal line 2945, presumably
// `Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(`) — confirm
// against the full file.
2933Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2934 Value *Element,
2935 Type *ElementType,
2936 Value *Offset) {
2937 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2938 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2939
2940 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2941 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2942 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2943 Value *WarpSize =
2944 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2946 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2947 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2948 Value *WarpSizeCast =
2949 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2950 Value *ShuffleCall =
2951 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2952 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2953}
2954
// Shuffle data of ElemType from SrcAddr in from a remote lane and store it
// to DstAddr, chunking the value into progressively smaller integer pieces
// (8, 4, 2, 1 bytes). When a chunk size divides the remaining size more
// than once, an explicit pre-cond/then/exit loop walks the data; a single
// chunk is shuffled and stored inline. Left byte-identical: the emission is
// order-sensitive (PHIs, emitBlock/emitBranch sequencing).
2955void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2956 Value *DstAddr, Type *ElemType,
2957 Value *Offset, Type *ReductionArrayTy,
2958 bool IsByRefElem) {
2959 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2960 // Create the loop over the big sized data.
2961 // ptr = (void*)Elem;
2962 // ptrEnd = (void*) Elem + 1;
2963 // Step = 8;
2964 // while (ptr + Step < ptrEnd)
2965 // shuffle((int64_t)*ptr);
2966 // Step = 4;
2967 // while (ptr + Step < ptrEnd)
2968 // shuffle((int32_t)*ptr);
2969 // ...
2970 Type *IndexTy = Builder.getIndexTy(
2971 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace())&;
2972 Value *ElemPtr = DstAddr;
2973 Value *Ptr = SrcAddr;
2974 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
// Skip chunk sizes larger than what remains to be copied.
2975 if (Size < IntSize)
2976 continue;
2977 Type *IntType = Builder.getIntNTy(IntSize * 8);
2978 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2979 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2980 Value *SrcAddrGEP =
2981 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2982 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2983 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2984
2985 Function *CurFunc = Builder.GetInsertBlock()->getParent();
// Multiple chunks of this size: emit a loop with PHIs advancing both the
// source and destination cursors until fewer than IntSize bytes remain.
2986 if ((Size / IntSize) > 1) {
2987 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2988 SrcAddrGEP, Builder.getPtrTy());
2989 BasicBlock *PreCondBB =
2990 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2991 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2992 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2993 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2994 emitBlock(PreCondBB, CurFunc);
2995 PHINode *PhiSrc =
2996 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2997 PhiSrc->addIncoming(Ptr, CurrentBB);
2998 PHINode *PhiDest =
2999 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
3000 PhiDest->addIncoming(ElemPtr, CurrentBB);
3001 Ptr = PhiSrc;
3002 ElemPtr = PhiDest;
3003 Value *PtrDiff = Builder.CreatePtrDiff(
3004 Builder.getInt8Ty(), PtrEnd,
3005 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
3006 Builder.CreateCondBr(
3007 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
3008 ExitBB);
3009 emitBlock(ThenBB, CurFunc);
3010 Value *Res = createRuntimeShuffleFunction(
3011 AllocaIP,
3012 Builder.CreateAlignedLoad(
3013 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3014 IntType, Offset);
3015 Builder.CreateAlignedStore(Res, ElemPtr,
3016 M.getDataLayout().getPrefTypeAlign(ElemType));
3017 Value *LocalPtr =
3018 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3019 Value *LocalElemPtr =
3020 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3021 PhiSrc->addIncoming(LocalPtr, ThenBB);
3022 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3023 emitBranch(PreCondBB);
3024 emitBlock(ExitBB, CurFunc);
3025 } else {
// Exactly one chunk of this size: shuffle, optionally truncate back to the
// narrower integer element type, store, and advance both cursors.
3026 Value *Res = createRuntimeShuffleFunction(
3027 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3028 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3029 Res->getType()->getScalarSizeInBits())
3030 Res = Builder.CreateTrunc(Res, ElemType);
3031 Builder.CreateStore(Res, ElemPtr);
3032 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3033 ElemPtr =
3034 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3035 }
3036 Size = Size % IntSize;
3037 }
3038}
3039
// Copy a reduction list element-by-element from SrcBase to DestBase.
// Depending on the (partially elided) CopyAction, elements are either
// shuffled in from a remote lane into freshly created allocas (updating the
// destination list pointers) or copied thread-locally. By-ref elements get
// special handling: the actual data (not the descriptor pointer) is
// shuffled, then a descriptor is regenerated for the destination.
3040Error OpenMPIRBuilder::emitReductionListCopy(
3041 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3042 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3043 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3044 Type *IndexTy = Builder.getIndexTy(
3045 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3046 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3047
3048 // Iterates, element-by-element, through the source Reduce list and
3049 // make a copy.
3050 for (auto En : enumerate(ReductionInfos)) {
3051 const ReductionInfo &RI = En.value();
3052 Value *SrcElementAddr = nullptr;
3053 AllocaInst *DestAlloca = nullptr;
3054 Value *DestElementAddr = nullptr;
3055 Value *DestElementPtrAddr = nullptr;
3056 // Should we shuffle in an element from a remote lane?
3057 bool ShuffleInElement = false;
3058 // Set to true to update the pointer in the dest Reduce list to a
3059 // newly created element.
3060 bool UpdateDestListPtr = false;
3061
3062 // Step 1.1: Get the address for the src element in the Reduce list.
3063 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3064 ReductionArrayTy, SrcBase,
3065 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3066 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3067
3068 // Step 1.2: Create a temporary to store the element in the destination
3069 // Reduce list.
3070 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3071 ReductionArrayTy, DestBase,
3072 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3073 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
// NOTE(review): the two `case` labels of this switch (internal lines 3075
// and 3094, presumably CopyAction::RemoteLaneToThread and
// CopyAction::ThreadCopy) are missing from this listing — confirm against
// the full file.
3074 switch (Action) {
3076 InsertPointTy CurIP = Builder.saveIP();
3077 Builder.restoreIP(AllocaIP);
3078
3079 Type *DestAllocaType =
3080 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3081 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3082 ".omp.reduction.element");
3083 DestAlloca->setAlignment(
3084 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3085 DestElementAddr = DestAlloca;
3086 DestElementAddr =
3087 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3088 DestElementAddr->getName() + ".ascast");
3089 Builder.restoreIP(CurIP);
3090 ShuffleInElement = true;
3091 UpdateDestListPtr = true;
3092 break;
3093 }
3095 DestElementAddr =
3096 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3097 break;
3098 }
3099 }
3100
3101 // Now that all active lanes have read the element in the
3102 // Reduce list, shuffle over the value from the remote lane.
3103 if (ShuffleInElement) {
3104 Type *ShuffleType = RI.ElementType;
3105 Value *ShuffleSrcAddr = SrcElementAddr;
3106 Value *ShuffleDestAddr = DestElementAddr;
3107 AllocaInst *LocalStorage = nullptr;
3108
3109 if (IsByRefElem) {
3110 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3111 assert(RI.ByRefAllocatedType &&
3112 "Expected by-ref allocated type to be set");
3113 // For by-ref reductions, we need to copy from the remote lane the
3114 // actual value of the partial reduction computed by that remote lane;
3115 // rather than, for example, a pointer to that data or, even worse, a
3116 // pointer to the descriptor of the by-ref reduction element.
3117 ShuffleType = RI.ByRefElementType;
3118
3119 InsertPointOrErrorTy GenResult =
3120 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3121
3122 if (!GenResult)
3123 return GenResult.takeError();
3124
3125 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3126
// Scratch storage for the shuffled-in data lives at the alloca insertion
// point so it dominates all uses.
3127 {
3128 InsertPointTy OldIP = Builder.saveIP();
3129 Builder.restoreIP(AllocaIP);
3130
3131 LocalStorage = Builder.CreateAlloca(ShuffleType);
3132 Builder.restoreIP(OldIP);
3133 ShuffleDestAddr = LocalStorage;
3134 }
3135 }
3136
3137 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3138 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3139
3140 if (IsByRefElem) {
3141 // Copy descriptor from source and update base_ptr to shuffled data
3142 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3143 DestAlloca, Builder.getPtrTy(), ".ascast");
3144
3145 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3146 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3147 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3148
3149 if (!GenResult)
3150 return GenResult.takeError();
3151 }
3152 } else {
// Thread-local copy: dispatch on how the element is evaluated (scalar,
// complex pair, or aggregate memcpy).
3153 switch (RI.EvaluationKind) {
3154 case EvalKind::Scalar: {
3155 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3156 // Store the source element value to the dest element address.
3157 Builder.CreateStore(Elem, DestElementAddr);
3158 break;
3159 }
3160 case EvalKind::Complex: {
3161 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3162 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3163 Value *SrcReal = Builder.CreateLoad(
3164 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3165 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3166 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3167 Value *SrcImg = Builder.CreateLoad(
3168 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3169
3170 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3171 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3172 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3173 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3174 Builder.CreateStore(SrcReal, DestRealPtr);
3175 Builder.CreateStore(SrcImg, DestImgPtr);
3176 break;
3177 }
3178 case EvalKind::Aggregate: {
3179 Value *SizeVal = Builder.getInt64(
3180 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3181 Builder.CreateMemCpy(
3182 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3183 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3184 SizeVal, false);
3185 break;
3186 }
3187 };
3188 }
3189
3190 // Step 3.1: Modify reference in dest Reduce list as needed.
3191 // Modifying the reference in Reduce list to point to the newly
3192 // created element. The element is live in the current function
3193 // scope and that of functions it invokes (i.e., reduce_function).
3194 // RemoteReduceData[i] = (void*)&RemoteElem
3195 if (UpdateDestListPtr) {
3196 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3197 DestElementAddr, Builder.getPtrTy(),
3198 DestElementAddr->getName() + ".ascast");
3199 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3200 }
3201 }
3202
3203 return Error::success();
3204}
3205
// Build the inter-warp copy helper used by GPU reductions: it moves
// partially reduced values from lane 0 of every warp through a shared-memory
// transfer array into the threads of warp 0, element by element, in 4-byte
// (then smaller) pieces, with barriers separating the write and read phases.
// NOTE(review): this listing is missing the Function::Create line (internal
// 3215) that produces `WcFunc` — confirm against the full file.
3206Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3207 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3208 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3209 InsertPointTy SavedIP = Builder.saveIP();
3210 LLVMContext &Ctx = M.getContext();
3211 FunctionType *FuncTy = FunctionType::get(
3212 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3213 /* IsVarArg */ false);
3214 Function *WcFunc =
3216 "_omp_reduction_inter_warp_copy_func", &M);
3217 WcFunc->setAttributes(FuncAttrs);
3218 WcFunc->addParamAttr(0, Attribute::NoUndef);
3219 WcFunc->addParamAttr(1, Attribute::NoUndef);
3220 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3221 Builder.SetInsertPoint(EntryBB);
3222
3223 // ReduceList: thread local Reduce list.
3224 // At the stage of the computation when this function is called, partially
3225 // aggregated values reside in the first lane of every active warp.
3226 Argument *ReduceListArg = WcFunc->getArg(0);
3227 // NumWarps: number of warps active in the parallel region. This could
3228 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3229 Argument *NumWarpsArg = WcFunc->getArg(1);
3230
3231 // This array is used as a medium to transfer, one reduce element at a time,
3232 // the data from the first lane of every warp to lanes in the first warp
3233 // in order to perform the final step of a reduction in a parallel region
3234 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3235 // for reduced latency, as well as to have a distinct copy for concurrently
3236 // executing target regions. The array is declared with common linkage so
3237 // as to be shared across compilation units.
3238 StringRef TransferMediumName =
3239 "__openmp_nvptx_data_transfer_temporary_storage";
3240 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3241 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3242 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3243 if (!TransferMedium) {
3244 TransferMedium = new GlobalVariable(
3245 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3246 UndefValue::get(ArrayTy), TransferMediumName,
3247 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3248 /*AddressSpace=*/3);
3249 }
3250
3251 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3252 Value *GPUThreadID = getGPUThreadID();
3253 // nvptx_lane_id = nvptx_id % warpsize
3254 Value *LaneID = getNVPTXLaneID();
3255 // nvptx_warp_id = nvptx_id / warpsize
3256 Value *WarpID = getNVPTXWarpID();
3257
// Spill both arguments to allocas so later loads have stable addresses.
3258 InsertPointTy AllocaIP =
3259 InsertPointTy(Builder.GetInsertBlock(),
3260 Builder.GetInsertBlock()->getFirstInsertionPt());
3261 Type *Arg0Type = ReduceListArg->getType();
3262 Type *Arg1Type = NumWarpsArg->getType();
3263 Builder.restoreIP(AllocaIP);
3264 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3265 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3266 AllocaInst *NumWarpsAlloca =
3267 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3268 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3269 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3270 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3271 NumWarpsAlloca, Builder.getPtrTy(0),
3272 NumWarpsAlloca->getName() + ".ascast");
3273 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3274 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3275 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3276 InsertPointTy CodeGenIP =
3277 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3278 Builder.restoreIP(CodeGenIP);
3279
3280 Value *ReduceList =
3281 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3282
3283 for (auto En : enumerate(ReductionInfos)) {
3284 //
3285 // Warp master copies reduce element to transfer medium in __shared__
3286 // memory.
3287 //
3288 const ReductionInfo &RI = En.value();
3289 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3290 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3291 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
// Transfer the element in 4-, 2-, then 1-byte pieces; elements larger than
// 4 bytes iterate (NumIters) over 4-byte chunks with a counter in .cnt.addr.
3292 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3293 Type *CType = Builder.getIntNTy(TySize * 8);
3294
3295 unsigned NumIters = RealTySize / TySize;
3296 if (NumIters == 0)
3297 continue;
3298 Value *Cnt = nullptr;
3299 Value *CntAddr = nullptr;
3300 BasicBlock *PrecondBB = nullptr;
3301 BasicBlock *ExitBB = nullptr;
3302 if (NumIters > 1) {
3303 CodeGenIP = Builder.saveIP();
3304 Builder.restoreIP(AllocaIP);
3305 CntAddr =
3306 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3307
3308 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3309 CntAddr->getName() + ".ascast");
3310 Builder.restoreIP(CodeGenIP);
3311 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3312 CntAddr,
3313 /*Volatile=*/false);
3314 PrecondBB = BasicBlock::Create(Ctx, "precond");
3315 ExitBB = BasicBlock::Create(Ctx, "exit");
3316 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3317 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3318 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3319 /*Volatile=*/false);
3320 Value *Cmp = Builder.CreateICmpULT(
3321 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3322 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3323 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3324 }
3325
3326 // kmpc_barrier.
3327 InsertPointOrErrorTy BarrierIP1 =
3328 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3329 omp::Directive::OMPD_unknown,
3330 /* ForceSimpleCall */ false,
3331 /* CheckCancelFlag */ true);
3332 if (!BarrierIP1)
3333 return BarrierIP1.takeError();
3334 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3335 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3336 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3337
3338 // if (lane_id == 0)
3339 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3340 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3341 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3342
3343 // Reduce element = LocalReduceList[i]
3344 auto *RedListArrayTy =
3345 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3346 Type *IndexTy = Builder.getIndexTy(
3347 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3348 Value *ElemPtrPtr =
3349 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3350 {ConstantInt::get(IndexTy, 0),
3351 ConstantInt::get(IndexTy, En.index())});
3352 // elemptr = ((CopyType*)(elemptrptr)) + I
3353 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3354
3355 if (IsByRefElem) {
3356 InsertPointOrErrorTy GenRes =
3357 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3358
3359 if (!GenRes)
3360 return GenRes.takeError();
3361
3362 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3363 }
3364
3365 if (NumIters > 1)
3366 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3367
3368 // Get pointer to location in transfer medium.
3369 // MediumPtr = &medium[warp_id]
3370 Value *MediumPtr = Builder.CreateInBoundsGEP(
3371 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3372 // elem = *elemptr
3373 //*MediumPtr = elem
3374 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3375 // Store the source element value to the dest element address.
3376 Builder.CreateStore(Elem, MediumPtr,
3377 /*IsVolatile*/ true);
3378 Builder.CreateBr(MergeBB);
3379
3380 // else
3381 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3382 Builder.CreateBr(MergeBB);
3383
3384 // endif
3385 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3386 InsertPointOrErrorTy BarrierIP2 =
3387 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3388 omp::Directive::OMPD_unknown,
3389 /* ForceSimpleCall */ false,
3390 /* CheckCancelFlag */ true);
3391 if (!BarrierIP2)
3392 return BarrierIP2.takeError();
3393
3394 // Warp 0 copies reduce element from transfer medium
3395 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3396 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3397 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3398
3399 Value *NumWarpsVal =
3400 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3401 // Up to 32 threads in warp 0 are active.
3402 Value *IsActiveThread =
3403 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3404 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3405
3406 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3407
3408 // SecMediumPtr = &medium[tid]
3409 // SrcMediumVal = *SrcMediumPtr
3410 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3411 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3412 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3413 Value *TargetElemPtrPtr =
3414 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3415 {ConstantInt::get(IndexTy, 0),
3416 ConstantInt::get(IndexTy, En.index())});
3417 Value *TargetElemPtrVal =
3418 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3419 Value *TargetElemPtr = TargetElemPtrVal;
3420
3421 if (IsByRefElem) {
3422 InsertPointOrErrorTy GenRes =
3423 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3424
3425 if (!GenRes)
3426 return GenRes.takeError();
3427
3428 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3429 }
3430
3431 if (NumIters > 1)
3432 TargetElemPtr =
3433 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3434
3435 // *TargetElemPtr = SrcMediumVal;
3436 Value *SrcMediumValue =
3437 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3438 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3439 Builder.CreateBr(W0MergeBB);
3440
3441 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3442 Builder.CreateBr(W0MergeBB);
3443
3444 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3445
3446 if (NumIters > 1) {
3447 Cnt = Builder.CreateNSWAdd(
3448 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3449 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3450
3451 auto *CurFn = Builder.GetInsertBlock()->getParent();
3452 emitBranch(PrecondBB);
3453 emitBlock(ExitBB, CurFn);
3454 }
3455 RealTySize %= TySize;
3456 }
3457 }
3458
3459 Builder.CreateRetVoid();
3460 Builder.restoreIP(SavedIP);
3461
3462 return WcFunc;
3463}
3464
3465Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3466 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3467 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3468 LLVMContext &Ctx = M.getContext();
3469 FunctionType *FuncTy =
3470 FunctionType::get(Builder.getVoidTy(),
3471 {Builder.getPtrTy(), Builder.getInt16Ty(),
3472 Builder.getInt16Ty(), Builder.getInt16Ty()},
3473 /* IsVarArg */ false);
3474 Function *SarFunc =
3476 "_omp_reduction_shuffle_and_reduce_func", &M);
3477 SarFunc->setAttributes(FuncAttrs);
3478 SarFunc->addParamAttr(0, Attribute::NoUndef);
3479 SarFunc->addParamAttr(1, Attribute::NoUndef);
3480 SarFunc->addParamAttr(2, Attribute::NoUndef);
3481 SarFunc->addParamAttr(3, Attribute::NoUndef);
3482 SarFunc->addParamAttr(1, Attribute::SExt);
3483 SarFunc->addParamAttr(2, Attribute::SExt);
3484 SarFunc->addParamAttr(3, Attribute::SExt);
3485 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3486 Builder.SetInsertPoint(EntryBB);
3487
3488 // Thread local Reduce list used to host the values of data to be reduced.
3489 Argument *ReduceListArg = SarFunc->getArg(0);
3490 // Current lane id; could be logical.
3491 Argument *LaneIDArg = SarFunc->getArg(1);
3492 // Offset of the remote source lane relative to the current lane.
3493 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3494 // Algorithm version. This is expected to be known at compile time.
3495 Argument *AlgoVerArg = SarFunc->getArg(3);
3496
3497 Type *ReduceListArgType = ReduceListArg->getType();
3498 Type *LaneIDArgType = LaneIDArg->getType();
3499 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3500 Value *ReduceListAlloca = Builder.CreateAlloca(
3501 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3502 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3503 LaneIDArg->getName() + ".addr");
3504 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3505 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3506 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3507 AlgoVerArg->getName() + ".addr");
3508 ArrayType *RedListArrayTy =
3509 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3510
3511 // Create a local thread-private variable to host the Reduce list
3512 // from a remote lane.
3513 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3514 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3515
3516 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3517 ReduceListAlloca, ReduceListArgType,
3518 ReduceListAlloca->getName() + ".ascast");
3519 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3520 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3521 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3522 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3523 RemoteLaneOffsetAlloca->getName() + ".ascast");
3524 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3525 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3526 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3527 RemoteReductionListAlloca, Builder.getPtrTy(),
3528 RemoteReductionListAlloca->getName() + ".ascast");
3529
3530 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3531 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3532 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3533 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3534
3535 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3536 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3537 Value *RemoteLaneOffset =
3538 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3539 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3540
3541 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3542
3543 // This loop iterates through the list of reduce elements and copies,
3544 // element by element, from a remote lane in the warp to RemoteReduceList,
3545 // hosted on the thread's stack.
3546 Error EmitRedLsCpRes = emitReductionListCopy(
3547 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3548 ReduceList, RemoteListAddrCast, IsByRef,
3549 {RemoteLaneOffset, nullptr, nullptr});
3550
3551 if (EmitRedLsCpRes)
3552 return EmitRedLsCpRes;
3553
3554 // The actions to be performed on the Remote Reduce list is dependent
3555 // on the algorithm version.
3556 //
3557 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3558 // LaneId % 2 == 0 && Offset > 0):
3559 // do the reduction value aggregation
3560 //
3561 // The thread local variable Reduce list is mutated in place to host the
3562 // reduced data, which is the aggregated value produced from local and
3563 // remote lanes.
3564 //
3565 // Note that AlgoVer is expected to be a constant integer known at compile
3566 // time.
3567 // When AlgoVer==0, the first conjunction evaluates to true, making
3568 // the entire predicate true during compile time.
3569 // When AlgoVer==1, the second conjunction has only the second part to be
3570 // evaluated during runtime. Other conjunctions evaluates to false
3571 // during compile time.
3572 // When AlgoVer==2, the third conjunction has only the second part to be
3573 // evaluated during runtime. Other conjunctions evaluates to false
3574 // during compile time.
3575 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3576 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3577 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3578 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3579 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3580 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3581 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3582 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3583 Value *RemoteOffsetComp =
3584 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3585 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3586 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3587 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3588
3589 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3590 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3591 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3592
3593 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3594 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3595 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3596 ReduceList, Builder.getPtrTy());
3597 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3598 RemoteListAddrCast, Builder.getPtrTy());
3599 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3600 ->addFnAttr(Attribute::NoUnwind);
3601 Builder.CreateBr(MergeBB);
3602
3603 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3604 Builder.CreateBr(MergeBB);
3605
3606 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3607
3608 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3609 // Reduce list.
3610 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3611 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3612 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3613
3614 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3615 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3616 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3617 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3618
3619 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3620
3621 EmitRedLsCpRes = emitReductionListCopy(
3622 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3623 RemoteListAddrCast, ReduceList, IsByRef);
3624
3625 if (EmitRedLsCpRes)
3626 return EmitRedLsCpRes;
3627
3628 Builder.CreateBr(CpyMergeBB);
3629
3630 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3631 Builder.CreateBr(CpyMergeBB);
3632
3633 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3634
3635 Builder.CreateRetVoid();
3636
3637 return SarFunc;
3638}
3639
3641OpenMPIRBuilder::generateReductionDescriptor(
3642 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3643 Type *DescriptorType,
3644 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3645 DataPtrPtrGen) {
3646
3647 // Copy the source descriptor to preserve all metadata (rank, extents,
3648 // strides, etc.)
3649 Value *DescriptorSize =
3650 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3651 Builder.CreateMemCpy(
3652 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3653 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3654 DescriptorSize);
3655
3656 // Update the base pointer field to point to the local shuffled data
3657 Value *DataPtrField;
3658 InsertPointOrErrorTy GenResult =
3659 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3660
3661 if (!GenResult)
3662 return GenResult.takeError();
3663
3664 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3665 DataPtr, Builder.getPtrTy(), ".ascast"),
3666 DataPtrField);
3667
3668 return Builder.saveIP();
3669}
3670
3671Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3672 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3673 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3674 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3675 LLVMContext &Ctx = M.getContext();
3676 FunctionType *FuncTy = FunctionType::get(
3677 Builder.getVoidTy(),
3678 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3679 /* IsVarArg */ false);
3680 Function *LtGCFunc =
3682 "_omp_reduction_list_to_global_copy_func", &M);
3683 LtGCFunc->setAttributes(FuncAttrs);
3684 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3685 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3686 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3687
3688 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3689 Builder.SetInsertPoint(EntryBlock);
3690
3691 // Buffer: global reduction buffer.
3692 Argument *BufferArg = LtGCFunc->getArg(0);
3693 // Idx: index of the buffer.
3694 Argument *IdxArg = LtGCFunc->getArg(1);
3695 // ReduceList: thread local Reduce list.
3696 Argument *ReduceListArg = LtGCFunc->getArg(2);
3697
3698 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3699 BufferArg->getName() + ".addr");
3700 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3701 IdxArg->getName() + ".addr");
3702 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3703 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3704 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3705 BufferArgAlloca, Builder.getPtrTy(),
3706 BufferArgAlloca->getName() + ".ascast");
3707 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3708 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3709 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3710 ReduceListArgAlloca, Builder.getPtrTy(),
3711 ReduceListArgAlloca->getName() + ".ascast");
3712
3713 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3714 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3715 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3716
3717 Value *LocalReduceList =
3718 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3719 Value *BufferArgVal =
3720 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3721 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3722 Type *IndexTy = Builder.getIndexTy(
3723 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3724 for (auto En : enumerate(ReductionInfos)) {
3725 const ReductionInfo &RI = En.value();
3726 auto *RedListArrayTy =
3727 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3728 // Reduce element = LocalReduceList[i]
3729 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3730 RedListArrayTy, LocalReduceList,
3731 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3732 // elemptr = ((CopyType*)(elemptrptr)) + I
3733 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3734
3735 // Global = Buffer.VD[Idx];
3736 Value *BufferVD =
3737 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3738 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3739 ReductionsBufferTy, BufferVD, 0, En.index());
3740
3741 switch (RI.EvaluationKind) {
3742 case EvalKind::Scalar: {
3743 Value *TargetElement;
3744
3745 if (IsByRef.empty() || !IsByRef[En.index()]) {
3746 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3747 } else {
3748 InsertPointOrErrorTy GenResult =
3749 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3750
3751 if (!GenResult)
3752 return GenResult.takeError();
3753
3754 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3755 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3756 }
3757
3758 Builder.CreateStore(TargetElement, GlobVal);
3759 break;
3760 }
3761 case EvalKind::Complex: {
3762 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3763 RI.ElementType, ElemPtr, 0, 0, ".realp");
3764 Value *SrcReal = Builder.CreateLoad(
3765 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3766 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3767 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3768 Value *SrcImg = Builder.CreateLoad(
3769 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3770
3771 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3772 RI.ElementType, GlobVal, 0, 0, ".realp");
3773 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3774 RI.ElementType, GlobVal, 0, 1, ".imagp");
3775 Builder.CreateStore(SrcReal, DestRealPtr);
3776 Builder.CreateStore(SrcImg, DestImgPtr);
3777 break;
3778 }
3779 case EvalKind::Aggregate: {
3780 Value *SizeVal =
3781 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3782 Builder.CreateMemCpy(
3783 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3784 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3785 break;
3786 }
3787 }
3788 }
3789
3790 Builder.CreateRetVoid();
3791 Builder.restoreIP(OldIP);
3792 return LtGCFunc;
3793}
3794
3795Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3796 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3797 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3798 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3799 LLVMContext &Ctx = M.getContext();
3800 FunctionType *FuncTy = FunctionType::get(
3801 Builder.getVoidTy(),
3802 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3803 /* IsVarArg */ false);
3804 Function *LtGRFunc =
3806 "_omp_reduction_list_to_global_reduce_func", &M);
3807 LtGRFunc->setAttributes(FuncAttrs);
3808 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3809 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3810 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3811
3812 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3813 Builder.SetInsertPoint(EntryBlock);
3814
3815 // Buffer: global reduction buffer.
3816 Argument *BufferArg = LtGRFunc->getArg(0);
3817 // Idx: index of the buffer.
3818 Argument *IdxArg = LtGRFunc->getArg(1);
3819 // ReduceList: thread local Reduce list.
3820 Argument *ReduceListArg = LtGRFunc->getArg(2);
3821
3822 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3823 BufferArg->getName() + ".addr");
3824 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3825 IdxArg->getName() + ".addr");
3826 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3827 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3828 auto *RedListArrayTy =
3829 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3830
3831 // 1. Build a list of reduction variables.
3832 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3833 Value *LocalReduceList =
3834 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3835
3836 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3837
3838 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3839 BufferArgAlloca, Builder.getPtrTy(),
3840 BufferArgAlloca->getName() + ".ascast");
3841 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3842 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3843 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3844 ReduceListArgAlloca, Builder.getPtrTy(),
3845 ReduceListArgAlloca->getName() + ".ascast");
3846 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3847 LocalReduceList, Builder.getPtrTy(),
3848 LocalReduceList->getName() + ".ascast");
3849
3850 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3851 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3852 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3853
3854 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3855 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3856 Type *IndexTy = Builder.getIndexTy(
3857 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3858 for (auto En : enumerate(ReductionInfos)) {
3859 const ReductionInfo &RI = En.value();
3860 Value *ByRefAlloc;
3861
3862 if (!IsByRef.empty() && IsByRef[En.index()]) {
3863 InsertPointTy OldIP = Builder.saveIP();
3864 Builder.restoreIP(AllocaIP);
3865
3866 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3867 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3868 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3869
3870 Builder.restoreIP(OldIP);
3871 }
3872
3873 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3874 RedListArrayTy, LocalReduceListAddrCast,
3875 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3876 Value *BufferVD =
3877 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3878 // Global = Buffer.VD[Idx];
3879 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3880 ReductionsBufferTy, BufferVD, 0, En.index());
3881
3882 if (!IsByRef.empty() && IsByRef[En.index()]) {
3883 // Get source descriptor from the reduce list argument
3884 Value *ReduceList =
3885 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3886 Value *SrcElementPtrPtr =
3887 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3888 {ConstantInt::get(IndexTy, 0),
3889 ConstantInt::get(IndexTy, En.index())});
3890 Value *SrcDescriptorAddr =
3891 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3892
3893 // Copy descriptor from source and update base_ptr to global buffer data
3894 InsertPointOrErrorTy GenResult =
3895 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3896 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3897
3898 if (!GenResult)
3899 return GenResult.takeError();
3900
3901 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3902 } else {
3903 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3904 }
3905 }
3906
3907 // Call reduce_function(GlobalReduceList, ReduceList)
3908 Value *ReduceList =
3909 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3910 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3911 ->addFnAttr(Attribute::NoUnwind);
3912 Builder.CreateRetVoid();
3913 Builder.restoreIP(OldIP);
3914 return LtGRFunc;
3915}
3916
3917Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3918 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3919 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3920 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3921 LLVMContext &Ctx = M.getContext();
3922 FunctionType *FuncTy = FunctionType::get(
3923 Builder.getVoidTy(),
3924 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3925 /* IsVarArg */ false);
3926 Function *GtLCFunc =
3928 "_omp_reduction_global_to_list_copy_func", &M);
3929 GtLCFunc->setAttributes(FuncAttrs);
3930 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3931 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3932 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3933
3934 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3935 Builder.SetInsertPoint(EntryBlock);
3936
3937 // Buffer: global reduction buffer.
3938 Argument *BufferArg = GtLCFunc->getArg(0);
3939 // Idx: index of the buffer.
3940 Argument *IdxArg = GtLCFunc->getArg(1);
3941 // ReduceList: thread local Reduce list.
3942 Argument *ReduceListArg = GtLCFunc->getArg(2);
3943
3944 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3945 BufferArg->getName() + ".addr");
3946 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3947 IdxArg->getName() + ".addr");
3948 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3949 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3950 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3951 BufferArgAlloca, Builder.getPtrTy(),
3952 BufferArgAlloca->getName() + ".ascast");
3953 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3954 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3955 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3956 ReduceListArgAlloca, Builder.getPtrTy(),
3957 ReduceListArgAlloca->getName() + ".ascast");
3958 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3959 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3960 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3961
3962 Value *LocalReduceList =
3963 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3964 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3965 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3966 Type *IndexTy = Builder.getIndexTy(
3967 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3968 for (auto En : enumerate(ReductionInfos)) {
3969 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3970 auto *RedListArrayTy =
3971 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3972 // Reduce element = LocalReduceList[i]
3973 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3974 RedListArrayTy, LocalReduceList,
3975 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3976 // elemptr = ((CopyType*)(elemptrptr)) + I
3977 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3978 // Global = Buffer.VD[Idx];
3979 Value *BufferVD =
3980 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3981 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3982 ReductionsBufferTy, BufferVD, 0, En.index());
3983
3984 switch (RI.EvaluationKind) {
3985 case EvalKind::Scalar: {
3986 Type *ElemType = RI.ElementType;
3987
3988 if (!IsByRef.empty() && IsByRef[En.index()]) {
3989 ElemType = RI.ByRefElementType;
3990 InsertPointOrErrorTy GenResult =
3991 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3992
3993 if (!GenResult)
3994 return GenResult.takeError();
3995
3996 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3997 }
3998
3999 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
4000 Builder.CreateStore(TargetElement, ElemPtr);
4001 break;
4002 }
4003 case EvalKind::Complex: {
4004 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
4005 RI.ElementType, GlobValPtr, 0, 0, ".realp");
4006 Value *SrcReal = Builder.CreateLoad(
4007 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
4008 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
4009 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
4010 Value *SrcImg = Builder.CreateLoad(
4011 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4012
4013 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4014 RI.ElementType, ElemPtr, 0, 0, ".realp");
4015 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4016 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4017 Builder.CreateStore(SrcReal, DestRealPtr);
4018 Builder.CreateStore(SrcImg, DestImgPtr);
4019 break;
4020 }
4021 case EvalKind::Aggregate: {
4022 Value *SizeVal =
4023 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4024 Builder.CreateMemCpy(
4025 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4026 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4027 SizeVal, false);
4028 break;
4029 }
4030 }
4031 }
4032
4033 Builder.CreateRetVoid();
4034 Builder.restoreIP(OldIP);
4035 return GtLCFunc;
4036}
4037
4038Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4039 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4040 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4041 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4042 LLVMContext &Ctx = M.getContext();
4043 auto *FuncTy = FunctionType::get(
4044 Builder.getVoidTy(),
4045 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4046 /* IsVarArg */ false);
4047 Function *GtLRFunc =
4049 "_omp_reduction_global_to_list_reduce_func", &M);
4050 GtLRFunc->setAttributes(FuncAttrs);
4051 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4052 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4053 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4054
4055 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4056 Builder.SetInsertPoint(EntryBlock);
4057
4058 // Buffer: global reduction buffer.
4059 Argument *BufferArg = GtLRFunc->getArg(0);
4060 // Idx: index of the buffer.
4061 Argument *IdxArg = GtLRFunc->getArg(1);
4062 // ReduceList: thread local Reduce list.
4063 Argument *ReduceListArg = GtLRFunc->getArg(2);
4064
4065 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4066 BufferArg->getName() + ".addr");
4067 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4068 IdxArg->getName() + ".addr");
4069 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4070 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4071 ArrayType *RedListArrayTy =
4072 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4073
4074 // 1. Build a list of reduction variables.
4075 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4076 Value *LocalReduceList =
4077 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4078
4079 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4080
4081 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4082 BufferArgAlloca, Builder.getPtrTy(),
4083 BufferArgAlloca->getName() + ".ascast");
4084 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4085 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4086 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4087 ReduceListArgAlloca, Builder.getPtrTy(),
4088 ReduceListArgAlloca->getName() + ".ascast");
4089 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4090 LocalReduceList, Builder.getPtrTy(),
4091 LocalReduceList->getName() + ".ascast");
4092
4093 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4094 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4095 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4096
4097 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4098 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4099 Type *IndexTy = Builder.getIndexTy(
4100 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4101 for (auto En : enumerate(ReductionInfos)) {
4102 const ReductionInfo &RI = En.value();
4103 Value *ByRefAlloc;
4104
4105 if (!IsByRef.empty() && IsByRef[En.index()]) {
4106 InsertPointTy OldIP = Builder.saveIP();
4107 Builder.restoreIP(AllocaIP);
4108
4109 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4110 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4111 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4112
4113 Builder.restoreIP(OldIP);
4114 }
4115
4116 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4117 RedListArrayTy, ReductionList,
4118 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4119 // Global = Buffer.VD[Idx];
4120 Value *BufferVD =
4121 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4122 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4123 ReductionsBufferTy, BufferVD, 0, En.index());
4124
4125 if (!IsByRef.empty() && IsByRef[En.index()]) {
4126 // Get source descriptor from the reduce list
4127 Value *ReduceListVal =
4128 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4129 Value *SrcElementPtrPtr =
4130 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4131 {ConstantInt::get(IndexTy, 0),
4132 ConstantInt::get(IndexTy, En.index())});
4133 Value *SrcDescriptorAddr =
4134 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4135
4136 // Copy descriptor from source and update base_ptr to global buffer data
4137 InsertPointOrErrorTy GenResult =
4138 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4139 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4140 if (!GenResult)
4141 return GenResult.takeError();
4142
4143 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4144 } else {
4145 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4146 }
4147 }
4148
4149 // Call reduce_function(ReduceList, GlobalReduceList)
4150 Value *ReduceList =
4151 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4152 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4153 ->addFnAttr(Attribute::NoUnwind);
4154 Builder.CreateRetVoid();
4155 Builder.restoreIP(OldIP);
4156 return GtLRFunc;
4157}
4158
4159std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4160 std::string Suffix =
4161 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4162 return (Name + Suffix).str();
4163}
4164
4165Expected<Function *> OpenMPIRBuilder::createReductionFunction(
4166 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
4168 AttributeList FuncAttrs) {
4169 auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
4170 {Builder.getPtrTy(), Builder.getPtrTy()},
4171 /* IsVarArg */ false);
4172 std::string Name = getReductionFuncName(ReducerName);
4173 Function *ReductionFunc =
4175 ReductionFunc->setAttributes(FuncAttrs);
4176 ReductionFunc->addParamAttr(0, Attribute::NoUndef);
4177 ReductionFunc->addParamAttr(1, Attribute::NoUndef);
4178 BasicBlock *EntryBB =
4179 BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
4180 Builder.SetInsertPoint(EntryBB);
4181
4182 // Need to alloca memory here and deal with the pointers before getting
4183 // LHS/RHS pointers out
4184 Value *LHSArrayPtr = nullptr;
4185 Value *RHSArrayPtr = nullptr;
4186 Argument *Arg0 = ReductionFunc->getArg(0);
4187 Argument *Arg1 = ReductionFunc->getArg(1);
4188 Type *Arg0Type = Arg0->getType();
4189 Type *Arg1Type = Arg1->getType();
4190
4191 Value *LHSAlloca =
4192 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4193 Value *RHSAlloca =
4194 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4195 Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4196 LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
4197 Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4198 RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
4199 Builder.CreateStore(Arg0, LHSAddrCast);
4200 Builder.CreateStore(Arg1, RHSAddrCast);
4201 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4202 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4203
4204 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4205 Type *IndexTy = Builder.getIndexTy(
4206 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4207 SmallVector<Value *> LHSPtrs, RHSPtrs;
4208 for (auto En : enumerate(ReductionInfos)) {
4209 const ReductionInfo &RI = En.value();
4210 Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
4211 RedArrayTy, RHSArrayPtr,
4212 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4213 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4214 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4215 RHSI8Ptr, RI.PrivateVariable->getType(),
4216 RHSI8Ptr->getName() + ".ascast");
4217
4218 Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
4219 RedArrayTy, LHSArrayPtr,
4220 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4221 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4222 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4223 LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");
4224
4226 LHSPtrs.emplace_back(LHSPtr);
4227 RHSPtrs.emplace_back(RHSPtr);
4228 } else {
4229 Value *LHS = LHSPtr;
4230 Value *RHS = RHSPtr;
4231
4232 if (!IsByRef.empty() && !IsByRef[En.index()]) {
4233 LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4234 RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4235 }
4236
4237 Value *Reduced;
4238 InsertPointOrErrorTy AfterIP =
4239 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4240 if (!AfterIP)
4241 return AfterIP.takeError();
4242 if (!Builder.GetInsertBlock())
4243 return ReductionFunc;
4244
4245 Builder.restoreIP(*AfterIP);
4246
4247 if (!IsByRef.empty() && !IsByRef[En.index()])
4248 Builder.CreateStore(Reduced, LHSPtr);
4249 }
4250 }
4251
4253 for (auto En : enumerate(ReductionInfos)) {
4254 unsigned Index = En.index();
4255 const ReductionInfo &RI = En.value();
4256 Value *LHSFixupPtr, *RHSFixupPtr;
4257 Builder.restoreIP(RI.ReductionGenClang(
4258 Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));
4259
4260 // Fix the CallBack code genereated to use the correct Values for the LHS
4261 // and RHS
4262 LHSFixupPtr->replaceUsesWithIf(
4263 LHSPtrs[Index], [ReductionFunc](const Use &U) {
4264 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4265 ReductionFunc;
4266 });
4267 RHSFixupPtr->replaceUsesWithIf(
4268 RHSPtrs[Index], [ReductionFunc](const Use &U) {
4269 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4270 ReductionFunc;
4271 });
4272 }
4273
4274 Builder.CreateRetVoid();
4275 // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
4276 // to the entry block (this is dones for higher opt levels by later passes in
4277 // the pipeline). This has caused issues because non-entry `alloca`s force the
4278 // function to use dynamic stack allocations and we might run out of scratch
4279 // memory.
4280 hoistNonEntryAllocasToEntryBlock(ReductionFunc);
4281
4282 return ReductionFunc;
4283}
4284
4285static void
4287 bool IsGPU) {
4288 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4289 (void)RI;
4290 assert(RI.Variable && "expected non-null variable");
4291 assert(RI.PrivateVariable && "expected non-null private variable");
4292 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4293 "expected non-null reduction generator callback");
4294 if (!IsGPU) {
4295 assert(
4296 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4297 "expected variables and their private equivalents to have the same "
4298 "type");
4299 }
4300 assert(RI.Variable->getType()->isPointerTy() &&
4301 "expected variables to be pointers");
4302 }
4303}
4304
4306 const LocationDescription &Loc, InsertPointTy AllocaIP,
4307 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
4308 ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
4309 ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
4310 unsigned ReductionBufNum, Value *SrcLocInfo) {
4311 if (!updateToLocation(Loc))
4312 return InsertPointTy();
4313 Builder.restoreIP(CodeGenIP);
4314 checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
4315 LLVMContext &Ctx = M.getContext();
4316
4317 // Source location for the ident struct
4318 if (!SrcLocInfo) {
4319 uint32_t SrcLocStrSize;
4320 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4321 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4322 }
4323
4324 if (ReductionInfos.size() == 0)
4325 return Builder.saveIP();
4326
4327 BasicBlock *ContinuationBlock = nullptr;
4329 // Copied code from createReductions
4330 BasicBlock *InsertBlock = Loc.IP.getBlock();
4331 ContinuationBlock =
4332 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4333 InsertBlock->getTerminator()->eraseFromParent();
4334 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4335 }
4336
4337 Function *CurFunc = Builder.GetInsertBlock()->getParent();
4338 AttributeList FuncAttrs;
4339 AttrBuilder AttrBldr(Ctx);
4340 for (auto Attr : CurFunc->getAttributes().getFnAttrs())
4341 AttrBldr.addAttribute(Attr);
4342 AttrBldr.removeAttribute(Attribute::OptimizeNone);
4343 FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);
4344
4345 CodeGenIP = Builder.saveIP();
4346 Expected<Function *> ReductionResult = createReductionFunction(
4347 Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
4348 ReductionGenCBKind, FuncAttrs);
4349 if (!ReductionResult)
4350 return ReductionResult.takeError();
4351 Function *ReductionFunc = *ReductionResult;
4352 Builder.restoreIP(CodeGenIP);
4353
4354 // Set the grid value in the config needed for lowering later on
4355 if (GridValue.has_value())
4356 Config.setGridValue(GridValue.value());
4357 else
4358 Config.setGridValue(getGridValue(T, ReductionFunc));
4359
4360 // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
4361 // RedList, shuffle_reduce_func, interwarp_copy_func);
4362 // or
4363 // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
4364 Value *Res;
4365
4366 // 1. Build a list of reduction variables.
4367 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4368 auto Size = ReductionInfos.size();
4369 Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
4370 Type *FuncPtrTy =
4371 Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
4372 Type *RedArrayTy = ArrayType::get(PtrTy, Size);
4373 CodeGenIP = Builder.saveIP();
4374 Builder.restoreIP(AllocaIP);
4375 Value *ReductionListAlloca =
4376 Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
4377 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4378 ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
4379 Builder.restoreIP(CodeGenIP);
4380 Type *IndexTy = Builder.getIndexTy(
4381 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4382 for (auto En : enumerate(ReductionInfos)) {
4383 const ReductionInfo &RI = En.value();
4384 Value *ElemPtr = Builder.CreateInBoundsGEP(
4385 RedArrayTy, ReductionList,
4386 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4387
4388 Value *PrivateVar = RI.PrivateVariable;
4389 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
4390 if (IsByRefElem)
4391 PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);
4392
4393 Value *CastElem =
4394 Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
4395 Builder.CreateStore(CastElem, ElemPtr);
4396 }
4397 CodeGenIP = Builder.saveIP();
4398 Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
4399 ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);
4400
4401 if (!SarFunc)
4402 return SarFunc.takeError();
4403
4404 Expected<Function *> CopyResult =
4405 emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
4406 if (!CopyResult)
4407 return CopyResult.takeError();
4408 Function *WcFunc = *CopyResult;
4409 Builder.restoreIP(CodeGenIP);
4410
4411 Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);
4412
4413 // NOTE: ReductionDataSize is passed as the reduce_data_size
4414 // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
4415 // the runtime implementations do not currently use it. The teams
4416 // runtime reads ReductionDataSize from KernelEnvironmentTy instead
4417 // (set separately via TargetKernelDefaultAttrs). It is computed
4418 // here conservatively as max(element sizes) * N rather than the
4419 // exact sum, which over-calculates the size for mixed reduction
4420 // types but is harmless given the argument is unused.
4421 // TODO: Consider dropping this computation if the runtime API is
4422 // ever revised to remove the unused parameter.
4423 unsigned MaxDataSize = 0;
4424 SmallVector<Type *> ReductionTypeArgs;
4425 for (auto En : enumerate(ReductionInfos)) {
4426 // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
4427 // the actual data size stored in the global reduction buffer, consistent
4428 // with the ReductionsBufferTy struct used for GEP offsets below.
4429 Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
4430 ? En.value().ByRefElementType
4431 : En.value().ElementType;
4432 auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
4433 if (Size > MaxDataSize)
4434 MaxDataSize = Size;
4435 ReductionTypeArgs.emplace_back(RedTypeArg);
4436 }
4437 Value *ReductionDataSize =
4438 Builder.getInt64(MaxDataSize * ReductionInfos.size());
4439 if (!IsTeamsReduction) {
4440 Value *SarFuncCast =
4441 Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
4442 Value *WcFuncCast =
4443 Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
4444 Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
4445 WcFuncCast};
4447 RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
4448 Res = createRuntimeFunctionCall(Pv2Ptr, Args);
4449 } else {
4450 CodeGenIP = Builder.saveIP();
4451 StructType *ReductionsBufferTy = StructType::create(
4452 Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
4453 Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
4454 RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);
4455
4456 Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
4457 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4458 if (!LtGCFunc)
4459 return LtGCFunc.takeError();
4460
4461 Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
4462 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4463 if (!LtGRFunc)
4464 return LtGRFunc.takeError();
4465
4466 Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
4467 ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
4468 if (!GtLCFunc)
4469 return GtLCFunc.takeError();
4470
4471 Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
4472 ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
4473 if (!GtLRFunc)
4474 return GtLRFunc.takeError();
4475
4476 Builder.restoreIP(CodeGenIP);
4477
4478 Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
4479 RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");
4480
4481 Value *Args3[] = {SrcLocInfo,
4482 KernelTeamsReductionPtr,
4483 Builder.getInt32(ReductionBufNum),
4484 ReductionDataSize,
4485 RL,
4486 *SarFunc,
4487 WcFunc,
4488 *LtGCFunc,
4489 *LtGRFunc,
4490 *GtLCFunc,
4491 *GtLRFunc};
4492
4493 Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
4494 RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
4495 Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
4496 }
4497
4498 // 5. Build if (res == 1)
4499 BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
4500 BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
4501 Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
4502 Builder.CreateCondBr(Cond, ThenBB, ExitBB);
4503
4504 // 6. Build then branch: where we have reduced values in the master
4505 // thread in each team.
4506 // __kmpc_end_reduce{_nowait}(<gtid>);
4507 // break;
4508 emitBlock(ThenBB, CurFunc);
4509
4510 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4511 for (auto En : enumerate(ReductionInfos)) {
4512 const ReductionInfo &RI = En.value();
4514 Value *RedValue = RI.Variable;
4515 Value *RHS =
4516 Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);
4517
4519 Value *LHSPtr, *RHSPtr;
4520 Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
4521 &LHSPtr, &RHSPtr, CurFunc));
4522
4523 // Fix the CallBack code genereated to use the correct Values for the LHS
4524 // and RHS
4525 LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
4526 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4527 ReductionFunc;
4528 });
4529 RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
4530 return cast<Instruction>(U.getUser())->getParent()->getParent() ==
4531 ReductionFunc;
4532 });
4533 } else {
4534 if (IsByRef.empty() || !IsByRef[En.index()]) {
4535 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4536 "red.value." + Twine(En.index()));
4537 }
4538 Value *PrivateRedValue = Builder.CreateLoad(
4539 ValueType, RHS, "red.private.value" + Twine(En.index()));
4540 Value *Reduced;
4541 InsertPointOrErrorTy AfterIP =
4542 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4543 if (!AfterIP)
4544 return AfterIP.takeError();
4545 Builder.restoreIP(*AfterIP);
4546
4547 if (!IsByRef.empty() && !IsByRef[En.index()])
4548 Builder.CreateStore(Reduced, RI.Variable);
4549 }
4550 }
4551 emitBlock(ExitBB, CurFunc);
4552 if (ContinuationBlock) {
4553 Builder.CreateBr(ContinuationBlock);
4554 Builder.SetInsertPoint(ContinuationBlock);
4555 }
4556 Config.setEmitLLVMUsed();
4557
4558 return Builder.saveIP();
4559}
4560
4562 Type *VoidTy = Type::getVoidTy(M.getContext());
4563 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4564 auto *FuncTy =
4565 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4567 ".omp.reduction.func", &M);
4568}
4569
4571 Function *ReductionFunc,
4573 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4574 Module *Module = ReductionFunc->getParent();
4575 BasicBlock *ReductionFuncBlock =
4576 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4577 Builder.SetInsertPoint(ReductionFuncBlock);
4578 Value *LHSArrayPtr = nullptr;
4579 Value *RHSArrayPtr = nullptr;
4580 if (IsGPU) {
4581 // Need to alloca memory here and deal with the pointers before getting
4582 // LHS/RHS pointers out
4583 //
4584 Argument *Arg0 = ReductionFunc->getArg(0);
4585 Argument *Arg1 = ReductionFunc->getArg(1);
4586 Type *Arg0Type = Arg0->getType();
4587 Type *Arg1Type = Arg1->getType();
4588
4589 Value *LHSAlloca =
4590 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4591 Value *RHSAlloca =
4592 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4593 Value *LHSAddrCast =
4594 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4595 Value *RHSAddrCast =
4596 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4597 Builder.CreateStore(Arg0, LHSAddrCast);
4598 Builder.CreateStore(Arg1, RHSAddrCast);
4599 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4600 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4601 } else {
4602 LHSArrayPtr = ReductionFunc->getArg(0);
4603 RHSArrayPtr = ReductionFunc->getArg(1);
4604 }
4605
4606 unsigned NumReductions = ReductionInfos.size();
4607 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4608
4609 for (auto En : enumerate(ReductionInfos)) {
4610 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4611 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4612 RedArrayTy, LHSArrayPtr, 0, En.index());
4613 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4614 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4615 LHSI8Ptr, RI.Variable->getType());
4616 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4617 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4618 RedArrayTy, RHSArrayPtr, 0, En.index());
4619 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4620 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4621 RHSI8Ptr, RI.PrivateVariable->getType());
4622 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4623 Value *Reduced;
4625 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4626 if (!AfterIP)
4627 return AfterIP.takeError();
4628
4629 Builder.restoreIP(*AfterIP);
4630 // TODO: Consider flagging an error.
4631 if (!Builder.GetInsertBlock())
4632 return Error::success();
4633
4634 // store is inside of the reduction region when using by-ref
4635 if (!IsByRef[En.index()])
4636 Builder.CreateStore(Reduced, LHSPtr);
4637 }
4638 Builder.CreateRetVoid();
4639 return Error::success();
4640}
4641
4643 const LocationDescription &Loc, InsertPointTy AllocaIP,
4644 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
4645 bool IsNoWait, bool IsTeamsReduction) {
4646 assert(ReductionInfos.size() == IsByRef.size());
4647 if (Config.isGPU())
4648 return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
4649 IsByRef, IsNoWait, IsTeamsReduction);
4650
4651 checkReductionInfos(ReductionInfos, /*IsGPU*/ false);
4652
4653 if (!updateToLocation(Loc))
4654 return InsertPointTy();
4655
4656 if (ReductionInfos.size() == 0)
4657 return Builder.saveIP();
4658
4659 BasicBlock *InsertBlock = Loc.IP.getBlock();
4660 BasicBlock *ContinuationBlock =
4661 InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
4662 InsertBlock->getTerminator()->eraseFromParent();
4663
4664 // Create and populate array of type-erased pointers to private reduction
4665 // values.
4666 unsigned NumReductions = ReductionInfos.size();
4667 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4668 Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
4669 Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");
4670
4671 Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
4672
4673 for (auto En : enumerate(ReductionInfos)) {
4674 unsigned Index = En.index();
4675 const ReductionInfo &RI = En.value();
4676 Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
4677 RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
4678 Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
4679 }
4680
4681 // Emit a call to the runtime function that orchestrates the reduction.
4682 // Declare the reduction function in the process.
4683 Type *IndexTy = Builder.getIndexTy(
4684 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4685 Function *Func = Builder.GetInsertBlock()->getParent();
4686 Module *Module = Func->getParent();
4687 uint32_t SrcLocStrSize;
4688 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4689 bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
4690 return RI.AtomicReductionGen;
4691 });
4692 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
4693 CanGenerateAtomic
4694 ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
4695 : IdentFlag(0));
4696 Value *ThreadId = getOrCreateThreadID(Ident);
4697 Constant *NumVariables = Builder.getInt32(NumReductions);
4698 const DataLayout &DL = Module->getDataLayout();
4699 unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
4700 Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
4701 Function *ReductionFunc = getFreshReductionFunc(*Module);
4702 Value *Lock = getOMPCriticalRegionLock(".reduction");
4704 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
4705 : RuntimeFunction::OMPRTL___kmpc_reduce);
4706 CallInst *ReduceCall =
4707 createRuntimeFunctionCall(ReduceFunc,
4708 {Ident, ThreadId, NumVariables, RedArraySize,
4709 RedArray, ReductionFunc, Lock},
4710 "reduce");
4711
4712 // Create final reduction entry blocks for the atomic and non-atomic case.
4713 // Emit IR that dispatches control flow to one of the blocks based on the
4714 // reduction supporting the atomic mode.
4715 BasicBlock *NonAtomicRedBlock =
4716 BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
4717 BasicBlock *AtomicRedBlock =
4718 BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
4719 SwitchInst *Switch =
4720 Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
4721 Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
4722 Switch->addCase(Builder.getInt32(2), AtomicRedBlock);
4723
4724 // Populate the non-atomic reduction using the elementwise reduction function.
4725 // This loads the elements from the global and private variables and reduces
4726 // them before storing back the result to the global variable.
4727 Builder.SetInsertPoint(NonAtomicRedBlock);
4728 for (auto En : enumerate(ReductionInfos)) {
4729 const ReductionInfo &RI = En.value();
4731 // We have one less load for by-ref case because that load is now inside of
4732 // the reduction region
4733 Value *RedValue = RI.Variable;
4734 if (!IsByRef[En.index()]) {
4735 RedValue = Builder.CreateLoad(ValueType, RI.Variable,
4736 "red.value." + Twine(En.index()));
4737 }
4738 Value *PrivateRedValue =
4739 Builder.CreateLoad(ValueType, RI.PrivateVariable,
4740 "red.private.value." + Twine(En.index()));
4741 Value *Reduced;
4742 InsertPointOrErrorTy AfterIP =
4743 RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
4744 if (!AfterIP)
4745 return AfterIP.takeError();
4746 Builder.restoreIP(*AfterIP);
4747
4748 if (!Builder.GetInsertBlock())
4749 return InsertPointTy();
4750 // for by-ref case, the load is inside of the reduction region
4751 if (!IsByRef[En.index()])
4752 Builder.CreateStore(Reduced, RI.Variable);
4753 }
4754 Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
4755 IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
4756 : RuntimeFunction::OMPRTL___kmpc_end_reduce);
4757 createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
4758 Builder.CreateBr(ContinuationBlock);
4759
4760 // Populate the atomic reduction using the atomic elementwise reduction
4761 // function. There are no loads/stores here because they will be happening
4762 // inside the atomic elementwise reduction.
4763 Builder.SetInsertPoint(AtomicRedBlock);
4764 if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
4765 for (const ReductionInfo &RI : ReductionInfos) {
4767 Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
4768 if (!AfterIP)
4769 return AfterIP.takeError();
4770 Builder.restoreIP(*AfterIP);
4771 if (!Builder.GetInsertBlock())
4772 return InsertPointTy();
4773 }
4774 Builder.CreateBr(ContinuationBlock);
4775 } else {
4776 Builder.CreateUnreachable();
4777 }
4778
4779 // Populate the outlined reduction function using the elementwise reduction
4780 // function. Partial values are extracted from the type-erased array of
4781 // pointers to private variables.
4782 Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
4783 IsByRef, /*isGPU=*/false);
4784 if (Err)
4785 return Err;
4786
4787 if (!Builder.GetInsertBlock())
4788 return InsertPointTy();
4789
4790 Builder.SetInsertPoint(ContinuationBlock);
4791 return Builder.saveIP();
4792}
4793
4796 BodyGenCallbackTy BodyGenCB,
4797 FinalizeCallbackTy FiniCB) {
4798 if (!updateToLocation(Loc))
4799 return Loc.IP;
4800
4801 Directive OMPD = Directive::OMPD_master;
4802 uint32_t SrcLocStrSize;
4803 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4804 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4805 Value *ThreadId = getOrCreateThreadID(Ident);
4806 Value *Args[] = {Ident, ThreadId};
4807
4808 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4809 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4810
4811 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4812 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4813
4814 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4815 /*Conditional*/ true, /*hasFinalize*/ true);
4816}
4817
4820 BodyGenCallbackTy BodyGenCB,
4821 FinalizeCallbackTy FiniCB, Value *Filter) {
4822 if (!updateToLocation(Loc))
4823 return Loc.IP;
4824
4825 Directive OMPD = Directive::OMPD_masked;
4826 uint32_t SrcLocStrSize;
4827 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4828 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4829 Value *ThreadId = getOrCreateThreadID(Ident);
4830 Value *Args[] = {Ident, ThreadId, Filter};
4831 Value *ArgsEnd[] = {Ident, ThreadId};
4832
4833 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4834 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4835
4836 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4837 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4838
4839 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4840 /*Conditional*/ true, /*hasFinalize*/ true);
4841}
4842
4844 llvm::FunctionCallee Callee,
4846 const llvm::Twine &Name) {
4847 llvm::CallInst *Call = Builder.CreateCall(
4848 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4849 Call->setDoesNotThrow();
4850 return Call;
4851}
4852
4853// Expects input basic block is dominated by BeforeScanBB.
4854// Once Scan directive is encountered, the code after scan directive should be
4855// dominated by AfterScanBB. Scan directive splits the code sequence to
4856// scan and input phase. Based on whether inclusive or exclusive
4857// clause is used in the scan directive and whether input loop or scan loop
4858// is lowered, it adds jumps to input and scan phase. First Scan loop is the
4859// input loop and second is the scan loop. The code generated handles only
4860// inclusive scans now.
4862 const LocationDescription &Loc, InsertPointTy AllocaIP,
4863 ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
4864 bool IsInclusive, ScanInfo *ScanRedInfo) {
4865 if (ScanRedInfo->OMPFirstScanLoop) {
4866 llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
4867 ScanVarsType, ScanRedInfo);
4868 if (Err)
4869 return Err;
4870 }
4871 if (!updateToLocation(Loc))
4872 return Loc.IP;
4873
4874 llvm::Value *IV = ScanRedInfo->IV;
4875
4876 if (ScanRedInfo->OMPFirstScanLoop) {
4877 // Emit buffer[i] = red; at the end of the input phase.
4878 for (size_t i = 0; i < ScanVars.size(); i++) {
4879 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4880 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4881 Type *DestTy = ScanVarsType[i];
4882 Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4883 Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);
4884
4885 Builder.CreateStore(Src, Val);
4886 }
4887 }
4888 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
4889 emitBlock(ScanRedInfo->OMPScanDispatch,
4890 Builder.GetInsertBlock()->getParent());
4891
4892 if (!ScanRedInfo->OMPFirstScanLoop) {
4893 IV = ScanRedInfo->IV;
4894 // Emit red = buffer[i]; at the entrance to the scan phase.
4895 // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
4896 for (size_t i = 0; i < ScanVars.size(); i++) {
4897 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
4898 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
4899 Type *DestTy = ScanVarsType[i];
4900 Value *SrcPtr =
4901 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
4902 Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
4903 Builder.CreateStore(Src, ScanVars[i]);
4904 }
4905 }
4906
4907 // TODO: Update it to CreateBr and remove dead blocks
4908 llvm::Value *CmpI = Builder.getInt1(true);
4909 if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
4910 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
4911 ScanRedInfo->OMPAfterScanBlock);
4912 } else {
4913 Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
4914 ScanRedInfo->OMPBeforeScanBlock);
4915 }
4916 emitBlock(ScanRedInfo->OMPAfterScanBlock,
4917 Builder.GetInsertBlock()->getParent());
4918 Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
4919 return Builder.saveIP();
4920}
4921
4922Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
4923 InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
4924 ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {
4925
4926 Builder.restoreIP(AllocaIP);
4927 // Create the shared pointer at alloca IP.
4928 for (size_t i = 0; i < ScanVars.size(); i++) {
4929 llvm::Value *BuffPtr =
4930 Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
4931 (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
4932 }
4933
4934 // Allocate temporary buffer by master thread
4935 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4936 InsertPointTy CodeGenIP) -> Error {
4937 Builder.restoreIP(CodeGenIP);
4938 Value *AllocSpan =
4939 Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
4940 for (size_t i = 0; i < ScanVars.size(); i++) {
4941 Type *IntPtrTy = Builder.getInt32Ty();
4942 Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
4943 Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
4944 Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
4945 AllocSpan, nullptr, "arr");
4946 Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
4947 }
4948 return Error::success();
4949 };
4950 // TODO: Perform finalization actions for variables. This has to be
4951 // called for variables which have destructors/finalizers.
4952 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
4953
4954 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
4955 llvm::Value *FilterVal = Builder.getInt32(0);
4957 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
4958
4959 if (!AfterIP)
4960 return AfterIP.takeError();
4961 Builder.restoreIP(*AfterIP);
4962 BasicBlock *InputBB = Builder.GetInsertBlock();
4963 if (InputBB->hasTerminator())
4964 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
4965 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
4966 if (!AfterIP)
4967 return AfterIP.takeError();
4968 Builder.restoreIP(*AfterIP);
4969
4970 return Error::success();
4971}
4972
Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
    ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
  // Copy-out phase of the scan lowering: for each reduction variable, read the
  // final value out of its heap-allocated scan buffer into the original
  // variable and free the buffer. Emitted as a masked region (filter 0) so a
  // single thread performs the copy, followed by a barrier.
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    for (ReductionInfo RedInfo : ReductionInfos) {
      Value *PrivateVar = RedInfo.PrivateVariable;
      Value *OrigVar = RedInfo.Variable;
      // ScanBuffPtrs maps each private copy to the slot holding the pointer to
      // its scan buffer.
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);

      Type *SrcTy = RedInfo.ElementType;
      // Final value is stored at index Span of the buffer (the buffer is
      // allocated with Span + 1 elements).
      Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
                                             "arrayOffset");
      Value *Src = Builder.CreateLoad(SrcTy, Val);

      Builder.CreateStore(Src, OrigVar);
      Builder.CreateFree(Buff);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Emit before the terminator of the scan-finish block if it already has one,
  // otherwise append to the end of the block.
  if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
    Builder.SetInsertPoint(TI);
  else
    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);

  // Only the master thread (filter value 0) executes the copy-out.
  llvm::Value *FilterVal = Builder.getInt32(0);
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  // If the masked region left us in a block that already ends in a branch,
  // emit the barrier before that terminator instead of after it.
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  return Error::success();
}
5019
    const LocationDescription &Loc,
    ScanInfo *ScanRedInfo) {

  if (!updateToLocation(Loc))
    return Loc.IP;
  // Emits the cross-iteration combine step of the scan lowering: a
  // logarithmic up-sweep over each reduction variable's scan buffer.
  // For k = 0 .. ceil(log2(n)), every element tmp[i] with i >= 2^k is
  // combined with tmp[i - 2^k]. The whole sweep runs in a masked region
  // (filter 0), i.e. on a single thread, followed by a barrier.
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    Function *CurFn = Builder.GetInsertBlock()->getParent();
    // for (int k = 0; k <= ceil(log2(n)); ++k)
    llvm::BasicBlock *LoopBB =
        BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
    llvm::BasicBlock *ExitBB =
        splitBB(Builder, false, "omp.outer.log.scan.exit");
        Builder.GetInsertBlock()->getModule(),
        (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
    llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
    // Compute LogVal = ceil(log2(Span)) as the number of outer iterations.
    llvm::Value *Arg =
        Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
    llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
        Builder.GetInsertBlock()->getModule(),
        (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
    LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
    LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
    // NMin1 = n - 1; the inner loop runs downwards from this index.
    llvm::Value *NMin1 = Builder.CreateNUWSub(
        ScanRedInfo->Span,
        llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
    Builder.SetInsertPoint(InputBB);
    Builder.CreateBr(LoopBB);
    emitBlock(LoopBB, CurFn);
    Builder.SetInsertPoint(LoopBB);

    PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
    // size pow2k = 1;
    PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
    Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
                         InputBB);
    Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
                       InputBB);
    // for (size i = n - 1; i >= 2 ^ k; --i)
    //   tmp[i] op= tmp[i-pow2k];
    llvm::BasicBlock *InnerLoopBB =
        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
    llvm::BasicBlock *InnerExitBB =
        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
    llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
    emitBlock(InnerLoopBB, CurFn);
    Builder.SetInsertPoint(InnerLoopBB);
    PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
    IVal->addIncoming(NMin1, LoopBB);
    for (ReductionInfo RedInfo : ReductionInfos) {
      Value *ReductionVal = RedInfo.PrivateVariable;
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = RedInfo.ElementType;
      // Buffer slots are 1-based here: IV = i + 1.
      Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
      Value *LHSPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
      Value *RHSPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
      Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
      Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
      llvm::Value *Result;
      // Delegate the actual combine operation to the frontend-provided
      // reduction generator.
      InsertPointOrErrorTy AfterIP =
          RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.CreateStore(Result, LHSPtr);
    }
    llvm::Value *NextIVal = Builder.CreateNUWSub(
        IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
    IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
    CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
    emitBlock(InnerExitBB, CurFn);
    llvm::Value *Next = Builder.CreateNUWAdd(
        Counter, llvm::ConstantInt::get(Counter->getType(), 1));
    Counter->addIncoming(Next, Builder.GetInsertBlock());
    // pow2k <<= 1;
    llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
    Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
    llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
    Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
    Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
    return Error::success();
  };

  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Only the master thread (filter value 0) performs the up-sweep.
  llvm::Value *FilterVal = Builder.getInt32(0);
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  // Finally copy the results out of the scan buffers and release them.
  Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
  if (Err)
    return Err;

  return AfterIP;
}
5135
5136Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5137 llvm::function_ref<Error()> InputLoopGen,
5138 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5139 ScanInfo *ScanRedInfo) {
5140
5141 {
5142 // Emit loop with input phase:
5143 // for (i: 0..<num_iters>) {
5144 // <input phase>;
5145 // buffer[i] = red;
5146 // }
5147 ScanRedInfo->OMPFirstScanLoop = true;
5148 Error Err = InputLoopGen();
5149 if (Err)
5150 return Err;
5151 }
5152 {
5153 // Emit loop with scan phase:
5154 // for (i: 0..<num_iters>) {
5155 // red = buffer[i];
5156 // <scan phase>;
5157 // }
5158 ScanRedInfo->OMPFirstScanLoop = false;
5159 Error Err = ScanLoopGen(Builder.saveIP());
5160 if (Err)
5161 return Err;
5162 }
5163 return Error::success();
5164}
5165
5166void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5167 Function *Fun = Builder.GetInsertBlock()->getParent();
5168 ScanRedInfo->OMPScanDispatch =
5169 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5170 ScanRedInfo->OMPAfterScanBlock =
5171 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5172 ScanRedInfo->OMPBeforeScanBlock =
5173 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5174 ScanRedInfo->OMPScanLoopExit =
5175 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5176}
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  // The induction variable uses the same integer width as the trip count.
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure:
  //   Preheader -> Header -> Cond -> Body -> Latch -> (back to Header)
  //                            \-> Exit -> After
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);

  // Use specified DebugLoc for new instructions.
  Builder.SetCurrentDebugLocation(DL);

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  Builder.SetInsertPoint(Header);
  // Canonical counting: IV starts at 0 and increments by 1 per iteration.
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
  Builder.CreateBr(Cond);

  Builder.SetInsertPoint(Cond);
  // Unsigned compare: loop runs while IV < TripCount.
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  Builder.SetInsertPoint(Latch);
  // NUW: IV never wraps because it is bounded by TripCount.
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);
  Builder.CreateBr(After);

  // Remember and return the canonical control flow.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
5242
                                     LoopBodyGenCallbackTy BodyGenCB,
                                     Value *TripCount, const Twine &Name) {
  // Place the skeleton's blocks immediately after the insertion block so the
  // loop stays adjacent to the code that precedes it.
  BasicBlock *BB = Loc.IP.getBlock();
  BasicBlock *NextBB = BB->getNextNode();

  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
                                             NextBB, NextBB, Name);
  BasicBlock *After = CL->getAfter();

  // If location is not set, don't connect the loop.
  if (updateToLocation(Loc)) {
    // Split the loop at the insertion point: Branch to the preheader and move
    // every following instruction to after the loop (the After BB). Also, the
    // new successor is the loop's after block.
    spliceBB(Builder, After, /*CreateBranch=*/false);
    Builder.CreateBr(CL->getPreheader());
  }

  // Emit the body content. We do it after connecting the loop to the CFG to
  // avoid that the callback encounters degenerate BBs.
  if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
    return Err;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
5273
  // Allocate a fresh ScanInfo in the builder-owned ScanInfos list so its
  // lifetime matches the OpenMPIRBuilder; callers get a stable pointer.
  ScanInfos.emplace_front();
  ScanInfo *Result = &ScanInfos.front();
  return Result;
}
5279
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
  updateToLocation(ComputeLoc);

      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
  // Span records the iteration count; it sizes the per-iteration scan buffers.
  ScanRedInfo->Span = TripCount;
  // scan.init is where buffer allocation for the scan will be emitted.
  ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);

  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    ScanRedInfo->IV = IV;
    // Create the dispatch/before/after/exit blocks and re-route the body's
    // single successor through the scan dispatch machinery.
    createScanBBs(ScanRedInfo);
    BasicBlock *InputBlock = Builder.GetInsertBlock();
    Instruction *Terminator = InputBlock->getTerminator();
    assert(Terminator->getNumSuccessors() == 1);
    BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
    Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
    emitBlock(ScanRedInfo->OMPBeforeScanBlock,
              Builder.GetInsertBlock()->getParent());
    Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
    emitBlock(ScanRedInfo->OMPScanLoopExit,
              Builder.GetInsertBlock()->getParent());
    Builder.CreateBr(ContinueBlock);
    // User code for the iteration is generated into the before-scan block.
    Builder.SetInsertPoint(
        ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
    return BodyGenCB(Builder.saveIP(), IV);
  };

  // First of the two loops: the input phase.
  const auto &&InputLoopGen = [&]() -> Error {
      Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
        ComputeIP, Name, true, ScanRedInfo);
    if (!LoopInfo)
      return LoopInfo.takeError();
    Result.push_back(*LoopInfo);
    Builder.restoreIP((*LoopInfo)->getAfterIP());
    return Error::success();
  };
  // Second loop: the scan phase; also records where the scan finishes so the
  // finalization copy-out can be emitted there.
  const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
        createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
                            InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
    if (!LoopInfo)
      return LoopInfo.takeError();
    Result.push_back(*LoopInfo);
    Builder.restoreIP((*LoopInfo)->getAfterIP());
    ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
    return Error::success();
  };
  Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
  if (Err)
    return Err;
  return Result;
}
5343
    const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
    bool IsSigned, bool InclusiveStop, const Twine &Name) {

  // Consider the following difficulties (assuming 8-bit signed integers):
  // * Adding \p Step to the loop counter which passes \p Stop may overflow:
  //     DO I = 1, 100, 50
  // * A \p Step of INT_MIN cannot be normalized to a positive direction:
  //     DO I = 100, 0, -128

  // Start, Stop and Step must be of the same integer type.
  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");


  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  ConstantInt *One = ConstantInt::get(IndVarTy, 1);

  // Like Step, but always positive.
  Value *Incr = Step;

  // Distance between Start and Stop; always positive.
  Value *Span;

  // Condition whether there are no iterations are executed at all, e.g. because
  // UB < LB.
  Value *ZeroCmp;

  if (IsSigned) {
    // Ensure that increment is positive. If not, negate and invert LB and UB.
    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    // NSW: after normalization UB >= LB whenever the loop runs at all.
    Span = Builder.CreateSub(UB, LB, "", false, true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
  } else {
    Span = Builder.CreateSub(Stop, Start, "", true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
  }

  Value *CountIfLooping;
  if (InclusiveStop) {
    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
  } else {
    // Avoid incrementing past stop since it could overflow.
    Value *CountIfTwo = Builder.CreateAdd(
        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
  }

  // Zero iterations beat everything: a loop that never runs has trip count 0.
  return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
                              "omp_" + Name + ".tripcount");
}
5403
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name, bool InScan,
    ScanInfo *ScanRedInfo) {
  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;

      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);

  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    // Map the canonical 0-based counter back onto the user iteration space:
    // IndVar = Start + IV * Step.
    Value *Span = Builder.CreateMul(IV, Step);
    Value *IndVar = Builder.CreateAdd(Span, Start);
    // For scan loops, record the user-visible IV for the scan machinery.
    if (InScan)
      ScanRedInfo->IV = IndVar;
    return BodyGenCB(Builder.saveIP(), IndVar);
  };
  LocationDescription LoopLoc =
      ComputeIP.isSet()
          ? Loc
          : LocationDescription(Builder.saveIP(),
                                Builder.getCurrentDebugLocation());
  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
}
5430
// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling for composite `distribute parallel for` depending on
// `type`. Only i32 and i64 are supported by the runtime. Always interpret
// integers as unsigned similarly to CanonicalLoopInfo.
static FunctionCallee
                                OpenMPIRBuilder &OMPBuilder) {
  // Select the runtime entry point by induction-variable width.
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
5447
// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling depending on `type`. Only i32 and i64 are supported by the
// runtime. Always interpret integers as unsigned similarly to
// CanonicalLoopInfo.
                                                   OpenMPIRBuilder &OMPBuilder) {
  // Select the runtime entry point by induction-variable width.
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
5463
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
    OMPScheduleType DistScheduleSchedType) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");

  // Set up the source location value for OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  // Encode in the ident flags which worksharing construct this loop lowers.
  switch (LoopType) {
  case WorksharingLoopType::ForStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_LOOP;
    break;
  case WorksharingLoopType::DistributeStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
    break;
  case WorksharingLoopType::DistributeForStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
    break;
  }
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  // The dist variant of the init function takes an extra upper-bound slot.
  FunctionCallee StaticInit =
      LoopType == WorksharingLoopType::DistributeForStaticLoop
          ? getKmpcDistForStaticInitForType(IVTy, M, *this)
          : getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());

  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
  CLI->setLastIter(PLastIter);

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  Value *ThreadNum =
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));

  OMPScheduleType SchedType =
      (LoopType == WorksharingLoopType::DistributeStaticLoop)
          ? OMPScheduleType::OrderedDistribute
  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
                        PUpperBound, IVTy, PStride, One, Zero, StaticInit,
                        this](Value *SchedulingType, auto &Builder) {
    SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
                                   PLowerBound, PUpperBound});
    // The dist variant needs an additional out-parameter for the distribute
    // upper bound.
    if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
      Value *PDistUpperBound =
          Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
      Args.push_back(PDistUpperBound);
    }
    Args.append({PStride, One, Zero});
    createRuntimeFunctionCall(StaticInit, Args);
  };
  BuildInitCall(SchedulingType, Builder);
  if (HasDistSchedule &&
      LoopType != WorksharingLoopType::DistributeStaticLoop) {
    Constant *DistScheduleSchedType = ConstantInt::get(
        I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
    // We want to emit a second init function call for the dist_schedule clause
    // to the Distribute construct. This should only be done however if a
    // Workshare Loop is nested within a Distribute Construct
    BuildInitCall(DistScheduleSchedType, Builder);
  }
  // The runtime returned this thread's chunk as an inclusive [LB, UB] range;
  // convert it back into a 0-based trip count for the canonical loop.
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  CLI->setTripCount(TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.

  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
    Builder.SetInsertPoint(CLI->getBody(),
                           CLI->getBody()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(DL);
    // The user-visible IV is the canonical IV shifted by this thread's LB.
    return Builder.CreateAdd(OldIV, LowerBound);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(CLI->getExit(),
                         CLI->getExit()->getTerminator()->getIterator());
  createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier) {
    InsertPointOrErrorTy BarrierIP =
            omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
            /* CheckCancelFlag */ false);
    if (!BarrierIP)
      return BarrierIP.takeError();
  }

  // The CLI's trip count and IV were rewritten above, so it no longer
  // describes a canonical loop; invalidate it before returning.
  InsertPointTy AfterIP = CLI->getAfterIP();
  CLI->invalidate();

  return AfterIP;
}
5596
5597static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5598 LoopInfo &LI);
5599static void addLoopMetadata(CanonicalLoopInfo *Loop,
5600 ArrayRef<Metadata *> Properties);
5601
                                          LLVMContext &Ctx, Loop *Loop,
                                          SmallVector<Metadata *> &LoopMDList) {
  // Blocks whose memory accesses will be tagged with the new access group.
  SmallSet<BasicBlock *, 8> Reachable;

  // Get the basic blocks from the loop in which memref instructions
  // can be found.
  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : Loop->getBlocks()) {
    // Skip the loop-control blocks (condition and header); only the remaining
    // blocks are considered for access-group tagging.
    if (Block == CLI->getCond() || Block == CLI->getHeader())
      continue;
    Reachable.insert(Block);
  }

  // Add access group metadata to memory-access instructions.
  MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
  for (BasicBlock *BB : Reachable)
    addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
  // TODO: If the loop has existing parallel access metadata, have
  // to combine two lists.
  // Record the group on the loop via llvm.loop.parallel_accesses so passes
  // know these accesses carry no loop-carried dependences.
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
}
5627
OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
    Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");

  LLVMContext &Ctx = CLI->getFunction()->getContext();
  Value *IV = CLI->getIndVar();
  Value *OrigTripCount = CLI->getTripCount();
  Type *IVTy = IV->getType();
  assert(IVTy->getIntegerBitWidth() <= 64 &&
         "Max supported tripcount bitwidth is 64 bits");
  // All runtime bookkeeping is done in i32 or i64, whichever fits the IV.
  Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
                                                        : Type::getInt64Ty(Ctx);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Constant *Zero = ConstantInt::get(InternalIVTy, 0);
  Constant *One = ConstantInt::get(InternalIVTy, 1);

  Function *F = CLI->getFunction();
  // Blocks must have terminators.
  // FIXME: Don't run analyses on incomplete/invalid IR.
  for (BasicBlock &BB : *F)
    if (!BB.hasTerminator())
      UIs.push_back(new UnreachableInst(F->getContext(), &BB));
  FAM.registerPass([]() { return DominatorTreeAnalysis(); });
  FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
  LoopAnalysis LIA;
  LoopInfo &&LI = LIA.run(*F, FAM);
  // Remove the placeholder terminators inserted above.
  for (Instruction *I : UIs)
    I->eraseFromParent();
  Loop *L = LI.getLoopFor(CLI->getHeader());
  SmallVector<Metadata *> LoopMDList;
  if (ChunkSize || DistScheduleChunkSize)
    applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
  addLoopMetadata(CLI, LoopMDList);

  // Declare useful OpenMP runtime functions.
  FunctionCallee StaticInit =
      getKmpcForStaticInitForType(InternalIVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Builder.SetCurrentDebugLocation(DL);
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
  Value *PUpperBound =
      Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
  CLI->setLastIter(PLastIter);

  // Set up the source location value for the OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // TODO: Detect overflow in ubsan or max-out with current tripcount.
  Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
      ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
  Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
      DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
      "distschedulechunksize");
  Value *CastedTripCount =
      Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");

  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));
  Constant *DistSchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
  Builder.CreateStore(Zero, PLowerBound);
  Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
  // Guard against a zero trip count: the inclusive upper bound would
  // otherwise underflow.
  Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
  Value *UpperBound =
      Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  IdentFlag Flag = OMP_IDENT_FLAG_WORK_LOOP;
  if (DistScheduleSchedType != OMPScheduleType::None) {
    Flag |= OMP_IDENT_FLAG_WORK_DISTRIBUTE;
  }
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
  Value *ThreadNum =
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
  auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
                        PUpperBound, PStride, One,
                        this](Value *SchedulingType, Value *ChunkSize,
                              auto &Builder) {
        StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
                     /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
                     /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
                     /*pstride=*/PStride, /*incr=*/One,
                     /*chunk=*/ChunkSize});
  };
  BuildInitCall(SchedulingType, CastedChunkSize, Builder);
  if (DistScheduleSchedType != OMPScheduleType::None &&
      SchedType != OMPScheduleType::OrderedDistributeChunked &&
      SchedType != OMPScheduleType::OrderedDistribute) {
    // We want to emit a second init function call for the dist_schedule clause
    // to the Distribute construct. This should only be done however if a
    // Workshare Loop is nested within a Distribute Construct
    BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
  }

  // Load values written by the "init" function.
  Value *FirstChunkStart =
      Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
  Value *FirstChunkStop =
      Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
  Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
  Value *ChunkRange =
      Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
  Value *NextChunkStride =
      Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");

  // Create outer "dispatch" loop for enumerating the chunks.
  BasicBlock *DispatchEnter = splitBB(Builder, true);
  Value *DispatchCounter;

  // It is safe to assume this didn't return an error because the callback
  // passed into createCanonicalLoop is the only possible error source, and it
  // always returns success.
  CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
      {Builder.saveIP(), DL},
      [&](InsertPointTy BodyIP, Value *Counter) {
        DispatchCounter = Counter;
        return Error::success();
      },
      FirstChunkStart, CastedTripCount, NextChunkStride,
      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
      "dispatch"));

  // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
  // not have to preserve the canonical invariant.
  BasicBlock *DispatchBody = DispatchCLI->getBody();
  BasicBlock *DispatchLatch = DispatchCLI->getLatch();
  BasicBlock *DispatchExit = DispatchCLI->getExit();
  BasicBlock *DispatchAfter = DispatchCLI->getAfter();
  DispatchCLI->invalidate();

  // Rewire the original loop to become the chunk loop inside the dispatch loop.
  redirectTo(DispatchAfter, CLI->getAfter(), DL);
  redirectTo(CLI->getExit(), DispatchLatch, DL);
  redirectTo(DispatchBody, DispatchEnter, DL);

  // Prepare the prolog of the chunk loop.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  // Compute the number of iterations of the chunk loop.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
  // The last chunk may be shorter than ChunkRange; clamp to the remaining
  // iterations.
  Value *IsLastChunk =
      Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
  Value *CountUntilOrigTripCount =
      Builder.CreateSub(CastedTripCount, DispatchCounter);
  Value *ChunkTripCount = Builder.CreateSelect(
      IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
  Value *BackcastedChunkTC =
      Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
  CLI->setTripCount(BackcastedChunkTC);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  Value *BackcastedDispatchCounter =
      Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
  CLI->mapIndVar([&](Instruction *) -> Value * {
    Builder.restoreIP(CLI->getBodyIP());
    return Builder.CreateAdd(IV, BackcastedDispatchCounter);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
  createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier) {
    InsertPointOrErrorTy AfterIP =
        createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
                      /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
    if (!AfterIP)
      return AfterIP.takeError();
  }

#ifndef NDEBUG
  // Even though we currently do not support applying additional methods to it,
  // the chunk loop should remain a canonical loop.
  CLI->assertOK();
#endif

  return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
}
5831
5832// Returns an LLVM function to call for executing an OpenMP static worksharing
5833// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5834// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5835static FunctionCallee
5837 WorksharingLoopType LoopType) {
5838 unsigned Bitwidth = Ty->getIntegerBitWidth();
5839 Module &M = OMPBuilder->M;
5840 switch (LoopType) {
5841 case WorksharingLoopType::ForStaticLoop:
5842 if (Bitwidth == 32)
5843 return OMPBuilder->getOrCreateRuntimeFunction(
5844 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5845 if (Bitwidth == 64)
5846 return OMPBuilder->getOrCreateRuntimeFunction(
5847 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5848 break;
5849 case WorksharingLoopType::DistributeStaticLoop:
5850 if (Bitwidth == 32)
5851 return OMPBuilder->getOrCreateRuntimeFunction(
5852 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5853 if (Bitwidth == 64)
5854 return OMPBuilder->getOrCreateRuntimeFunction(
5855 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5856 break;
5857 case WorksharingLoopType::DistributeForStaticLoop:
5858 if (Bitwidth == 32)
5859 return OMPBuilder->getOrCreateRuntimeFunction(
5860 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5861 if (Bitwidth == 64)
5862 return OMPBuilder->getOrCreateRuntimeFunction(
5863 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5864 break;
5865 }
5866 if (Bitwidth != 32 && Bitwidth != 64) {
5867 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5868 }
5869 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5870}
5871
5872// Inserts a call to proper OpenMP Device RTL function which handles
5873// loop worksharing.
5875 WorksharingLoopType LoopType,
5876 BasicBlock *InsertBlock, Value *Ident,
5877 Value *LoopBodyArg, Value *TripCount,
5878 Function &LoopBodyFn, bool NoLoop) {
5879 Type *TripCountTy = TripCount->getType();
5880 Module &M = OMPBuilder->M;
5881 IRBuilder<> &Builder = OMPBuilder->Builder;
5882 FunctionCallee RTLFn =
5883 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5884 SmallVector<Value *, 8> RealArgs;
5885 RealArgs.push_back(Ident);
5886 RealArgs.push_back(&LoopBodyFn);
5887 RealArgs.push_back(LoopBodyArg);
5888 RealArgs.push_back(TripCount);
5889 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5890 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5891 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5892 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5893 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5894 return;
5895 }
5896 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5897 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5898 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5899 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5900
5901 RealArgs.push_back(
5902 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5903 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5904 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5905 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5906 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5907 } else {
5908 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5909 }
5910
5911 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5912}
5913
5915 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5916 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5917 WorksharingLoopType LoopType, bool NoLoop) {
5918 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5919 BasicBlock *Preheader = CLI->getPreheader();
5920 Value *TripCount = CLI->getTripCount();
5921
5922 // After loop body outling, the loop body contains only set up
5923 // of loop body argument structure and the call to the outlined
5924 // loop body function. Firstly, we need to move setup of loop body args
5925 // into loop preheader.
5926 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5927 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5928
5929 // The next step is to remove the whole loop. We do not it need anymore.
5930 // That's why make an unconditional branch from loop preheader to loop
5931 // exit block
5932 Builder.restoreIP({Preheader, Preheader->end()});
5933 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5934 Preheader->getTerminator()->eraseFromParent();
5935 Builder.CreateBr(CLI->getExit());
5936
5937 // Delete dead loop blocks
5938 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5939 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5940 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5941 CleanUpInfo.EntryBB = CLI->getHeader();
5942 CleanUpInfo.ExitBB = CLI->getExit();
5943 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5944 DeleteDeadBlocks(BlocksToBeRemoved);
5945
5946 // Find the instruction which corresponds to loop body argument structure
5947 // and remove the call to loop body function instruction.
5948 Value *LoopBodyArg;
5949 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5950 assert(OutlinedFnUser &&
5951 "Expected unique undroppable user of outlined function");
5952 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5953 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5954 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5955 "Expected outlined function call to be located in loop preheader");
5956 // Check in case no argument structure has been passed.
5957 if (OutlinedFnCallInstruction->arg_size() > 1)
5958 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5959 else
5960 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5961 OutlinedFnCallInstruction->eraseFromParent();
5962
5963 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5964 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5965
5966 for (auto &ToBeDeletedItem : ToBeDeleted)
5967 ToBeDeletedItem->eraseFromParent();
5968 CLI->invalidate();
5969}
5970
5971OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5972 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5973 WorksharingLoopType LoopType, bool NoLoop) {
5974 uint32_t SrcLocStrSize;
5975 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5977 switch (LoopType) {
5978 case WorksharingLoopType::ForStaticLoop:
5979 Flag = OMP_IDENT_FLAG_WORK_LOOP;
5980 break;
5981 case WorksharingLoopType::DistributeStaticLoop:
5982 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5983 break;
5984 case WorksharingLoopType::DistributeForStaticLoop:
5985 Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
5986 break;
5987 }
5988 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5989
5990 OutlineInfo OI;
5991 OI.OuterAllocaBB = CLI->getPreheader();
5992 Function *OuterFn = CLI->getPreheader()->getParent();
5993
5994 // Instructions which need to be deleted at the end of code generation
5995 SmallVector<Instruction *, 4> ToBeDeleted;
5996
5997 OI.OuterAllocaBB = AllocaIP.getBlock();
5998
5999 // Mark the body loop as region which needs to be extracted
6000 OI.EntryBB = CLI->getBody();
6001 OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
6002 "omp.prelatch");
6003
6004 // Prepare loop body for extraction
6005 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
6006
6007 // Insert new loop counter variable which will be used only in loop
6008 // body.
6009 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
6010 Instruction *NewLoopCntLoad =
6011 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
6012 // New loop counter instructions are redundant in the loop preheader when
6013 // code generation for workshare loop is finshed. That's why mark them as
6014 // ready for deletion.
6015 ToBeDeleted.push_back(NewLoopCntLoad);
6016 ToBeDeleted.push_back(NewLoopCnt);
6017
6018 // Analyse loop body region. Find all input variables which are used inside
6019 // loop body region.
6020 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
6022 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
6023
6024 CodeExtractorAnalysisCache CEAC(*OuterFn);
6025 CodeExtractor Extractor(Blocks,
6026 /* DominatorTree */ nullptr,
6027 /* AggregateArgs */ true,
6028 /* BlockFrequencyInfo */ nullptr,
6029 /* BranchProbabilityInfo */ nullptr,
6030 /* AssumptionCache */ nullptr,
6031 /* AllowVarArgs */ true,
6032 /* AllowAlloca */ true,
6033 /* AllocationBlock */ CLI->getPreheader(),
6034 /* Suffix */ ".omp_wsloop",
6035 /* AggrArgsIn0AddrSpace */ true);
6036
6037 BasicBlock *CommonExit = nullptr;
6038 SetVector<Value *> SinkingCands, HoistingCands;
6039
6040 // Find allocas outside the loop body region which are used inside loop
6041 // body
6042 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
6043
6044 // We need to model loop body region as the function f(cnt, loop_arg).
6045 // That's why we replace loop induction variable by the new counter
6046 // which will be one of loop body function argument
6048 CLI->getIndVar()->user_end());
6049 for (auto Use : Users) {
6050 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
6051 if (ParallelRegionBlockSet.count(Inst->getParent())) {
6052 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
6053 }
6054 }
6055 }
6056 // Make sure that loop counter variable is not merged into loop body
6057 // function argument structure and it is passed as separate variable
6058 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
6059
6060 // PostOutline CB is invoked when loop body function is outlined and
6061 // loop body is replaced by call to outlined function. We need to add
6062 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
6063 // function will handle loop control logic.
6064 //
6065 OI.PostOutlineCB = [=, ToBeDeletedVec =
6066 std::move(ToBeDeleted)](Function &OutlinedFn) {
6067 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6068 LoopType, NoLoop);
6069 };
6070 addOutlineInfo(std::move(OI));
6071 return CLI->getAfterIP();
6072}
6073
6076 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6077 bool HasSimdModifier, bool HasMonotonicModifier,
6078 bool HasNonmonotonicModifier, bool HasOrderedClause,
6079 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6080 Value *DistScheduleChunkSize) {
6081 if (Config.isTargetDevice())
6082 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6083 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6084 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6085 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6086
6087 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6088 OMPScheduleType::ModifierOrdered;
6089 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6090 if (HasDistSchedule) {
6091 DistScheduleSchedType = DistScheduleChunkSize
6092 ? OMPScheduleType::OrderedDistributeChunked
6093 : OMPScheduleType::OrderedDistribute;
6094 }
6095 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6096 case OMPScheduleType::BaseStatic:
6097 case OMPScheduleType::BaseDistribute:
6098 assert((!ChunkSize || !DistScheduleChunkSize) &&
6099 "No chunk size with static-chunked schedule");
6100 if (IsOrdered && !HasDistSchedule)
6101 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6102 NeedsBarrier, ChunkSize);
6103 // FIXME: Monotonicity ignored?
6104 if (DistScheduleChunkSize)
6105 return applyStaticChunkedWorkshareLoop(
6106 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6107 DistScheduleChunkSize, DistScheduleSchedType);
6108 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6109 HasDistSchedule);
6110
6111 case OMPScheduleType::BaseStaticChunked:
6112 case OMPScheduleType::BaseDistributeChunked:
6113 if (IsOrdered && !HasDistSchedule)
6114 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6115 NeedsBarrier, ChunkSize);
6116 // FIXME: Monotonicity ignored?
6117 return applyStaticChunkedWorkshareLoop(
6118 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6119 DistScheduleChunkSize, DistScheduleSchedType);
6120
6121 case OMPScheduleType::BaseRuntime:
6122 case OMPScheduleType::BaseAuto:
6123 case OMPScheduleType::BaseGreedy:
6124 case OMPScheduleType::BaseBalanced:
6125 case OMPScheduleType::BaseSteal:
6126 case OMPScheduleType::BaseRuntimeSimd:
6127 assert(!ChunkSize &&
6128 "schedule type does not support user-defined chunk sizes");
6129 [[fallthrough]];
6130 case OMPScheduleType::BaseGuidedSimd:
6131 case OMPScheduleType::BaseDynamicChunked:
6132 case OMPScheduleType::BaseGuidedChunked:
6133 case OMPScheduleType::BaseGuidedIterativeChunked:
6134 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6135 case OMPScheduleType::BaseStaticBalancedChunked:
6136 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6137 NeedsBarrier, ChunkSize);
6138
6139 default:
6140 llvm_unreachable("Unknown/unimplemented schedule kind");
6141 }
6142}
6143
6144/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6145/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6146/// the runtime. Always interpret integers as unsigned similarly to
6147/// CanonicalLoopInfo.
6148static FunctionCallee
6150 unsigned Bitwidth = Ty->getIntegerBitWidth();
6151 if (Bitwidth == 32)
6152 return OMPBuilder.getOrCreateRuntimeFunction(
6153 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6154 if (Bitwidth == 64)
6155 return OMPBuilder.getOrCreateRuntimeFunction(
6156 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6157 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6158}
6159
6160/// Returns an LLVM function to call for updating the next loop using OpenMP
6161/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6162/// the runtime. Always interpret integers as unsigned similarly to
6163/// CanonicalLoopInfo.
6164static FunctionCallee
6166 unsigned Bitwidth = Ty->getIntegerBitWidth();
6167 if (Bitwidth == 32)
6168 return OMPBuilder.getOrCreateRuntimeFunction(
6169 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6170 if (Bitwidth == 64)
6171 return OMPBuilder.getOrCreateRuntimeFunction(
6172 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6173 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6174}
6175
6176/// Returns an LLVM function to call for finalizing the dynamic loop using
6177/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6178/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6179static FunctionCallee
6181 unsigned Bitwidth = Ty->getIntegerBitWidth();
6182 if (Bitwidth == 32)
6183 return OMPBuilder.getOrCreateRuntimeFunction(
6184 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6185 if (Bitwidth == 64)
6186 return OMPBuilder.getOrCreateRuntimeFunction(
6187 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6188 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6189}
6190
6192OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6193 InsertPointTy AllocaIP,
6194 OMPScheduleType SchedType,
6195 bool NeedsBarrier, Value *Chunk) {
6196 assert(CLI->isValid() && "Requires a valid canonical loop");
6197 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6198 "Require dedicated allocate IP");
6200 "Require valid schedule type");
6201
6202 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6203 OMPScheduleType::ModifierOrdered;
6204
6205 // Set up the source location value for OpenMP runtime.
6206 Builder.SetCurrentDebugLocation(DL);
6207
6208 uint32_t SrcLocStrSize;
6209 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6210 Value *SrcLoc =
6211 getOrCreateIdent(SrcLocStr, SrcLocStrSize, OMP_IDENT_FLAG_WORK_LOOP);
6212
6213 // Declare useful OpenMP runtime functions.
6214 Value *IV = CLI->getIndVar();
6215 Type *IVTy = IV->getType();
6216 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6217 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6218
6219 // Allocate space for computed loop bounds as expected by the "init" function.
6220 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6221 Type *I32Type = Type::getInt32Ty(M.getContext());
6222 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6223 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6224 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6225 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6226 CLI->setLastIter(PLastIter);
6227
6228 // At the end of the preheader, prepare for calling the "init" function by
6229 // storing the current loop bounds into the allocated space. A canonical loop
6230 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6231 // and produces an inclusive upper bound.
6232 BasicBlock *PreHeader = CLI->getPreheader();
6233 Builder.SetInsertPoint(PreHeader->getTerminator());
6234 Constant *One = ConstantInt::get(IVTy, 1);
6235 Builder.CreateStore(One, PLowerBound);
6236 Value *UpperBound = CLI->getTripCount();
6237 Builder.CreateStore(UpperBound, PUpperBound);
6238 Builder.CreateStore(One, PStride);
6239
6240 BasicBlock *Header = CLI->getHeader();
6241 BasicBlock *Exit = CLI->getExit();
6242 BasicBlock *Cond = CLI->getCond();
6243 BasicBlock *Latch = CLI->getLatch();
6244 InsertPointTy AfterIP = CLI->getAfterIP();
6245
6246 // The CLI will be "broken" in the code below, as the loop is no longer
6247 // a valid canonical loop.
6248
6249 if (!Chunk)
6250 Chunk = One;
6251
6252 Value *ThreadNum =
6253 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
6254
6255 Constant *SchedulingType =
6256 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6257
6258 // Call the "init" function.
6259 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6260 /* LowerBound */ One, UpperBound,
6261 /* step */ One, Chunk});
6262
6263 // An outer loop around the existing one.
6264 BasicBlock *OuterCond = BasicBlock::Create(
6265 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6266 PreHeader->getParent());
6267 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6268 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6270 DynamicNext,
6271 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6272 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6273 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6274 Value *LowerBound =
6275 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6276 Builder.CreateCondBr(MoreWork, Header, Exit);
6277
6278 // Change PHI-node in loop header to use outer cond rather than preheader,
6279 // and set IV to the LowerBound.
6280 Instruction *Phi = &Header->front();
6281 auto *PI = cast<PHINode>(Phi);
6282 PI->setIncomingBlock(0, OuterCond);
6283 PI->setIncomingValue(0, LowerBound);
6284
6285 // Then set the pre-header to jump to the OuterCond
6286 Instruction *Term = PreHeader->getTerminator();
6287 auto *Br = cast<UncondBrInst>(Term);
6288 Br->setSuccessor(OuterCond);
6289
6290 // Modify the inner condition:
6291 // * Use the UpperBound returned from the DynamicNext call.
6292 // * jump to the loop outer loop when done with one of the inner loops.
6293 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6294 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6295 Instruction *Comp = &*Builder.GetInsertPoint();
6296 auto *CI = cast<CmpInst>(Comp);
6297 CI->setOperand(1, UpperBound);
6298 // Redirect the inner exit to branch to outer condition.
6299 Instruction *Branch = &Cond->back();
6300 auto *BI = cast<CondBrInst>(Branch);
6301 assert(BI->getSuccessor(1) == Exit);
6302 BI->setSuccessor(1, OuterCond);
6303
6304 // Call the "fini" function if "ordered" is present in wsloop directive.
6305 if (Ordered) {
6306 Builder.SetInsertPoint(&Latch->back());
6307 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6308 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6309 }
6310
6311 // Add the barrier if requested.
6312 if (NeedsBarrier) {
6313 Builder.SetInsertPoint(&Exit->back());
6314 InsertPointOrErrorTy BarrierIP =
6316 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6317 /* CheckCancelFlag */ false);
6318 if (!BarrierIP)
6319 return BarrierIP.takeError();
6320 }
6321
6322 CLI->invalidate();
6323 return AfterIP;
6324}
6325
6326/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6327/// after this \p OldTarget will be orphaned.
6329 BasicBlock *NewTarget, DebugLoc DL) {
6330 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6331 redirectTo(Pred, NewTarget, DL);
6332}
6333
6334/// Determine which blocks in \p BBs are reachable from outside and remove the
6335/// ones that are not reachable from the function.
6338 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6339 for (Use &U : BB->uses()) {
6340 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6341 if (!UseInst)
6342 continue;
6343 if (BBsToErase.count(UseInst->getParent()))
6344 continue;
6345 return true;
6346 }
6347 return false;
6348 };
6349
6350 while (BBsToErase.remove_if(HasRemainingUses)) {
6351 // Try again if anything was removed.
6352 }
6353
6354 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6355 DeleteDeadBlocks(BBVec);
6356}
6357
6358CanonicalLoopInfo *
6360 InsertPointTy ComputeIP) {
6361 assert(Loops.size() >= 1 && "At least one loop required");
6362 size_t NumLoops = Loops.size();
6363
6364 // Nothing to do if there is already just one loop.
6365 if (NumLoops == 1)
6366 return Loops.front();
6367
6368 CanonicalLoopInfo *Outermost = Loops.front();
6369 CanonicalLoopInfo *Innermost = Loops.back();
6370 BasicBlock *OrigPreheader = Outermost->getPreheader();
6371 BasicBlock *OrigAfter = Outermost->getAfter();
6372 Function *F = OrigPreheader->getParent();
6373
6374 // Loop control blocks that may become orphaned later.
6375 SmallVector<BasicBlock *, 12> OldControlBBs;
6376 OldControlBBs.reserve(6 * Loops.size());
6378 Loop->collectControlBlocks(OldControlBBs);
6379
6380 // Setup the IRBuilder for inserting the trip count computation.
6381 Builder.SetCurrentDebugLocation(DL);
6382 if (ComputeIP.isSet())
6383 Builder.restoreIP(ComputeIP);
6384 else
6385 Builder.restoreIP(Outermost->getPreheaderIP());
6386
6387 // Derive the collapsed' loop trip count.
6388 // TODO: Find common/largest indvar type.
6389 Value *CollapsedTripCount = nullptr;
6390 for (CanonicalLoopInfo *L : Loops) {
6391 assert(L->isValid() &&
6392 "All loops to collapse must be valid canonical loops");
6393 Value *OrigTripCount = L->getTripCount();
6394 if (!CollapsedTripCount) {
6395 CollapsedTripCount = OrigTripCount;
6396 continue;
6397 }
6398
6399 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6400 CollapsedTripCount =
6401 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6402 }
6403
6404 // Create the collapsed loop control flow.
6405 CanonicalLoopInfo *Result =
6406 createLoopSkeleton(DL, CollapsedTripCount, F,
6407 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6408
6409 // Build the collapsed loop body code.
6410 // Start with deriving the input loop induction variables from the collapsed
6411 // one, using a divmod scheme. To preserve the original loops' order, the
6412 // innermost loop use the least significant bits.
6413 Builder.restoreIP(Result->getBodyIP());
6414
6415 Value *Leftover = Result->getIndVar();
6416 SmallVector<Value *> NewIndVars;
6417 NewIndVars.resize(NumLoops);
6418 for (int i = NumLoops - 1; i >= 1; --i) {
6419 Value *OrigTripCount = Loops[i]->getTripCount();
6420
6421 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6422 NewIndVars[i] = NewIndVar;
6423
6424 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6425 }
6426 // Outermost loop gets all the remaining bits.
6427 NewIndVars[0] = Leftover;
6428
6429 // Construct the loop body control flow.
6430 // We progressively construct the branch structure following in direction of
6431 // the control flow, from the leading in-between code, the loop nest body, the
6432 // trailing in-between code, and rejoining the collapsed loop's latch.
6433 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6434 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6435 // its predecessors as sources.
6436 BasicBlock *ContinueBlock = Result->getBody();
6437 BasicBlock *ContinuePred = nullptr;
6438 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6439 BasicBlock *NextSrc) {
6440 if (ContinueBlock)
6441 redirectTo(ContinueBlock, Dest, DL);
6442 else
6443 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6444
6445 ContinueBlock = nullptr;
6446 ContinuePred = NextSrc;
6447 };
6448
6449 // The code before the nested loop of each level.
6450 // Because we are sinking it into the nest, it will be executed more often
6451 // that the original loop. More sophisticated schemes could keep track of what
6452 // the in-between code is and instantiate it only once per thread.
6453 for (size_t i = 0; i < NumLoops - 1; ++i)
6454 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6455
6456 // Connect the loop nest body.
6457 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6458
6459 // The code after the nested loop at each level.
6460 for (size_t i = NumLoops - 1; i > 0; --i)
6461 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6462
6463 // Connect the finished loop to the collapsed loop latch.
6464 ContinueWith(Result->getLatch(), nullptr);
6465
6466 // Replace the input loops with the new collapsed loop.
6467 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6468 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6469
6470 // Replace the input loop indvars with the derived ones.
6471 for (size_t i = 0; i < NumLoops; ++i)
6472 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6473
6474 // Remove unused parts of the input loops.
6475 removeUnusedBlocksFromParent(OldControlBBs);
6476
6477 for (CanonicalLoopInfo *L : Loops)
6478 L->invalidate();
6479
6480#ifndef NDEBUG
6481 Result->assertOK();
6482#endif
6483 return Result;
6484}
6485
6486std::vector<CanonicalLoopInfo *>
6488 ArrayRef<Value *> TileSizes) {
6489 assert(TileSizes.size() == Loops.size() &&
6490 "Must pass as many tile sizes as there are loops");
6491 int NumLoops = Loops.size();
6492 assert(NumLoops >= 1 && "At least one loop to tile required");
6493
6494 CanonicalLoopInfo *OutermostLoop = Loops.front();
6495 CanonicalLoopInfo *InnermostLoop = Loops.back();
6496 Function *F = OutermostLoop->getBody()->getParent();
6497 BasicBlock *InnerEnter = InnermostLoop->getBody();
6498 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6499
6500 // Loop control blocks that may become orphaned later.
6501 SmallVector<BasicBlock *, 12> OldControlBBs;
6502 OldControlBBs.reserve(6 * Loops.size());
6504 Loop->collectControlBlocks(OldControlBBs);
6505
6506 // Collect original trip counts and induction variable to be accessible by
6507 // index. Also, the structure of the original loops is not preserved during
6508 // the construction of the tiled loops, so do it before we scavenge the BBs of
6509 // any original CanonicalLoopInfo.
6510 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6511 for (CanonicalLoopInfo *L : Loops) {
6512 assert(L->isValid() && "All input loops must be valid canonical loops");
6513 OrigTripCounts.push_back(L->getTripCount());
6514 OrigIndVars.push_back(L->getIndVar());
6515 }
6516
6517 // Collect the code between loop headers. These may contain SSA definitions
6518 // that are used in the loop nest body. To be usable with in the innermost
6519 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6520 // these instructions may be executed more often than before the tiling.
6521 // TODO: It would be sufficient to only sink them into body of the
6522 // corresponding tile loop.
6524 for (int i = 0; i < NumLoops - 1; ++i) {
6525 CanonicalLoopInfo *Surrounding = Loops[i];
6526 CanonicalLoopInfo *Nested = Loops[i + 1];
6527
6528 BasicBlock *EnterBB = Surrounding->getBody();
6529 BasicBlock *ExitBB = Nested->getHeader();
6530 InbetweenCode.emplace_back(EnterBB, ExitBB);
6531 }
6532
6533 // Compute the trip counts of the floor loops.
6534 Builder.SetCurrentDebugLocation(DL);
6535 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6536 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6537 for (int i = 0; i < NumLoops; ++i) {
6538 Value *TileSize = TileSizes[i];
6539 Value *OrigTripCount = OrigTripCounts[i];
6540 Type *IVType = OrigTripCount->getType();
6541
6542 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6543 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6544
6545 // 0 if tripcount divides the tilesize, 1 otherwise.
6546 // 1 means we need an additional iteration for a partial tile.
6547 //
6548 // Unfortunately we cannot just use the roundup-formula
6549 // (tripcount + tilesize - 1)/tilesize
6550 // because the summation might overflow. We do not want introduce undefined
6551 // behavior when the untiled loop nest did not.
6552 Value *FloorTripOverflow =
6553 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6554
6555 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6556 Value *FloorTripCount =
6557 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6558 "omp_floor" + Twine(i) + ".tripcount", true);
6559
6560 // Remember some values for later use.
6561 FloorCompleteCount.push_back(FloorCompleteTripCount);
6562 FloorCount.push_back(FloorTripCount);
6563 FloorRems.push_back(FloorTripRem);
6564 }
6565
6566 // Generate the new loop nest, from the outermost to the innermost.
6567 std::vector<CanonicalLoopInfo *> Result;
6568 Result.reserve(NumLoops * 2);
6569
6570 // The basic block of the surrounding loop that enters the nest generated
6571 // loop.
6572 BasicBlock *Enter = OutermostLoop->getPreheader();
6573
6574 // The basic block of the surrounding loop where the inner code should
6575 // continue.
6576 BasicBlock *Continue = OutermostLoop->getAfter();
6577
6578 // Where the next loop basic block should be inserted.
6579 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6580
6581 auto EmbeddNewLoop =
6582 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6583 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6584 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6585 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6586 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6587 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6588
6589 // Setup the position where the next embedded loop connects to this loop.
6590 Enter = EmbeddedLoop->getBody();
6591 Continue = EmbeddedLoop->getLatch();
6592 OutroInsertBefore = EmbeddedLoop->getLatch();
6593 return EmbeddedLoop;
6594 };
6595
6596 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6597 const Twine &NameBase) {
6598 for (auto P : enumerate(TripCounts)) {
6599 CanonicalLoopInfo *EmbeddedLoop =
6600 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6601 Result.push_back(EmbeddedLoop);
6602 }
6603 };
6604
6605 EmbeddNewLoops(FloorCount, "floor");
6606
6607 // Within the innermost floor loop, emit the code that computes the tile
6608 // sizes.
6609 Builder.SetInsertPoint(Enter->getTerminator());
6610 SmallVector<Value *, 4> TileCounts;
6611 for (int i = 0; i < NumLoops; ++i) {
6612 CanonicalLoopInfo *FloorLoop = Result[i];
6613 Value *TileSize = TileSizes[i];
6614
6615 Value *FloorIsEpilogue =
6616 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6617 Value *TileTripCount =
6618 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6619
6620 TileCounts.push_back(TileTripCount);
6621 }
6622
6623 // Create the tile loops.
6624 EmbeddNewLoops(TileCounts, "tile");
6625
6626 // Insert the inbetween code into the body.
6627 BasicBlock *BodyEnter = Enter;
6628 BasicBlock *BodyEntered = nullptr;
6629 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6630 BasicBlock *EnterBB = P.first;
6631 BasicBlock *ExitBB = P.second;
6632
6633 if (BodyEnter)
6634 redirectTo(BodyEnter, EnterBB, DL);
6635 else
6636 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6637
6638 BodyEnter = nullptr;
6639 BodyEntered = ExitBB;
6640 }
6641
6642 // Append the original loop nest body into the generated loop nest body.
6643 if (BodyEnter)
6644 redirectTo(BodyEnter, InnerEnter, DL);
6645 else
6646 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6648
6649 // Replace the original induction variable with an induction variable computed
6650 // from the tile and floor induction variables.
6651 Builder.restoreIP(Result.back()->getBodyIP());
6652 for (int i = 0; i < NumLoops; ++i) {
6653 CanonicalLoopInfo *FloorLoop = Result[i];
6654 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6655 Value *OrigIndVar = OrigIndVars[i];
6656 Value *Size = TileSizes[i];
6657
6658 Value *Scale =
6659 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6660 Value *Shift =
6661 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6662 OrigIndVar->replaceAllUsesWith(Shift);
6663 }
6664
6665 // Remove unused parts of the original loops.
6666 removeUnusedBlocksFromParent(OldControlBBs);
6667
6668 for (CanonicalLoopInfo *L : Loops)
6669 L->invalidate();
6670
6671#ifndef NDEBUG
6672 for (CanonicalLoopInfo *GenL : Result)
6673 GenL->assertOK();
6674#endif
6675 return Result;
6676}
6677
6678/// Attach metadata \p Properties to the basic block described by \p BB. If the
6679/// basic block already has metadata, the basic block properties are appended.
6681 ArrayRef<Metadata *> Properties) {
6682 // Nothing to do if no property to attach.
6683 if (Properties.empty())
6684 return;
6685
6686 LLVMContext &Ctx = BB->getContext();
6687 SmallVector<Metadata *> NewProperties;
 // Operand 0 is reserved for the node's self-reference (the loop-ID
 // convention); it is patched in after the distinct node is created below.
6688 NewProperties.push_back(nullptr);
6689
6690 // If the basic block already has metadata, prepend it to the new metadata.
6691 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6692 if (Existing)
 // Skip the existing node's operand 0, which is its own self-reference
 // and must not be duplicated into the new node.
6693 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6694
6695 append_range(NewProperties, Properties);
 // A distinct node keeps this loop's ID unique even if another loop carries
 // an identical property list.
6696 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6697 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6698
6699 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6700}
6701
6702/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6703/// loop already has metadata, the loop properties are appended.
6705 ArrayRef<Metadata *> Properties) {
6706 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6707
 // llvm.loop metadata lives on the terminator of the latch (the
 // back-branch); addBasicBlockMetadata attaches it there.
6708 // Attach metadata to the loop's latch
6709 BasicBlock *Latch = Loop->getLatch();
6710 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6711 addBasicBlockMetadata(Latch, Properties);
6712}
6713
6714/// Attach llvm.access.group metadata to the memref instructions of \p Block
6716 LoopInfo &LI) {
6717 for (Instruction &I : *Block) {
 // Tag every instruction that may touch memory so the enclosing loop can
 // later claim them parallel via llvm.loop.parallel_accesses.
6718 if (I.mayReadOrWriteMemory()) {
6719 // TODO: This instruction may already have access group from
6720 // other pragmas e.g. #pragma clang loop vectorize. Append
6721 // so that the existing metadata is not overwritten.
 // NOTE(review): as written, setMetadata REPLACES any pre-existing
 // access group on I (that is exactly what the TODO above describes).
6722 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6723 }
6724 }
6725}
6726
/// Fuse the given canonical loops into a single canonical loop. The fused
/// trip count is the (signed) maximum of the original trip counts, and each
/// original body is guarded by `fused IV < original trip count` so shorter
/// loops simply skip their body on the extra iterations.
6727CanonicalLoopInfo *
6729 CanonicalLoopInfo *firstLoop = Loops.front();
6730 CanonicalLoopInfo *lastLoop = Loops.back();
6731 Function *F = firstLoop->getPreheader()->getParent();
6732
6733 // Loop control blocks that will become orphaned later
6734 SmallVector<BasicBlock *> oldControlBBs;
6736 Loop->collectControlBlocks(oldControlBBs);
6737
6738 // Collect original trip counts
6739 SmallVector<Value *> origTripCounts;
6740 for (CanonicalLoopInfo *L : Loops) {
6741 assert(L->isValid() && "All input loops must be valid canonical loops");
6742 origTripCounts.push_back(L->getTripCount());
6743 }
6744
6745 Builder.SetCurrentDebugLocation(DL);
6746
6747 // Compute max trip count.
6748 // The fused loop will be from 0 to max(origTripCounts)
6749 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6750 F, firstLoop->getHeader());
6751 Builder.SetInsertPoint(TCBlock);
6752 Value *fusedTripCount = nullptr;
6753 for (CanonicalLoopInfo *L : Loops) {
6754 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6755 Value *origTripCount = L->getTripCount();
6756 if (!fusedTripCount) {
6757 fusedTripCount = origTripCount;
6758 continue;
6759 }
 // NOTE(review): max is taken with a SIGNED compare; assumes trip counts
 // fit the signed range of the IV type — confirm against the callers.
6760 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6761 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6762 ".omp.fuse.tc");
6763 }
6764
6765 // Generate new loop
6766 CanonicalLoopInfo *fused =
6767 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6768 lastLoop->getLatch(), "fused");
6769
6770 // Replace original loops with the fused loop
6771 // Preheader and After are not considered inside the CLI.
6772 // These are used to compute the individual TCs of the loops
6773 // so they have to be put before the resulting fused loop.
6774 // Moving them up for readability.
6775 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6776 Loops[i]->getPreheader()->moveBefore(TCBlock);
6777 Loops[i]->getAfter()->moveBefore(TCBlock);
6778 }
6779 lastLoop->getPreheader()->moveBefore(TCBlock);
6780
 // Chain preheader -> after -> next preheader so any trip-count computation
 // hosted in those blocks still executes, in order, before the fused loop.
6781 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6782 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6783 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6784 }
6785 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6786 redirectTo(TCBlock, fused->getPreheader(), DL);
6787 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6788
6789 // Build the fused body
6790 // Create new Blocks with conditions that jump to the original loop bodies
6792 SmallVector<Value *> condValues;
6793 for (size_t i = 0; i < Loops.size(); ++i) {
6794 BasicBlock *condBlock = BasicBlock::Create(
6795 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
6796 Builder.SetInsertPoint(condBlock);
 // Guard: run body i only while the fused IV is below loop i's trip count.
6797 Value *condValue =
6798 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
6799 condBBs.push_back(condBlock);
6800 condValues.push_back(condValue);
6801 }
6802 // Join the condition blocks with the bodies of the original loops
6803 redirectTo(fused->getBody(), condBBs[0], DL);
6804 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6805 Builder.SetInsertPoint(condBBs[i]);
6806 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
6807 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
6808 // Replace the IV with the fused IV
6809 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6810 }
6811 // Last body jumps to the created end body block
6812 Builder.SetInsertPoint(condBBs.back());
6813 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
6814 fused->getLatch());
6815 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
6816 // Replace the IV with the fused IV
6817 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6818
6819 // The loop latch must have only one predecessor. Currently it is branched to
6820 // from both the last condition block and the last loop body
6821 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
6822 "omp.fused.pre_latch");
6823
6824 // Remove unused parts
6825 removeUnusedBlocksFromParent(oldControlBBs);
6826
6827 // Invalidate old CLIs
6828 for (CanonicalLoopInfo *L : Loops)
6829 L->invalidate();
6830
6831#ifndef NDEBUG
6832 fused->assertOK();
6833#endif
6834 return fused;
6835}
6836
 // Request that LoopUnrollPass fully unrolls this loop by attaching the
 // "llvm.loop.unroll.enable" + "llvm.loop.unroll.full" loop metadata; no IR
 // transformation happens here.
6838 LLVMContext &Ctx = Builder.getContext();
6840 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6841 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6842}
6843
 // Only enable unrolling ("llvm.loop.unroll.enable") without pinning a
 // factor, leaving the actual unroll decision to LoopUnrollPass heuristics.
6845 LLVMContext &Ctx = Builder.getContext();
6847 Loop, {
6848 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6849 });
6850}
6851
/// Create an if-version of the loop: one shared loop skeleton whose body
/// branches on \p IfCond between the original (SIMD) body and a cloned
/// (non-SIMD) copy, so the CanonicalLoopInfo keeps describing a single loop.
6852void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6853 Value *IfCond, ValueToValueMapTy &VMap,
6854 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6855 const Twine &NamePrefix) {
6856 Function *F = CanonicalLoop->getFunction();
6857
6858 // We can't do
6859 // if (cond) {
6860 // simd_loop;
6861 // } else {
6862 // non_simd_loop;
6863 // }
6864 // because then the CanonicalLoopInfo would only point to one of the loops:
6865 // leading to other constructs operating on the same loop to malfunction.
6866 // Instead generate
6867 // while (...) {
6868 // if (cond) {
6869 // simd_body;
6870 // } else {
6871 // not_simd_body;
6872 // }
6873 // }
6874 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6875 // body at -O3
6876
6877 // Define where if branch should be inserted
6878 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6879
6880 // Create additional blocks for the if statement
6881 BasicBlock *Cond = SplitBeforeIt->getParent();
6882 llvm::LLVMContext &C = Cond->getContext();
6884 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6886 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6887
6888 // Create if condition branch.
6889 Builder.SetInsertPoint(SplitBeforeIt);
6890 Instruction *BrInstr =
6891 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6892 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6893 // Then block contains branch to omp loop body which needs to be vectorized
6894 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6895 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6896
6897 Builder.SetInsertPoint(ElseBlock);
6898
6899 // Clone loop for the else branch
6901
6902 SmallVector<BasicBlock *, 8> ExistingBlocks;
6903 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6904 ExistingBlocks.push_back(ThenBlock);
6905 ExistingBlocks.append(L->block_begin(), L->block_end());
6906 // Cond is the block that has the if clause condition
6907 // LoopCond is omp_loop.cond
6908 // LoopHeader is omp_loop.header
6909 BasicBlock *LoopCond = Cond->getUniquePredecessor();
 // NOTE(review): LoopCond is dereferenced on the next line BEFORE the
 // assert below checks it; the assert cannot catch a null predecessor.
6910 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6911 assert(LoopCond && LoopHeader && "Invalid loop structure");
6912 for (BasicBlock *Block : ExistingBlocks) {
 // Skip the loop's control blocks — only body blocks are cloned; control
 // flow is shared between the 'then' and 'else' versions.
6913 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6914 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6915 continue;
6916 }
6917 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6918
6919 // fix name not to be omp.if.then
6920 if (Block == ThenBlock)
6921 NewBB->setName(NamePrefix + ".if.else");
6922
6923 NewBB->moveBefore(CanonicalLoop->getExit());
6924 VMap[Block] = NewBB;
6925 NewBlocks.push_back(NewBB);
6926 }
6927 remapInstructionsInBlocks(NewBlocks, VMap);
6928 Builder.CreateBr(NewBlocks.front());
6929
6930 // The loop latch must have only one predecessor. Currently it is branched to
6931 // from both the 'then' and 'else' branches.
6932 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
6933 NamePrefix + ".pre_latch");
6934
6935 // Ensure that the then block is added to the loop so we add the attributes in
6936 // the next step
6937 L->addBasicBlockToLoop(ThenBlock, LI);
6938}
6939
/// Map the target to a default SIMD width: on x86, 512/256/128 depending on
/// AVX-512/AVX feature availability; 128 on PPC and WebAssembly; 0 when no
/// default is known for the target. (Values match the widest vector register
/// width in bits — presumably used as the default simd alignment; confirm
/// against callers.)
6940unsigned
6942 const StringMap<bool> &Features) {
6943 if (TargetTriple.isX86()) {
6944 if (Features.lookup("avx512f"))
6945 return 512;
6946 else if (Features.lookup("avx"))
6947 return 256;
 // SSE baseline for x86.
6948 return 128;
6949 }
6950 if (TargetTriple.isPPC())
6951 return 128;
6952 if (TargetTriple.isWasm())
6953 return 128;
 // Unknown target: no default.
6954 return 0;
6955}
6956
 // Apply 'omp simd' semantics to the canonical loop: emit alignment
 // assumptions for aligned vars, optionally if-version the loop, and attach
 // access-group / vectorization loop metadata.
6958 MapVector<Value *, Value *> AlignedVars,
6959 Value *IfCond, OrderKind Order,
6960 ConstantInt *Simdlen, ConstantInt *Safelen) {
6961 LLVMContext &Ctx = Builder.getContext();
6962
6963 Function *F = CanonicalLoop->getFunction();
6964
6965 // Blocks must have terminators.
6966 // FIXME: Don't run analyses on incomplete/invalid IR.
 // Temporarily terminate open blocks with 'unreachable' so LoopAnalysis can
 // run; these placeholders are erased again below.
6968 for (BasicBlock &BB : *F)
6969 if (!BB.hasTerminator())
6970 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
6971
6972 // TODO: We should not rely on pass manager. Currently we use pass manager
6973 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6974 // object. We should have a method which returns all blocks between
6975 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6977 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6978 FAM.registerPass([]() { return LoopAnalysis(); });
6979 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6980
6981 LoopAnalysis LIA;
6982 LoopInfo &&LI = LIA.run(*F, FAM);
6983
6984 for (Instruction *I : UIs)
6985 I->eraseFromParent();
6986
6987 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6988 if (AlignedVars.size()) {
6989 InsertPointTy IP = Builder.saveIP();
6990 for (auto &AlignedItem : AlignedVars) {
6991 Value *AlignedPtr = AlignedItem.first;
6992 Value *Alignment = AlignedItem.second;
 // NOTE(review): dyn_cast may return null if AlignedPtr is not an
 // Instruction, yet loadInst is dereferenced unconditionally on the next
 // line — presumably callers only pass instruction-defined pointers;
 // confirm.
6993 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
6994 Builder.SetInsertPoint(loadInst->getNextNode());
6995 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6996 Alignment);
6997 }
6998 Builder.restoreIP(IP);
6999 }
7000
7001 if (IfCond) {
7002 ValueToValueMapTy VMap;
7003 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
7004 }
7005
7007
7008 // Get the basic blocks from the loop in which memref instructions
7009 // can be found.
7010 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
7011 // preferably without running any passes.
7012 for (BasicBlock *Block : L->getBlocks()) {
7013 if (Block == CanonicalLoop->getCond() ||
7014 Block == CanonicalLoop->getHeader())
7015 continue;
7016 Reachable.insert(Block);
7017 }
7018
7019 SmallVector<Metadata *> LoopMDList;
7020
7021 // In presence of finite 'safelen', it may be unsafe to mark all
7022 // the memory instructions parallel, because loop-carried
7023 // dependences of 'safelen' iterations are possible.
7024 // If clause order(concurrent) is specified then the memory instructions
7025 // are marked parallel even if 'safelen' is finite.
7026 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
7027 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
7028
7029 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
7030 // versions so we can't add the loop attributes in that case.
7031 if (IfCond) {
7032 // we can still add llvm.loop.parallel_access
7033 addLoopMetadata(CanonicalLoop, LoopMDList);
7034 return;
7035 }
7036
7037 // Use the above access group metadata to create loop level
7038 // metadata, which should be distinct for each loop.
7039 ConstantAsMetadata *BoolConst =
7041 LoopMDList.push_back(MDNode::get(
7042 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7043
7044 if (Simdlen || Safelen) {
7045 // If both simdlen and safelen clauses are specified, the value of the
7046 // simdlen parameter must be less than or equal to the value of the safelen
7047 // parameter. Therefore, use safelen only in the absence of simdlen.
7048 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7049 LoopMDList.push_back(
7050 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7051 ConstantAsMetadata::get(VectorizeWidth)}));
7052 }
7053
7054 addLoopMetadata(CanonicalLoop, LoopMDList);
7055}
7056
7057/// Create the TargetMachine object to query the backend for optimization
7058/// preferences.
7059///
7060/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7061/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7062/// needed for the LLVM pass pipline. We use some default options to avoid
7063/// having to pass too many settings from the frontend that probably do not
7064/// matter.
7065///
7066/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7067/// method. If we are going to use TargetMachine for more purposes, especially
7068/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7069/// might become be worth requiring front-ends to pass on their TargetMachine,
7070/// or at least cache it between methods. Note that while fontends such as Clang
7071/// have just a single main TargetMachine per translation unit, "target-cpu" and
7072/// "target-features" that determine the TargetMachine are per-function and can
7073/// be overrided using __attribute__((target("OPTIONS"))).
7074static std::unique_ptr<TargetMachine>
7076 Module *M = F->getParent();
7077
 // CPU/feature strings are per-function attributes (see the comment above);
 // the triple comes from the module.
7078 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7079 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7080 const llvm::Triple &Triple = M->getTargetTriple();
7081
7082 std::string Error;
7084 if (!TheTarget)
 // No backend registered for this triple: caller must cope with a null
 // TargetMachine.
7085 return {};
7086
7088 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7089 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7090 /*CodeModel=*/std::nullopt, OptLevel));
7091}
7092
7093/// Heuristically determine the best-performant unroll factor for \p CLI. This
7094/// depends on the target processor. We are re-using the same heuristics as the
7095/// LoopUnrollPass.
7097 Function *F = CLI->getFunction();
7098
7099 // Assume the user requests the most aggressive unrolling, even if the rest of
7100 // the code is optimized using a lower setting.
7102 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7103
7104 // Blocks must have terminators.
7105 // FIXME: Don't run analyses on incomplete/invalid IR.
 // Temporary 'unreachable' terminators let the analyses below run; they are
 // erased again once the analyses have been computed.
7107 for (BasicBlock &BB : *F)
7108 if (!BB.hasTerminator())
7109 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7110
7112 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7113 FAM.registerPass([]() { return AssumptionAnalysis(); });
7114 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7115 FAM.registerPass([]() { return LoopAnalysis(); });
7116 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7117 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7118 TargetIRAnalysis TIRA;
 // Only use target-specific cost info when a TargetMachine could be created
 // (createTargetMachine returns null for unregistered triples).
7119 if (TM)
7120 TIRA = TargetIRAnalysis(
7121 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7122 FAM.registerPass([&]() { return TIRA; });
7123
7124 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7126 ScalarEvolution &&SE = SEA.run(*F, FAM);
7128 DominatorTree &&DT = DTA.run(*F, FAM);
7129 LoopAnalysis LIA;
7130 LoopInfo &&LI = LIA.run(*F, FAM);
7132 AssumptionCache &&AC = ACT.run(*F, FAM);
7134
7135 for (Instruction *I : UIs)
7136 I->eraseFromParent();
7137
7138 Loop *L = LI.getLoopFor(CLI->getHeader());
7139 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7140
 // Seed LoopUnrollPass's UnrollingPreferences the same way the pass itself
 // would, allowing partial and runtime unrolling.
7142 L, SE, TTI,
7143 /*BlockFrequencyInfo=*/nullptr,
7144 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7145 /*UserThreshold=*/std::nullopt,
7146 /*UserCount=*/std::nullopt,
7147 /*UserAllowPartial=*/true,
7148 /*UserAllowRuntime=*/true,
7149 /*UserUpperBound=*/std::nullopt,
7150 /*UserFullUnrollMaxCount=*/std::nullopt);
7151
7152 UP.Force = true;
7153
7154 // Account for additional optimizations taking place before the LoopUnrollPass
7155 // would unroll the loop.
7158
7159 // Use normal unroll factors even if the rest of the code is optimized for
7160 // size.
7163
7164 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7165 << " Threshold=" << UP.Threshold << "\n"
7166 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7167 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7168 << " PartialOptSizeThreshold="
7169 << UP.PartialOptSizeThreshold << "\n");
7170
7171 // Disable peeling.
7174 /*UserAllowPeeling=*/false,
7175 /*UserAllowProfileBasedPeeling=*/false,
7176 /*UnrollingSpecficValues=*/false);
7177
7179 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7180
7181 // Assume that reads and writes to stack variables can be eliminated by
7182 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7183 // size.
7184 for (BasicBlock *BB : L->blocks()) {
7185 for (Instruction &I : *BB) {
7186 Value *Ptr;
7187 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7188 Ptr = Load->getPointerOperand();
7189 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7190 Ptr = Store->getPointerOperand();
7191 } else
7192 continue;
7193
7194 Ptr = Ptr->stripPointerCasts();
7195
 // Only loads/stores of entry-block allocas are treated as eliminable.
7196 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7197 if (Alloca->getParent() == &F->getEntryBlock())
7198 EphValues.insert(&I);
7199 }
7200 }
7201 }
7202
7203 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7204
7205 // Loop is not unrollable if the loop contains certain instructions.
7206 if (!UCE.canUnroll()) {
7207 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7208 return 1;
7209 }
7210
7211 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7212 << "\n");
7213
7214 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7215 // be able to use it.
7216 int TripCount = 0;
7217 int MaxTripCount = 0;
7218 bool MaxOrZero = false;
7219 unsigned TripMultiple = 0;
7220
7221 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7222 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7223 unsigned Factor = UP.Count;
7224 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7225
7226 // This function returns 1 to signal to not unroll a loop.
7227 if (Factor == 0)
7228 return 1;
7229 return Factor;
7230}
7231
 // Partially unroll the loop by Factor. Factor == 0 means "pick
 // heuristically"; Factor == 1 is a no-op. When the caller needs the
 // resulting loop (UnrolledCLI), the loop is tiled by Factor and the inner
 // tile loop is marked for full unrolling; otherwise only metadata is added.
7233 int32_t Factor,
7234 CanonicalLoopInfo **UnrolledCLI) {
7235 assert(Factor >= 0 && "Unroll factor must not be negative");
7236
7237 Function *F = Loop->getFunction();
7238 LLVMContext &Ctx = F->getContext();
7239
7240 // If the unrolled loop is not used for another loop-associated directive, it
7241 // is sufficient to add metadata for the LoopUnrollPass.
7242 if (!UnrolledCLI) {
7243 SmallVector<Metadata *, 2> LoopMetadata;
7244 LoopMetadata.push_back(
7245 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7246
7247 if (Factor >= 1) {
7249 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7250 LoopMetadata.push_back(MDNode::get(
7251 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7252 }
7253
7254 addLoopMetadata(Loop, LoopMetadata);
7255 return;
7256 }
7257
7258 // Heuristically determine the unroll factor.
7259 if (Factor == 0)
7261
7262 // No change required with unroll factor 1.
7263 if (Factor == 1) {
7264 *UnrolledCLI = Loop;
7265 return;
7266 }
7267
7268 assert(Factor >= 2 &&
7269 "unrolling only makes sense with a factor of 2 or larger");
7270
7271 Type *IndVarTy = Loop->getIndVarType();
7272
7273 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7274 // unroll the inner loop.
7275 Value *FactorVal =
7276 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7277 /*isSigned=*/false));
7278 std::vector<CanonicalLoopInfo *> LoopNest =
7279 tileLoops(DL, {Loop}, {FactorVal});
7280 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
 // The outer (floor) loop is what the caller continues to operate on.
7281 *UnrolledCLI = LoopNest[0];
7282 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7283
7284 // LoopUnrollPass can only fully unroll loops with constant trip count.
7285 // Unroll by the unroll factor with a fallback epilog for the remainder
7286 // iterations if necessary.
7288 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7290 InnerLoop,
7291 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7293 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7294
7295#ifndef NDEBUG
7296 (*UnrolledCLI)->assertOK();
7297#endif
7298}
7299
 // Emit a call to __kmpc_copyprivate(loc, tid, BufSize, CpyBuf, CpyFn,
 // DidIt) broadcasting the single thread's data to the team.
7302 llvm::Value *BufSize, llvm::Value *CpyBuf,
7303 llvm::Value *CpyFn, llvm::Value *DidIt) {
7304 if (!updateToLocation(Loc))
7305 return Loc.IP;
7306
7307 uint32_t SrcLocStrSize;
7308 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7309 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7310 Value *ThreadId = getOrCreateThreadID(Ident);
7311
 // The runtime takes the flag's value (i32), not its address.
7312 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7313
7314 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7315
7316 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7317 createRuntimeFunctionCall(Fn, Args);
7318
7319 return Builder.saveIP();
7320}
7321
 // Emit an 'omp single' region guarded by __kmpc_single/__kmpc_end_single.
 // With copyprivate vars the winning thread broadcasts them via
 // __kmpc_copyprivate; otherwise a trailing barrier is emitted unless
 // 'nowait' was specified.
7323 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7324 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7326
7327 if (!updateToLocation(Loc))
7328 return Loc.IP;
7329
7330 // If needed allocate and initialize `DidIt` with 0.
7331 // DidIt: flag variable: 1=single thread; 0=not single thread.
7332 llvm::Value *DidIt = nullptr;
7333 if (!CPVars.empty()) {
7334 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7335 Builder.CreateStore(Builder.getInt32(0), DidIt);
7336 }
7337
7338 Directive OMPD = Directive::OMPD_single;
7339 uint32_t SrcLocStrSize;
7340 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7341 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7342 Value *ThreadId = getOrCreateThreadID(Ident);
7343 Value *Args[] = {Ident, ThreadId};
7344
7345 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7346 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7347
7348 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7349 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7350
7351 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7352 if (Error Err = FiniCB(IP))
7353 return Err;
7354
7355 // The thread that executes the single region must set `DidIt` to 1.
7356 // This is used by __kmpc_copyprivate, to know if the caller is the
7357 // single thread or not.
7358 if (DidIt)
7359 Builder.CreateStore(Builder.getInt32(1), DidIt);
7360
7361 return Error::success();
7362 };
7363
7364 // generates the following:
7365 // if (__kmpc_single()) {
7366 // .... single region ...
7367 // __kmpc_end_single
7368 // }
7369 // __kmpc_copyprivate
7370 // __kmpc_barrier
7371
7372 InsertPointOrErrorTy AfterIP =
7373 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7374 /*Conditional*/ true,
7375 /*hasFinalize*/ true);
7376 if (!AfterIP)
7377 return AfterIP.takeError();
7378
7379 if (DidIt) {
7380 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7381 // NOTE BufSize is currently unused, so just pass 0.
7383 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7384 CPFuncs[I], DidIt);
7385 // NOTE __kmpc_copyprivate already inserts a barrier
7386 } else if (!IsNowait) {
 // This inner AfterIP intentionally shadows the outer one; only its error
 // state is consumed here.
7387 InsertPointOrErrorTy AfterIP =
7389 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7390 /* CheckCancelFlag */ false);
7391 if (!AfterIP)
7392 return AfterIP.takeError();
7393 }
7394 return Builder.saveIP();
7395}
7396
 // Emit an 'omp critical' region: __kmpc_critical (or
 // __kmpc_critical_with_hint when a hint is given) around the body, paired
 // with __kmpc_end_critical, using a per-name global lock.
7398 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7399 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7400
7401 if (!updateToLocation(Loc))
7402 return Loc.IP;
7403
7404 Directive OMPD = Directive::OMPD_critical;
7405 uint32_t SrcLocStrSize;
7406 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7407 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7408 Value *ThreadId = getOrCreateThreadID(Ident);
 // Critical regions with the same name share a lock; the lock variable is
 // keyed by CriticalName.
7409 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7410 Value *Args[] = {Ident, ThreadId, LockVar};
7411
7412 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7413 Function *RTFn = nullptr;
7414 if (HintInst) {
7415 // Add Hint to entry Args and create call
7416 EnterArgs.push_back(HintInst);
7417 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7418 } else {
7419 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7420 }
7421 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7422
 // The exit call never takes the hint, only the original three args.
7423 Function *ExitRTLFn =
7424 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7425 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7426
7427 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7428 /*Conditional*/ false, /*hasFinalize*/ true);
7429}
7430
 // Emit 'ordered depend(source/sink)' support: materialize the iteration
 // vector (one i64 per loop) into a stack array and pass its base address to
 // __kmpc_doacross_post (source) or __kmpc_doacross_wait (sink).
7433 InsertPointTy AllocaIP, unsigned NumLoops,
7434 ArrayRef<llvm::Value *> StoreValues,
7435 const Twine &Name, bool IsDependSource) {
7436 assert(
7437 llvm::all_of(StoreValues,
7438 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7439 "OpenMP runtime requires depend vec with i64 type");
7440
7441 if (!updateToLocation(Loc))
7442 return Loc.IP;
7443
7444 // Allocate space for vector and generate alloc instruction.
 // The alloca goes at AllocaIP (function entry), not at the current
 // insertion point, so it is not re-allocated on every region entry.
7445 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7446 Builder.restoreIP(AllocaIP);
7447 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7448 ArgsBase->setAlignment(Align(8));
7450
7451 // Store the index value with offset in depend vector.
7452 for (unsigned I = 0; I < NumLoops; ++I) {
7453 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7454 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7455 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7456 STInst->setAlignment(Align(8));
7457 }
7458
 // The runtime receives a pointer to element 0 of the vector.
7459 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7460 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7461
7462 uint32_t SrcLocStrSize;
7463 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7464 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7465 Value *ThreadId = getOrCreateThreadID(Ident);
7466 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7467
7468 Function *RTLFn = nullptr;
7469 if (IsDependSource)
7470 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7471 else
7472 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7473 createRuntimeFunctionCall(RTLFn, Args);
7474
7475 return Builder.saveIP();
7476}
7477
// Emits an OpenMP 'ordered' region. With IsThreads set, the body is bracketed
// by __kmpc_ordered / __kmpc_end_ordered; otherwise ('ordered simd') no
// runtime calls are emitted and only the region scaffolding is built.
// NOTE(review): the function signature line is elided in this listing;
// presumably OpenMPIRBuilder::createOrderedThreadsSimd — confirm upstream.
7479 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7480 FinalizeCallbackTy FiniCB, bool IsThreads) {
7481 if (!updateToLocation(Loc))
7482 return Loc.IP;
7483
// Entry/exit calls stay null for the simd variant; EmitOMPInlinedRegion
// tolerates null calls and then emits just the region structure.
7484 Directive OMPD = Directive::OMPD_ordered;
7485 Instruction *EntryCall = nullptr;
7486 Instruction *ExitCall = nullptr;
7487
7488 if (IsThreads) {
7489 uint32_t SrcLocStrSize;
7490 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7491 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7492 Value *ThreadId = getOrCreateThreadID(Ident);
7493 Value *Args[] = {Ident, ThreadId};
7494
7495 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7496 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7497
7498 Function *ExitRTLFn =
7499 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7500 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7501 }
7502
7503 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7504 /*Conditional*/ false, /*hasFinalize*/ true);
7505}
7506
// Emits an inlined (non-outlined) OpenMP region: entry/body/finalize/exit
// blocks, optional conditional entry, body generation via callback, then
// finalization and the exit runtime call.
// NOTE(review): two interior lines (original 7519 and 7550) are elided in this
// listing — they appear to be the conditions guarding the UnreachableInst
// creation and the SplitPos erasure; confirm against the real source.
7507OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7508 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7509 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7510 bool HasFinalize, bool IsCancellable) {
7511
// Push the finalization callback so cancellation points inside the body can
// find it; popped again by emitCommonDirectiveExit.
7512 if (HasFinalize)
7513 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7514
7515 // Create inlined region's entry and body blocks, in preparation
7516 // for conditional creation
7517 BasicBlock *EntryBB = Builder.GetInsertBlock();
7518 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
7520 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
// Split twice: EntryBB -> FiniBB -> ExitBB; the body is emitted between
// EntryBB's terminator and FiniBB.
7521 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7522 BasicBlock *FiniBB =
7523 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7524
7525 Builder.SetInsertPoint(EntryBB->getTerminator());
7526 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7527
7528 // generate body
7529 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7530 /* CodeGenIP */ Builder.saveIP()))
7531 return Err;
7532
7533 // emit exit call and do any needed finalization.
7534 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7535 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7536 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7537 "Unexpected control flow graph state!!");
7538 InsertPointOrErrorTy AfterIP =
7539 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7540 if (!AfterIP)
7541 return AfterIP.takeError();
7542
7543 // If we are skipping the region of a non conditional, remove the exit
7544 // block, and clear the builder's insertion point.
7545 assert(SplitPos->getParent() == ExitBB &&
7546 "Unexpected Insertion point location!");
// Fold ExitBB back into its predecessor when possible so no empty block
// remains; continue building in whichever block survived.
7547 auto merged = MergeBlockIntoPredecessor(ExitBB);
7548 BasicBlock *ExitPredBB = SplitPos->getParent();
7549 auto InsertBB = merged ? ExitPredBB : ExitBB;
7551 SplitPos->eraseFromParent();
7552 Builder.SetInsertPoint(InsertBB);
7553
7554 return Builder.saveIP();
7555}
7556
// Emits the (optionally conditional) entry of a directive region. When
// Conditional is set and an entry call exists, the region body only executes
// if the entry call's result is non-null/non-zero; otherwise control branches
// straight to ExitBB. Returns an insertion point at ExitBB.
7557OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7558 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7559 // if nothing to do, Return current insertion point.
7560 if (!Conditional || !EntryCall)
7561 return Builder.saveIP();
7562
7563 BasicBlock *EntryBB = Builder.GetInsertBlock();
// Truthiness of the runtime entry call decides whether the body runs.
7564 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
// ThenBB is created detached with a placeholder unreachable terminator that
// is replaced below.
7565 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7566 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7567
7568 // Emit thenBB and set the Builder's insertion point there for
7569 // body generation next. Place the block after the current block.
7570 Function *CurFn = EntryBB->getParent();
7571 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7572
7573 // Move Entry branch to end of ThenBB, and replace with conditional
7574 // branch (If-stmt)
7575 Instruction *EntryBBTI = EntryBB->getTerminator();
7576 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
// EntryBB's old terminator becomes ThenBB's terminator (replacing the
// placeholder unreachable), preserving the original fall-through edge.
7577 EntryBBTI->removeFromParent();
7578 Builder.SetInsertPoint(UI);
7579 Builder.Insert(EntryBBTI);
7580 UI->eraseFromParent();
7581 Builder.SetInsertPoint(ThenBB->getTerminator());
7582
7583 // return an insertion point to ExitBB.
7584 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7585}
7586
// Emits the exit of a directive region: runs the finalization callback popped
// from FinalizationStack (when HasFinalize), then re-inserts the pre-built
// exit runtime call as the last instruction before the finalization block's
// terminator. Returns an insertion point at the exit call.
7587OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7588 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7589 bool HasFinalize) {
7590
7591 Builder.restoreIP(FinIP);
7592
7593 // If there is finalization to do, emit it before the exit call
7594 if (HasFinalize) {
7595 assert(!FinalizationStack.empty() &&
7596 "Unexpected finalization stack state!");
7597
// Pop the entry pushed by EmitOMPInlinedRegion and check it matches this
// directive — mismatches indicate unbalanced push/pop.
7598 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7599 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7600
7601 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7602 return std::move(Err);
7603
7604 // Exit condition: insertion point is before the terminator of the new Fini
7605 // block
7606 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7607 }
7608
// Regions without an exit runtime call (e.g. 'ordered simd') are done here.
7609 if (!ExitCall)
7610 return Builder.saveIP();
7611
7612 // place the Exitcall as last instruction before Finalization block terminator
// ExitCall was created earlier at a provisional position; detach and
// re-insert it at the current point.
7613 ExitCall->removeFromParent();
7614 Builder.Insert(ExitCall);
7615
7616 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7617 ExitCall->getIterator());
7618}
7619
// Builds the CFG skeleton for a 'copyin' clause: compares the master and
// private addresses and branches to a "not master" copy block only when they
// differ. Returns an insertion point inside the copy block (or after its
// branch to the end block when BranchtoEnd is set).
// NOTE(review): the function signature line, one line after 'return IP;', and
// the 'if' guarding the splitBasicBlock call are elided in this listing;
// confirm against the real source before relying on the exact control flow.
7621 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7622 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7623 if (!IP.isSet())
7624 return IP;
7627
7628 // creates the following CFG structure
7629 // OMP_Entry : (MasterAddr != PrivateAddr)?
7630 // F T
7631 // | \
7632 // | copin.not.master
7633 // | /
7634 // v /
7635 // copyin.not.master.end
7636 // |
7637 // v
7638 // OMP.Entry.Next
7639
7640 BasicBlock *OMP_Entry = IP.getBlock();
7641 Function *CurFn = OMP_Entry->getParent();
7642 BasicBlock *CopyBegin =
7643 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7644 BasicBlock *CopyEnd = nullptr;
7645
7646 // If entry block is terminated, split to preserve the branch to following
7647 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7649 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7650 "copyin.not.master.end");
7651 OMP_Entry->getTerminator()->eraseFromParent();
7652 } else {
7653 CopyEnd =
7654 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7655 }
7656
// Pointer identity is tested on integer values of the two addresses.
7657 Builder.SetInsertPoint(OMP_Entry);
7658 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7659 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7660 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7661 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7662
7663 Builder.SetInsertPoint(CopyBegin);
7664 if (BranchtoEnd)
// Leave the insertion point just before the branch so the caller's copy
// code lands inside CopyBegin.
7665 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7666
7667 return Builder.saveIP();
7668}
7669
// Emits a call to __kmpc_alloc(thread_id, size, allocator) and returns the
// resulting call instruction (null if Loc is unusable).
// NOTE(review): the function signature line is elided in this listing;
// presumably OpenMPIRBuilder::createOMPAlloc.
7671 Value *Size, Value *Allocator,
7672 std::string Name) {
7674 if (!updateToLocation(Loc))
7675 return nullptr;
7676
// Ident is only needed here to derive the global thread id; it is not an
// argument of __kmpc_alloc itself.
7677 uint32_t SrcLocStrSize;
7678 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7679 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7680 Value *ThreadId = getOrCreateThreadID(Ident);
7681 Value *Args[] = {ThreadId, Size, Allocator};
7682
7683 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7684
7685 return createRuntimeFunctionCall(Fn, Args, Name);
7686}
7687
// Emits a call to __kmpc_aligned_alloc(thread_id, align, size, allocator) and
// returns the call instruction (null if Loc is unusable).
// NOTE(review): the function signature line is elided in this listing;
// presumably OpenMPIRBuilder::createOMPAlignedAlloc.
7689 Value *Align, Value *Size,
7690 Value *Allocator,
7691 std::string Name) {
7693 if (!updateToLocation(Loc))
7694 return nullptr;
7695
// Ident is only needed to derive the global thread id (not passed to the
// runtime call).
7696 uint32_t SrcLocStrSize;
7697 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7698 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7699 Value *ThreadId = getOrCreateThreadID(Ident);
7700 Value *Args[] = {ThreadId, Align, Size, Allocator};
7701
7702 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_aligned_alloc);
7703
// NOTE(review): siblings (createOMPAlloc/createOMPFree) route through
// createRuntimeFunctionCall; this one uses Builder.CreateCall directly —
// verify whether that is intentional or an inconsistency.
7704 return Builder.CreateCall(Fn, Args, Name);
7705}
7706
// Emits a call to __kmpc_free(thread_id, addr, allocator), releasing memory
// previously obtained via __kmpc_alloc. Returns the call (null if Loc is
// unusable). NOTE(review): signature line elided in this listing; presumably
// OpenMPIRBuilder::createOMPFree.
7708 Value *Addr, Value *Allocator,
7709 std::string Name) {
7711 if (!updateToLocation(Loc))
7712 return nullptr;
7713
7714 uint32_t SrcLocStrSize;
7715 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7716 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7717 Value *ThreadId = getOrCreateThreadID(Ident);
7718 Value *Args[] = {ThreadId, Addr, Allocator};
7719 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7720 return createRuntimeFunctionCall(Fn, Args, Name);
7721}
7722
// Emits a call to __tgt_interop_init for the 'interop init' construct.
// Null Device defaults to -1 (current device); a null dependence count
// defaults both the count to 0 and the dependence address to null.
// NOTE(review): the signature line and one interior line are elided in this
// listing; presumably OpenMPIRBuilder::createOMPInteropInit.
7724 const LocationDescription &Loc, Value *InteropVar,
7725 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7726 Value *DependenceAddress, bool HaveNowaitClause) {
7729
7730 uint32_t SrcLocStrSize;
7731 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7732 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7733 Value *ThreadId = getOrCreateThreadID(Ident);
7734 if (Device == nullptr)
7735 Device = Constant::getAllOnesValue(Int32);
7736 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7737 if (NumDependences == nullptr) {
7738 NumDependences = ConstantInt::get(Int32, 0);
7739 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7740 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7741 }
7742 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7743 Value *Args[] = {
7744 Ident, ThreadId, InteropVar, InteropTypeVal,
7745 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7746
7747 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7748
7749 return createRuntimeFunctionCall(Fn, Args);
7750}
7751
// Emits a call to __tgt_interop_destroy for the 'interop destroy' construct.
// Defaulting of Device / NumDependences / DependenceAddress mirrors
// createOMPInteropInit. NOTE(review): signature line and one interior line
// elided in this listing.
7753 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7754 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7757
7758 uint32_t SrcLocStrSize;
7759 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7760 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7761 Value *ThreadId = getOrCreateThreadID(Ident);
7762 if (Device == nullptr)
7763 Device = Constant::getAllOnesValue(Int32);
7764 if (NumDependences == nullptr) {
7765 NumDependences = ConstantInt::get(Int32, 0);
7766 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7767 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7768 }
7769 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7770 Value *Args[] = {
7771 Ident, ThreadId, InteropVar, Device,
7772 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7773
7774 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7775
7776 return createRuntimeFunctionCall(Fn, Args);
7777}
7778
// Emits a call to __tgt_interop_use for the 'interop use' construct.
// Argument defaulting mirrors createOMPInteropInit/Destroy. NOTE(review):
// signature line and one interior line elided in this listing.
7780 Value *InteropVar, Value *Device,
7781 Value *NumDependences,
7782 Value *DependenceAddress,
7783 bool HaveNowaitClause) {
7786 uint32_t SrcLocStrSize;
7787 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7788 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7789 Value *ThreadId = getOrCreateThreadID(Ident);
7790 if (Device == nullptr)
7791 Device = Constant::getAllOnesValue(Int32);
7792 if (NumDependences == nullptr) {
7793 NumDependences = ConstantInt::get(Int32, 0);
7794 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7795 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7796 }
7797 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7798 Value *Args[] = {
7799 Ident, ThreadId, InteropVar, Device,
7800 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7801
7802 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7803
7804 return createRuntimeFunctionCall(Fn, Args);
7805}
7806
// Emits a call to __kmpc_threadprivate_cached, obtaining (and caching in an
// internal module-level variable) the per-thread copy of a threadprivate
// variable. NOTE(review): the signature line and two interior lines are
// elided in this listing; presumably
// OpenMPIRBuilder::createCachedThreadPrivate.
7809 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7812
7813 uint32_t SrcLocStrSize;
7814 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7815 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7816 Value *ThreadId = getOrCreateThreadID(Ident);
// The cache is an internal global (i8**) keyed by Name, shared across calls.
7817 Constant *ThreadPrivateCache =
7818 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7819 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7820
7821 Function *Fn =
7822 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7823
7824 return createRuntimeFunctionCall(Fn, Args);
7825}
7826
// Emits the device-side prologue of a target kernel: builds the per-kernel
// dynamic/configuration/kernel environment globals, writes launch-bound
// attributes/metadata on the kernel, calls __kmpc_target_init, and splits the
// CFG so only "user code" threads continue while others return. Returns an
// insertion point at the start of the user-code block.
// NOTE(review): the signature line (with the Attrs parameter) and a few
// interior lines are elided in this listing; presumably
// OpenMPIRBuilder::createTargetInit.
7828 const LocationDescription &Loc,
7830 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7831 "expected num_threads and num_teams to be specified");
7832
7833 if (!updateToLocation(Loc))
7834 return Loc.IP;
7835
// Scalar configuration constants stored into the kernel environment below.
7836 uint32_t SrcLocStrSize;
7837 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7838 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7839 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
// The generic state machine is only needed for non-SPMD execution.
7840 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7841 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7842 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7843 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7844
7845 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7846 Function *Kernel = DebugKernelWrapper;
7847
7848 // We need to strip the debug prefix to get the correct kernel name.
7849 StringRef KernelName = Kernel->getName();
7850 const std::string DebugPrefix = "_debug__";
7851 if (KernelName.ends_with(DebugPrefix)) {
7852 KernelName = KernelName.drop_back(DebugPrefix.length());
7853 Kernel = M.getFunction(KernelName);
7854 assert(Kernel && "Expected the real kernel to exist");
7855 }
7856
7857 // Manifest the launch configuration in the metadata matching the kernel
7858 // environment.
7859 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7860 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7861
7862 // If MaxThreads not set, select the maximum between the default workgroup
7863 // size and the MinThreads value.
7864 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7865 if (MaxThreadsVal < 0) {
7866 if (hasGridValue(T)) {
7867 MaxThreadsVal =
7868 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
7869 Attrs.MinThreads);
7870 } else {
7871 MaxThreadsVal = Attrs.MinThreads;
7872 }
7873 }
7874
7875 if (MaxThreadsVal > 0)
7876 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7877
7878 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7879 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7880 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7881 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7882 Constant *ReductionDataSize =
7883 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7884 Constant *ReductionBufferLength =
7885 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7886
7888 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7889 const DataLayout &DL = Fn->getDataLayout();
7890
// Per-kernel mutable dynamic environment (currently: debug indentation).
7891 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7892 Constant *DynamicEnvironmentInitializer =
7893 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7894 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7895 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7896 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7897 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7898 DL.getDefaultGlobalsAddressSpace());
7899 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7900
// Address-space-cast the global if its pointer type differs from the type
// expected by the kernel environment struct.
7901 Constant *DynamicEnvironment =
7902 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7903 ? DynamicEnvironmentGV
7904 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7905 DynamicEnvironmentPtr);
7906
7907 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7908 ConfigurationEnvironment, {
7909 UseGenericStateMachineVal,
7910 MayUseNestedParallelismVal,
7911 IsSPMDVal,
7912 MinThreads,
7913 MaxThreads,
7914 MinTeams,
7915 MaxTeams,
7916 ReductionDataSize,
7917 ReductionBufferLength,
7918 });
7919 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7920 KernelEnvironment, {
7921 ConfigurationEnvironmentInitializer,
7922 Ident,
7923 DynamicEnvironment,
7924 });
// The "<kernel>_kernel_environment" name is also looked up later by
// createTargetDeinit to patch reduction sizes.
7925 std::string KernelEnvironmentName =
7926 (KernelName + "_kernel_environment").str();
7927 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7928 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7929 KernelEnvironmentInitializer, KernelEnvironmentName,
7930 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7931 DL.getDefaultGlobalsAddressSpace());
7932 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7933
7934 Constant *KernelEnvironment =
7935 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7936 ? KernelEnvironmentGV
7937 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7938 KernelEnvironmentPtr);
// The launch environment is passed by the host as the kernel's last argument.
7939 Value *KernelLaunchEnvironment =
7940 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
7941 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7942 KernelLaunchEnvironment =
7943 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7944 ? KernelLaunchEnvironment
7945 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7946 KernelLaunchEnvParamTy);
7947 CallInst *ThreadKind = createRuntimeFunctionCall(
7948 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7949
// __kmpc_target_init returns -1 for threads that should run the user code.
7950 Value *ExecUserCode = Builder.CreateICmpEQ(
7951 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7952 "exec_user_code");
7953
7954 // ThreadKind = __kmpc_target_init(...)
7955 // if (ThreadKind == -1)
7956 // user_code
7957 // else
7958 // return;
7959
// Split the current block at a placeholder unreachable, then rewire the
// split edge into a conditional branch to user code vs. worker exit.
7960 auto *UI = Builder.CreateUnreachable();
7961 BasicBlock *CheckBB = UI->getParent();
7962 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7963
7964 BasicBlock *WorkerExitBB = BasicBlock::Create(
7965 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7966 Builder.SetInsertPoint(WorkerExitBB);
7967 Builder.CreateRetVoid();
7968
7969 auto *CheckBBTI = CheckBB->getTerminator();
7970 Builder.SetInsertPoint(CheckBBTI);
7971 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7972
7973 CheckBBTI->eraseFromParent();
7974 UI->eraseFromParent();
7975
7976 // Continue in the "user_code" block, see diagram above and in
7977 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7978 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7979}
7980
// Emits the device-side epilogue of a target kernel: calls
// __kmpc_target_deinit and, when teams-reduction sizes are known, patches the
// previously emitted "<kernel>_kernel_environment" global's configuration
// fields (indices {0,7} and {0,8}) with the reduction data size and buffer
// length. NOTE(review): the signature line and the lines creating/calling the
// deinit runtime function are elided in this listing; presumably
// OpenMPIRBuilder::createTargetDeinit.
7982 int32_t TeamsReductionDataSize,
7983 int32_t TeamsReductionBufferLength) {
7984 if (!updateToLocation(Loc))
7985 return;
7986
7988 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7990
7991
// Nothing to patch when either reduction dimension is zero.
7992 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7993 return;
7994
7995 Function *Kernel = Builder.GetInsertBlock()->getParent();
7996 // We need to strip the debug prefix to get the correct kernel name.
7997 StringRef KernelName = Kernel->getName();
7998 const std::string DebugPrefix = "_debug__";
7999 if (KernelName.ends_with(DebugPrefix))
8000 KernelName = KernelName.drop_back(DebugPrefix.length());
// Locate the environment global emitted by createTargetInit.
8001 auto *KernelEnvironmentGV =
8002 M.getNamedGlobal((KernelName + "_kernel_environment").str());
8003 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
// Rebuild the initializer with the two reduction fields replaced.
8004 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
8005 auto *NewInitializer = ConstantFoldInsertValueInstruction(
8006 KernelEnvironmentInitializer,
8007 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
8008 NewInitializer = ConstantFoldInsertValueInstruction(
8009 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
8010 {0, 8});
8011 KernelEnvironmentGV->setInitializer(NewInitializer);
8012}
8013
8014static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
8015 bool Min) {
8016 if (Kernel.hasFnAttribute(Name)) {
8017 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
8018 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
8019 }
8020 Kernel.addFnAttr(Name, llvm::utostr(Value));
8021}
8022
// Returns the {lower, upper} thread bounds recorded on a kernel, combining
// the generic "omp_target_thread_limit" attribute with target-specific ones:
// AMDGPU's "amdgpu-flat-work-group-size" ("LB,UB" string) or NVPTX's MaxNTID.
// A zero bound means "unknown/unbounded". NOTE(review): the signature line is
// elided in this listing; presumably
// OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &, Function &).
8023std::pair<int32_t, int32_t>
8025 int32_t ThreadLimit =
8026 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
8027
8028 if (T.isAMDGPU()) {
8029 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
8030 if (!Attr.isValid() || !Attr.isStringAttribute())
8031 return {0, ThreadLimit};
// Attribute value has the form "<LB>,<UB>"; fall back gracefully when a
// component does not parse as a base-10 integer.
8032 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
8033 int32_t LB, UB;
8034 if (!llvm::to_integer(UBStr, UB, 10))
8035 return {0, ThreadLimit};
// Clamp the attribute's upper bound by the OpenMP thread limit if set.
8036 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
8037 if (!llvm::to_integer(LBStr, LB, 10))
8038 return {0, UB};
8039 return {LB, UB};
8040 }
8041
8042 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
8043 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
8044 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
8045 }
8046 return {0, ThreadLimit};
8047}
8048
// Records the thread bounds [LB, UB] on a kernel: always sets the generic
// "omp_target_thread_limit" attribute, plus AMDGPU's flat-work-group-size
// attribute on AMD targets. NOTE(review): the signature line and the NVPTX
// branch (original line 8060, presumably an updateNVPTXAttr call) are elided
// in this listing; confirm against the real source.
8050 Function &Kernel, int32_t LB,
8051 int32_t UB) {
8052 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
8053
8054 if (T.isAMDGPU()) {
8055 Kernel.addFnAttr("amdgpu-flat-work-group-size",
8056 llvm::utostr(LB) + "," + llvm::utostr(UB));
8057 return;
8058 }
8059
8061}
8062
// Returns the {lower, upper} team bounds for a kernel; currently only the
// "omp_target_num_teams" attribute is consulted (lower bound reported as 0).
// NOTE(review): the signature line is elided in this listing; presumably
// OpenMPIRBuilder::readTeamBoundsForKernel(const Triple &, Function &).
8063std::pair<int32_t, int32_t>
8065 // TODO: Read from backend annotations if available.
8066 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8067}
8068
// Records team bounds on a kernel: an NVPTX-specific attribute when UB is
// positive (the call on elided line 8073), AMDGPU's max-num-workgroups, and
// the generic "omp_target_num_teams" attribute. NOTE(review): the signature
// line and the NVPTX attribute call are elided in this listing.
8070 int32_t LB, int32_t UB) {
8071 if (T.isNVPTX())
8072 if (UB > 0)
8074 if (T.isAMDGPU())
// Only the X dimension is constrained; Y and Z stay at 1.
8075 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
8076
8077 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8078}
8079
// Applies device-side attributes to an outlined target-region function when
// compiling for a target device (non-DSO-local, plus a target-specific
// adjustment per AMDGCN/NVPTX/SPIRV). NOTE(review): the branch bodies
// (original lines 8083, 8086, 8088, 8090, 8092 — presumably linkage and
// calling-convention assignments) are elided in this listing; confirm against
// the real source.
8080void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8081 Function *OutlinedFn) {
8082 if (Config.isTargetDevice()) {
8084 // TODO: Determine if DSO local can be set to true.
8085 OutlinedFn->setDSOLocal(false);
8087 if (T.isAMDGCN())
8089 else if (T.isNVPTX())
8091 else if (T.isSPIRV())
8093 }
8094}
8095
8096Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8097 StringRef EntryFnIDName) {
8098 if (Config.isTargetDevice()) {
8099 assert(OutlinedFn && "The outlined function must exist if embedded");
8100 return OutlinedFn;
8101 }
8102
8103 return new GlobalVariable(
8104 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8105 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8106}
8107
8108Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8109 StringRef EntryFnName) {
8110 if (OutlinedFn)
8111 return OutlinedFn;
8112
8113 assert(!M.getGlobalVariable(EntryFnName, true) &&
8114 "Named kernel already exists?");
8115 return new GlobalVariable(
8116 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8117 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8118}
8119
// Generates (via callback) the outlined function for a target region and,
// when the region is an offload entry, registers it and produces its region
// ID. On the host with mandatory offload, no host fallback is generated.
// NOTE(review): the signature line is elided in this listing; presumably
// OpenMPIRBuilder::emitTargetRegionFunction.
8121 TargetRegionEntryInfo &EntryInfo,
8122 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8123 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8124
8125 SmallString<64> EntryFnName;
8126 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8127
// Generate the function on the device, or on the host when offload is not
// mandatory (host fallback); otherwise leave OutlinedFn null.
8128 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8129 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8130 if (!CBResult)
8131 return CBResult.takeError();
8132 OutlinedFn = *CBResult;
8133 } else {
8134 OutlinedFn = nullptr;
8135 }
8136
8137 // If this target outline function is not an offload entry, we don't need to
8138 // register it. This may be in the case of a false if clause, or if there are
8139 // no OpenMP targets.
8140 if (!IsOffloadEntry)
8141 return Error::success();
8142
// Device: the entry name itself is the ID; host: a platform-specific
// "<name>.region_id" symbol.
8143 std::string EntryFnIDName =
8144 Config.isTargetDevice()
8145 ? std::string(EntryFnName)
8146 : createPlatformSpecificName({EntryFnName, "region_id"});
8147
8148 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8149 EntryFnName, EntryFnIDName);
8150 return Error::success();
8151}
8152
// Applies device attributes to an outlined target-region function (when
// present), creates its entry address and region ID, and records the entry in
// the offload info manager. Returns the region ID constant. NOTE(review): the
// signature line and one interior line (original 8162, presumably the entry
// kind argument) are elided in this listing.
8154 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8155 StringRef EntryFnName, StringRef EntryFnIDName) {
8156 if (OutlinedFn)
8157 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8158 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8159 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8160 OffloadInfoManager.registerTargetRegionEntryInfo(
8161 EntryInfo, EntryAddr, OutlinedFnID,
8163 return OutlinedFnID;
8164}
8165
// Emits an OpenMP target-data region (or a standalone enter/exit/update when
// BodyGenCB is null): builds the offloading argument arrays, emits the
// begin-mapper runtime call (possibly wrapped in a target task for nowait),
// generates the body, and emits the end-mapper call. Supports an 'if' clause
// via then/else code paths. NOTE(review): the signature line and several
// interior lines (notably in the standalone task-body path, original
// 8194/8217/8219-8225) are elided in this listing; presumably
// OpenMPIRBuilder::createTargetData.
8167 const LocationDescription &Loc, InsertPointTy AllocaIP,
8168 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
8169 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
8170 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
8172 BodyGenTy BodyGenType)>
8173 BodyGenCB,
8174 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8175 if (!updateToLocation(Loc))
8176 return InsertPointTy();
8177
8178 Builder.restoreIP(CodeGenIP);
8179
// No body callback means a standalone directive (enter/exit/update data).
8180 bool IsStandAlone = !BodyGenCB;
8181 MapInfosTy *MapInfo;
8182 // Generate the code for the opening of the data environment. Capture all the
8183 // arguments of the runtime call by reference because they are used in the
8184 // closing of the region.
8185 auto BeginThenGen = [&](InsertPointTy AllocaIP,
8186 InsertPointTy CodeGenIP) -> Error {
8187 MapInfo = &GenMapInfoCB(Builder.saveIP());
8188 if (Error Err = emitOffloadingArrays(
8189 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8190 /*IsNonContiguous=*/true, DeviceAddrCB))
8191 return Err;
8192
8193 TargetDataRTArgs RTArgs;
8195
8196 // Emit the number of elements in the offloading arrays.
8197 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8198
8199 // Source location for the ident struct
8200 if (!SrcLocInfo) {
8201 uint32_t SrcLocStrSize;
8202 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8203 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8204 }
8205
8206 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8207 SrcLocInfo, DeviceID,
8208 PointerNum, RTArgs.BasePointersArray,
8209 RTArgs.PointersArray, RTArgs.SizesArray,
8210 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8211 RTArgs.MappersArray};
8212
8213 if (IsStandAlone) {
8214 assert(MapperFunc && "MapperFunc missing for standalone target data");
8215
// Task body that performs the actual mapper runtime call; used directly
// for the blocking form, or via emitTargetTask for nowait.
8216 auto TaskBodyCB = [&](Value *, Value *,
8218 if (Info.HasNoWait) {
// Extra task-related trailing arguments for the nowait entry point
// (remaining values elided in this listing).
8219 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8223 }
8224
8226 OffloadingArgs);
8227
8228 if (Info.HasNoWait) {
8229 BasicBlock *OffloadContBlock =
8230 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8231 Function *CurFn = Builder.GetInsertBlock()->getParent();
8232 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8233 Builder.restoreIP(Builder.saveIP());
8234 }
8235 return Error::success();
8236 };
8237
// 'nowait' requires wrapping the runtime call in an outer target task.
8238 bool RequiresOuterTargetTask = Info.HasNoWait;
8239 if (!RequiresOuterTargetTask)
8240 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8241 /*TargetTaskAllocaIP=*/{}));
8242 else
8243 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8244 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8245 } else {
8246 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8247 omp::OMPRTL___tgt_target_data_begin_mapper);
8248
8249 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8250
// Propagate device pointers produced by the begin-mapper call into the
// private allocas recorded in DevicePtrInfoMap.
8251 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8252 if (isa<AllocaInst>(DeviceMap.second.second)) {
8253 auto *LI =
8254 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8255 Builder.CreateStore(LI, DeviceMap.second.second);
8256 }
8257 }
8258
8259 // If device pointer privatization is required, emit the body of the
8260 // region here. It will have to be duplicated: with and without
8261 // privatization.
8262 InsertPointOrErrorTy AfterIP =
8263 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8264 if (!AfterIP)
8265 return AfterIP.takeError();
8266 Builder.restoreIP(*AfterIP);
8267 }
8268 return Error::success();
8269 };
8270
8271 // If we need device pointer privatization, we need to emit the body of the
8272 // region with no privatization in the 'else' branch of the conditional.
8273 // Otherwise, we don't have to do anything.
8274 auto BeginElseGen = [&](InsertPointTy AllocaIP,
8275 InsertPointTy CodeGenIP) -> Error {
8276 InsertPointOrErrorTy AfterIP =
8277 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8278 if (!AfterIP)
8279 return AfterIP.takeError();
8280 Builder.restoreIP(*AfterIP);
8281 return Error::success();
8282 };
8283
8284 // Generate code for the closing of the data region.
8285 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8286 TargetDataRTArgs RTArgs;
8287 Info.EmitDebug = !MapInfo->Names.empty();
8288 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8289
8290 // Emit the number of elements in the offloading arrays.
8291 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8292
8293 // Source location for the ident struct
8294 if (!SrcLocInfo) {
8295 uint32_t SrcLocStrSize;
8296 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8297 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8298 }
8299
8300 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8301 PointerNum, RTArgs.BasePointersArray,
8302 RTArgs.PointersArray, RTArgs.SizesArray,
8303 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8304 RTArgs.MappersArray};
8305 Function *EndMapperFunc =
8306 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8307
8308 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8309 return Error::success();
8310 };
8311
8312 // We don't have to do anything to close the region if the if clause evaluates
8313 // to false.
8314 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8315 return Error::success();
8316 };
8317
// Drive the begin/body/end sequence, honoring the 'if' clause at both the
// opening and closing of the region.
8318 Error Err = [&]() -> Error {
8319 if (BodyGenCB) {
8320 Error Err = [&]() {
8321 if (IfCond)
8322 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8323 return BeginThenGen(AllocaIP, Builder.saveIP());
8324 }();
8325
8326 if (Err)
8327 return Err;
8328
8329 // If we don't require privatization of device pointers, we emit the body
8330 // in between the runtime calls. This avoids duplicating the body code.
8331 InsertPointOrErrorTy AfterIP =
8332 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8333 if (!AfterIP)
8334 return AfterIP.takeError();
8335 restoreIPandDebugLoc(Builder, *AfterIP);
8336
8337 if (IfCond)
8338 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8339 return EndThenGen(AllocaIP, Builder.saveIP());
8340 }
8341 if (IfCond)
8342 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8343 return BeginThenGen(AllocaIP, Builder.saveIP());
8344 }();
8345
8346 if (Err)
8347 return Err;
8348
8349 return Builder.saveIP();
8350}
8351
// Selects the static-init runtime function matching the induction-variable
// width (32/64), signedness, and whether this is a GPU distribute construct.
// NOTE(review): the signature lines are elided in this listing; presumably
// a helper taking (IVSize, IVSigned, Module &M, bool IsGPUDistribute).
8354 bool IsGPUDistribute) {
8355 assert((IVSize == 32 || IVSize == 64) &&
8356 "IV size is not compatible with the omp runtime");
8357 RuntimeFunction Name;
8358 if (IsGPUDistribute)
8359 Name = IVSize == 32
8360 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8361 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8362 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8363 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8364 else
8365 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8366 : omp::OMPRTL___kmpc_for_static_init_4u)
8367 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8368 : omp::OMPRTL___kmpc_for_static_init_8u);
8369
8370 return getOrCreateRuntimeFunction(M, Name);
8371}
8372
// Return the __kmpc_dispatch_init_{4,4u,8,8u} runtime function matching the
// induction variable's bit width (IVSize: 32 or 64) and signedness (IVSigned).
// This is the initialization entry for dynamically scheduled loops.
// NOTE(review): extraction gap — the declaration line of this definition
// (inner line 8373) is missing here; the body below is intact.
8374 bool IVSigned) {
8375 assert((IVSize == 32 || IVSize == 64) &&
8376 "IV size is not compatible with the omp runtime");
8377 RuntimeFunction Name = IVSize == 32
8378 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8379 : omp::OMPRTL___kmpc_dispatch_init_4u)
8380 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8381 : omp::OMPRTL___kmpc_dispatch_init_8u);
8382
8383 return getOrCreateRuntimeFunction(M, Name);
8384}
8385
// Return the __kmpc_dispatch_next_{4,4u,8,8u} runtime function matching the
// induction variable's bit width (IVSize: 32 or 64) and signedness (IVSigned).
// This entry fetches the next chunk of a dynamically scheduled loop.
// NOTE(review): extraction gap — the declaration line of this definition
// (inner line 8386) is missing here; the body below is intact.
8387 bool IVSigned) {
8388 assert((IVSize == 32 || IVSize == 64) &&
8389 "IV size is not compatible with the omp runtime");
8390 RuntimeFunction Name = IVSize == 32
8391 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8392 : omp::OMPRTL___kmpc_dispatch_next_4u)
8393 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8394 : omp::OMPRTL___kmpc_dispatch_next_8u);
8395
8396 return getOrCreateRuntimeFunction(M, Name);
8397}
8398
// Return the __kmpc_dispatch_fini_{4,4u,8,8u} runtime function matching the
// induction variable's bit width (IVSize: 32 or 64) and signedness (IVSigned).
// This entry finalizes a dynamically scheduled loop chunk.
// NOTE(review): extraction gap — the declaration line of this definition
// (inner line 8399) is missing here; the body below is intact.
8400 bool IVSigned) {
8401 assert((IVSize == 32 || IVSize == 64) &&
8402 "IV size is not compatible with the omp runtime");
8403 RuntimeFunction Name = IVSize == 32
8404 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8405 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8406 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8407 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8408
8409 return getOrCreateRuntimeFunction(M, Name);
8410}
8411
// Return the __kmpc_dispatch_deinit runtime function. Unlike the init/next/
// fini entries above, there is a single variant — no IV-size/sign selection.
// NOTE(review): extraction gap — the declaration line of this definition
// (inner line 8412) is missing here; the body below is intact.
8413 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8414}
8415
// Fix up debug info inside the freshly outlined function Func so that its
// variable debug records no longer refer to the parent function's values:
//  - remap each record's location operands through ValueReplacementMap (which
//    maps original value -> {replacement value, argument index}) and retarget
//    the record at a DILocalVariable carrying the updated argument number;
//  - move single-location records into the block where their underlying value
//    is defined (entry block for Arguments), deleting the stale originals;
//  - on target-device compilations, synthesize parameter debug info for the
//    implicit trailing "dyn_ptr" argument.
// No-op when Func carries no DISubprogram.
// NOTE(review): extraction gap — the declaration line of this definition
// (inner line 8416) is missing here.
8417 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8418 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8419
8420 DISubprogram *NewSP = Func->getSubprogram();
8421 if (!NewSP)
8422 return;
8423
// NOTE(review): extraction gap — inner line 8425 is empty here; it likely
// declared the RemappedVariables cache used below. Confirm against upstream.
8425
// Return a DILocalVariable equivalent to OldVar but carrying argument number
// `arg`, caching the result per OldVar.
8426 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8427 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8428 // Only use cached variable if the arg number matches. This is important
8429 // so that DIVariable created for privatized variables are not discarded.
8430 if (NewVar && (arg == NewVar->getArg()))
8431 return NewVar;
8432
// NOTE(review): extraction gap — inner line 8433 is missing; it presumably
// assigned NewVar from a DILocalVariable construction taking the arguments
// below. Confirm against upstream.
8434 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8435 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8436 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8437 return NewVar;
8438 };
8439
// Remap a single debug record: rewrite its location operands through
// ValueReplacementMap; if any operand was remapped, point the record at a
// variable with the (1-based) argument number of the replacement.
8440 auto UpdateDebugRecord = [&](auto *DR) {
8441 DILocalVariable *OldVar = DR->getVariable();
8442 unsigned ArgNo = 0;
8443 for (auto Loc : DR->location_ops()) {
8444 auto Iter = ValueReplacementMap.find(Loc);
8445 if (Iter != ValueReplacementMap.end()) {
8446 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
// +1 because DILocalVariable argument numbers are 1-based.
8447 ArgNo = std::get<1>(Iter->second) + 1;
8448 }
8449 }
8450 if (ArgNo != 0)
8451 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8452 };
8453
// NOTE(review): extraction gap — inner line 8454 is missing; it presumably
// declared the DVRsToDelete container populated below. Confirm upstream.
// Clone a single-location record into the block that defines its location
// value (entry block for Arguments); the stale original is queued for
// deletion. Multi-location records just get their location killed.
8455 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8456 if (DVR->getNumVariableLocationOps() != 1u) {
8457 DVR->setKillLocation();
8458 return;
8459 }
8460 Value *Loc = DVR->getVariableLocationOp(0u);
8461 BasicBlock *CurBB = DVR->getParent();
8462 BasicBlock *RequiredBB = nullptr;
8463
8464 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8465 RequiredBB = LocInst->getParent();
8466 else if (isa<llvm::Argument>(Loc))
8467 RequiredBB = &DVR->getFunction()->getEntryBlock();
8468
8469 if (RequiredBB && RequiredBB != CurBB) {
8470 assert(!RequiredBB->empty());
8471 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8472 RequiredBB->back().getIterator());
// Defer removal: we are iterating over the record ranges right now.
8473 DVRsToDelete.push_back(DVR);
8474 }
8475 };
8476
8477 // The location and scope of variable intrinsics and records still point to
8478 // the parent function of the target region. Update them.
8479 for (Instruction &I : instructions(Func)) {
// NOTE(review): extraction gap — inner line 8480 is missing; given the
// string below, it was presumably an assert that I is not a debug
// intrinsic. Confirm against upstream.
8481 "Unexpected debug intrinsic");
8482 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8483 UpdateDebugRecord(&DVR);
8484 MoveDebugRecordToCorrectBlock(&DVR);
8485 }
8486 }
8487 for (auto *DVR : DVRsToDelete)
8488 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8489 // An extra argument is passed to the device. Create the debug data for it.
8490 if (OMPBuilder.Config.isTargetDevice()) {
8491 DICompileUnit *CU = NewSP->getUnit();
8492 Module *M = Func->getParent();
8493 DIBuilder DB(*M, true, CU);
8494 DIType *VoidPtrTy =
8495 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
// dyn_ptr is the last parameter, so its 1-based arg number is arg_size().
8496 unsigned ArgNo = Func->arg_size();
8497 DILocalVariable *Var = DB.createParameterVariable(
8498 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8499 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8500 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8501 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8502 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8503 &(*Func->begin()));
8504 }
8505}
8506
// Strip a single top-level addrspacecast (instruction or constant expression)
// from V, returning the underlying operand; otherwise return V unchanged.
// NOTE(review): extraction gap — the declaration line of this helper (inner
// line 8507) is missing here; the body below is intact.
8508 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8509 return cast<Operator>(V)->getOperand(0);
8510 return V;
8511}
8512
// Create the outlined target-region function named FuncName:
//  - build its signature from Inputs (on device, pointers are kept and all
//    other types are widened to i64; on host, types are kept as-is), plus the
//    implicit trailing dyn_ptr pointer parameter;
//  - forward target-cpu/target-features attributes from the parent function;
//  - on device compilations, emit the kernel execution-mode global and the
//    target init/deinit calls around the body;
//  - generate the user body via the callback and rewrite all uses of the
//    captured Inputs inside the new function to the accessor values produced
//    by ArgAccessorFuncCB (GlobalValue replacement is deferred — see the long
//    comment below);
//  - fix up debug info and return the new Function.
// NOTE(review): extraction gap — the declaration line and some parameter
// lines (inner lines 8513, 8515, 8517-8518, presumably declaring DefaultAttrs,
// CBFunc and ArgAccessorFuncCB, which are used below) are missing here.
8514 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8516 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8519 SmallVector<Type *> ParameterTypes;
8520 if (OMPBuilder.Config.isTargetDevice()) {
8521 // All parameters to target devices are passed as pointers
8522 // or i64. This assumes 64-bit address spaces/pointers.
8523 for (auto &Arg : Inputs)
8524 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8525 ? Arg->getType()
8526 : Type::getInt64Ty(Builder.getContext()));
8527 } else {
8528 for (auto &Arg : Inputs)
8529 ParameterTypes.push_back(Arg->getType());
8530 }
8531
8532 // The implicit dyn_ptr argument is always the last parameter on both host
8533 // and device so the argument counts match without runtime manipulation.
8534 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8535 ParameterTypes.push_back(PtrTy);
8536
8537 auto BB = Builder.GetInsertBlock();
8538 auto M = BB->getModule();
8539 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8540 /*isVarArg*/ false);
8541 auto Func =
8542 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8543
8544 // Forward target-cpu and target-features function attributes from the
8545 // original function to the new outlined function.
8546 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8547
8548 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8549 if (TargetCpuAttr.isStringAttribute())
8550 Func->addFnAttr(TargetCpuAttr);
8551
8552 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8553 if (TargetFeaturesAttr.isStringAttribute())
8554 Func->addFnAttr(TargetFeaturesAttr);
8555
8556 if (OMPBuilder.Config.isTargetDevice()) {
8557 Value *ExecMode =
8558 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8559 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8560 }
8561
8562 // Save insert point.
8563 IRBuilder<>::InsertPointGuard IPG(Builder);
8564 // We will generate the entries in the outlined function but the debug
8565 // location may still be pointing to the parent function. Reset it now.
8566 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8567
8568 // Generate the region into the function.
8569 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8570 Builder.SetInsertPoint(EntryBB);
8571
8572 // Insert target init call in the device compilation pass.
8573 if (OMPBuilder.Config.isTargetDevice())
8574 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8575
8576 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8577
8578 // As we embed the user code in the middle of our target region after we
8579 // generate entry code, we must move what allocas we can into the entry
8580 // block to avoid possible breaking optimisations for device
8581 if (OMPBuilder.Config.isTargetDevice())
// NOTE(review): extraction gap — inner line 8582 (the statement guarded by
// the condition above) is missing here. Confirm against upstream.
8583
8584 // Insert target deinit call in the device compilation pass.
8585 BasicBlock *OutlinedBodyBB =
8586 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
// NOTE(review): extraction gap — inner line 8587 is missing; given the
// continuation below and the uses of AfterIP, it presumably declared AfterIP
// from a body-generating callback invocation. Confirm against upstream.
8588 Builder.saveIP(),
8589 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8590 if (!AfterIP)
8591 return AfterIP.takeError();
8592 Builder.restoreIP(*AfterIP);
8593 if (OMPBuilder.Config.isTargetDevice())
8594 OMPBuilder.createTargetDeinit(Builder);
8595
8596 // Insert return instruction.
8597 Builder.CreateRetVoid();
8598
8599 // New Alloca IP at entry point of created device function.
8600 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8601 auto AllocaIP = Builder.saveIP();
8602
8603 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8604
8605 // Do not include the artificial dyn_ptr argument.
8606 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8607
// NOTE(review): extraction gap — inner line 8608 is missing; it presumably
// declared the ValueReplacementMap populated below. Confirm upstream.
8609
// Replace all uses of Input inside Func with InputCopy. Constants are first
// rewritten to instructions so replaceUsesOfWith can be applied per-function.
8610 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8611 // Things like GEP's can come in the form of Constants. Constants and
8612 // ConstantExpr's do not have access to the knowledge of what they're
8613 // contained in, so we must dig a little to find an instruction so we
8614 // can tell if they're used inside of the function we're outlining. We
8615 // also replace the original constant expression with a new instruction
8616 // equivalent; an instruction as it allows easy modification in the
8617 // following loop, as we can now know the constant (instruction) is
8618 // owned by our target function and replaceUsesOfWith can now be invoked
8619 // on it (cannot do this with constants it seems). A brand new one also
8620 // allows us to be cautious as it is perhaps possible the old expression
8621 // was used inside of the function but exists and is used externally
8622 // (unlikely by the nature of a Constant, but still).
8623 // NOTE: We cannot remove dead constants that have been rewritten to
8624 // instructions at this stage, we run the risk of breaking later lowering
8625 // by doing so as we could still be in the process of lowering the module
8626 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8627 // constants we have created rewritten versions of.
8628 if (auto *Const = dyn_cast<Constant>(Input))
8629 convertUsersOfConstantsToInstructions(Const, Func, false);
8630
8631 // Collect users before iterating over them to avoid invalidating the
8632 // iteration in case a user uses Input more than once (e.g. a call
8633 // instruction).
8634 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8635 // Collect all the instructions
// NOTE(review): extraction gap — inner line 8636 (the loop header over
// Users feeding the body below) is missing here. Confirm upstream.
8637 if (auto *Instr = dyn_cast<Instruction>(User))
8638 if (Instr->getFunction() == Func)
8639 Instr->replaceUsesOfWith(Input, InputCopy);
8640 };
8641
8642 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8643
8644 // Rewrite uses of input values to parameters.
8645 for (auto InArg : zip(Inputs, ArgRange)) {
8646 Value *Input = std::get<0>(InArg);
8647 Argument &Arg = std::get<1>(InArg);
8648 Value *InputCopy = nullptr;
8649
// NOTE(review): extraction gap — inner line 8650 is missing; it presumably
// declared AfterIP from the ArgAccessorFuncCB call continued below.
8651 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8652 if (!AfterIP)
8653 return AfterIP.takeError();
8654 Builder.restoreIP(*AfterIP);
8655 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8656
8657 // In certain cases a Global may be set up for replacement, however, this
8658 // Global may be used in multiple arguments to the kernel, just segmented
8659 // apart, for example, if we have a global array, that is sectioned into
8660 // multiple mappings (technically not legal in OpenMP, but there is a case
8661 // in Fortran for Common Blocks where this is necessary), we will end up
8662 // with GEP's into this array inside the kernel, that refer to the Global
8663 // but are technically separate arguments to the kernel for all intents and
8664 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8665 // index, it will fold into a referral to the Global, if we then encounter
8666 // this folded GEP during replacement all of the references to the
8667 // Global in the kernel will be replaced with the argument we have generated
8668 // that corresponds to it, including any other GEP's that refer to the
8669 // Global that may be other arguments. This will invalidate all of the other
8670 // preceding mapped arguments that refer to the same global that may be
8671 // separate segments. To prevent this, we defer global processing until all
8672 // other processing has been performed.
// NOTE(review): extraction gap — inner lines 8673-8674 (the condition that
// selects globals for deferred replacement) are missing here.
8675 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8676 continue;
8677 }
8678
// NOTE(review): extraction gap — inner line 8679 (the condition guarding
// this `continue`) is missing here. Confirm against upstream.
8680 continue;
8681
8682 ReplaceValue(Input, InputCopy, Func);
8683 }
8684
8685 // Replace all of our deferred Input values, currently just Globals.
8686 for (auto Deferred : DeferredReplacement)
8687 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8688
// Debug records still reference the parent function's values; remap them.
8689 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8690 ValueReplacementMap);
8691 return Func;
8692}
8693/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8694/// of pointers containing shared data between the parent task and the created
8695/// task.
// NOTE(review): extraction gap — the declaration line of this helper (inner
// line 8696) is missing here; the body below is intact.
8697 IRBuilderBase &Builder,
8698 Value *TaskWithPrivates,
8699 Type *TaskWithPrivatesTy) {
8700
8701 Type *TaskTy = OMPIRBuilder.Task;
8702 LLVMContext &Ctx = Builder.getContext();
// GEP to the first member: either the task descriptor itself (wrapper case)
// or the shareds slot (plain-task case).
8703 Value *TaskT =
8704 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8705 Value *Shareds = TaskT;
8706 // TaskWithPrivatesTy can be one of the following
8707 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8708 // %struct.privates }
8709 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8710 //
8711 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8712 // its first member has to be the task descriptor. TaskTy is the type of the
8713 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8714 // first member of TaskT, gives us the pointer to shared data.
8715 if (TaskWithPrivatesTy != TaskTy)
8716 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8717 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8718}
8719/// Create an entry point for a target task with the following.
8720/// It'll have the following signature
8721/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8722/// This function is called from emitTargetTask once the
8723/// code to launch the target kernel has been outlined already.
8724/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8725/// into the task structure so that the deferred target task can access this
8726/// data even after the stack frame of the generating task has been rolled
8727/// back. Offloading arrays contain base pointers, pointers, sizes etc
8728/// of the data that the target kernel will access. These in effect are the
8729/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
// NOTE(review): extraction gap — the declaration line of this function (inner
// line 8730) is missing here; the parameter lines below are intact.
8731 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8732 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8733 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8734
8735 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8736 // This is because PrivatesTy is the type of the structure in which
8737 // we pass the offloading arrays to the deferred target task.
8738 assert((!NumOffloadingArrays || PrivatesTy) &&
8739 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8740 "to privatize");
8741
8742 Module &M = OMPBuilder.M;
8743 // KernelLaunchFunction is the target launch function, i.e.
8744 // the function that sets up kernel arguments and calls
8745 // __tgt_target_kernel to launch the kernel on the device.
8746 //
8747 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8748
8749 // StaleCI is the CallInst which is the call to the outlined
8750 // target kernel launch function. If there are local live-in values
8751 // that the outlined function uses then these are aggregated into a structure
8752 // which is passed as the second argument. If there are no local live-in
8753 // values or if all values used by the outlined kernel are global variables,
8754 // then there's only one argument, the threadID. So, StaleCI can be
8755 //
8756 // %structArg = alloca { ptr, ptr }, align 8
8757 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8758 // store ptr %20, ptr %gep_, align 8
8759 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8760 // store ptr %21, ptr %gep_8, align 8
8761 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8762 //
8763 // OR
8764 //
8765 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
// NOTE(review): extraction gap — inner line 8766 is missing; given the
// continuation below it presumably repositioned the builder at StaleCI.
8767 StaleCI->getIterator());
8768
8769 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8770
8771 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8772 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8773 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8774
// The proxy always has the kmpc task-entry signature: (i32 tid, ptr task).
8775 auto ProxyFnTy =
8776 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8777 /* isVarArg */ false);
8778 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8779 ".omp_target_task_proxy_func",
8780 Builder.GetInsertBlock()->getModule());
8781 Value *ThreadId = ProxyFn->getArg(0);
8782 Value *TaskWithPrivates = ProxyFn->getArg(1);
8783 ThreadId->setName("thread.id");
8784 TaskWithPrivates->setName("task");
8785
8786 bool HasShareds = SharedArgsOperandNo > 0;
8787 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8788 BasicBlock *EntryBB =
8789 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8790 Builder.SetInsertPoint(EntryBB);
8791
8792 SmallVector<Value *> KernelLaunchArgs;
8793 KernelLaunchArgs.reserve(StaleCI->arg_size());
8794 KernelLaunchArgs.push_back(ThreadId);
8795
// Pass pointers into the privatized offloading arrays stored in the task.
8796 if (HasOffloadingArrays) {
8797 assert(TaskTy != TaskWithPrivatesTy &&
8798 "If there are offloading arrays to pass to the target"
8799 "TaskTy cannot be the same as TaskWithPrivatesTy");
8800 (void)TaskTy;
8801 Value *Privates =
8802 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8803 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8804 KernelLaunchArgs.push_back(
8805 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8806 }
8807
// Copy the shared-argument struct out of the task descriptor into a fresh
// local alloca and pass that to the launch function.
8808 if (HasShareds) {
8809 auto *ArgStructAlloca =
8810 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8811 assert(ArgStructAlloca &&
8812 "Unable to find the alloca instruction corresponding to arguments "
8813 "for extracted function");
8814 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8815 std::optional<TypeSize> ArgAllocSize =
8816 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8817 assert(ArgStructType && ArgAllocSize &&
8818 "Unable to determine size of arguments for extracted function");
8819 uint64_t StructSize = ArgAllocSize->getFixedValue();
8820
8821 AllocaInst *NewArgStructAlloca =
8822 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8823
8824 Value *SharedsSize = Builder.getInt64(StructSize);
8825
// NOTE(review): extraction gap — inner line 8826 is missing; given the uses
// of LoadShared below, it presumably declared LoadShared from a call to
// loadSharedDataFromTaskDescriptor(...) continued on the next line.
8827 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8828
8829 Builder.CreateMemCpy(
8830 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8831 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8832 KernelLaunchArgs.push_back(NewArgStructAlloca);
8833 }
8834 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8835 Builder.CreateRetVoid();
8836 return ProxyFn;
8837}
// Return the element type of the offloading array behind V: the source
// element type for a GEP, or the allocated type for an alloca. Any other
// kind of value is a logic error (llvm_unreachable).
// NOTE(review): extraction gap — the declaration line of this helper (inner
// line 8838) is missing here; the body below is intact.
8839
8840 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8841 return GEP->getSourceElementType();
8842 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8843 return Alloca->getAllocatedType();
8844
8845 llvm_unreachable("Unhandled Instruction type");
8846 return nullptr;
8847}
8848// This function returns a struct that has at most two members.
8849// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8850// descriptor. The second member, if needed, is a struct containing arrays
8851// that need to be passed to the offloaded target kernel. For example,
8852// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8853// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8854// respectively, then the types created by this function are
8855//
8856// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8857// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8858// %struct.privates }
8859// %struct.task_with_privates is returned by this function.
8860// If there aren't any offloading arrays to pass to the target kernel,
8861// %struct.kmp_task_ompbuilder_t is returned.
8862static StructType *
// NOTE(review): extraction gap — inner line 8863 (the function name and first
// parameter) is missing here; the body below is intact.
8864 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8865
// Nothing to privatize: the plain task descriptor type suffices.
8866 if (OffloadingArraysToPrivatize.empty())
8867 return OMPIRBuilder.Task;
8868
8869 SmallVector<Type *, 4> StructFieldTypes;
8870 for (Value *V : OffloadingArraysToPrivatize) {
8871 assert(V->getType()->isPointerTy() &&
8872 "Expected pointer to array to privatize. Got a non-pointer value "
8873 "instead");
8874 Type *ArrayTy = getOffloadingArrayType(V);
8875 assert(ArrayTy && "ArrayType cannot be nullptr");
8876 StructFieldTypes.push_back(ArrayTy);
8877 }
8878 StructType *PrivatesStructTy =
8879 StructType::create(StructFieldTypes, "struct.privates");
8880 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8881 "struct.task_with_privates");
8882}
// Create (or register) the target-region entry: wraps createOutlinedFunction
// in a FunctionGenCallback and delegates to
// OpenMPIRBuilder::emitTargetRegionFunction, which fills in OutlinedFn and
// OutlinedFnID for the given EntryInfo.
// NOTE(review): extraction gap — the declaration line and several parameter
// lines (inner lines 8883, 8886, 8888-8890; presumably declaring DefaultAttrs,
// Inputs, CBFunc and ArgAccessorFuncCB, which are used below) are missing.
8884 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8885 TargetRegionEntryInfo &EntryInfo,
8887 Function *&OutlinedFn, Constant *&OutlinedFnID,
8891
8892 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8893 [&](StringRef EntryFnName) {
8894 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8895 EntryFnName, Inputs, CBFunc,
8896 ArgAccessorFuncCB);
8897 };
8898
8899 return OMPBuilder.emitTargetRegionFunction(
8900 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8901 OutlinedFnID);
8902}
8903
8905 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8907 const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs,
8908 bool HasNoWait) {
8909
8910 // The following explains the code-gen scenario for the `target` directive. A
8911 // similar scenario is followed for other device-related directives (e.g.
8912 // `target enter data`) but in similar fashion since we only need to emit task
8913 // that encapsulates the proper runtime call.
8914 //
8915 // When we arrive at this function, the target region itself has been
8916 // outlined into the function OutlinedFn.
8917 // So at this point, for
8918 // --------------------------------------------------------------
8919 // void user_code_that_offloads(...) {
8920 // omp target depend(..) map(from:a) map(to:b) private(i)
8921 // do i = 1, 10
8922 // a(i) = b(i) + n
8923 // }
8924 //
8925 // --------------------------------------------------------------
8926 //
8927 // we have
8928 //
8929 // --------------------------------------------------------------
8930 //
8931 // void user_code_that_offloads(...) {
8932 // %.offload_baseptrs = alloca [2 x ptr], align 8
8933 // %.offload_ptrs = alloca [2 x ptr], align 8
8934 // %.offload_mappers = alloca [2 x ptr], align 8
8935 // ;; target region has been outlined and now we need to
8936 // ;; offload to it via a target task.
8937 // }
8938 // void outlined_device_function(ptr a, ptr b, ptr n) {
8939 // n = *n_ptr;
8940 // do i = 1, 10
8941 // a(i) = b(i) + n
8942 // }
8943 //
8944 // We have to now do the following
8945 // (i) Make an offloading call to outlined_device_function using the OpenMP
8946 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8947 // emitted by emitKernelLaunch
8948 // (ii) Create a task entry point function that calls kernel_launch_function
8949 // and is the entry point for the target task. See
8950 // '@.omp_target_task_proxy_func in the pseudocode below.
8951 // (iii) Create a task with the task entry point created in (ii)
8952 //
8953 // That is we create the following
8954 // struct task_with_privates {
8955 // struct kmp_task_ompbuilder_t task_struct;
8956 // struct privates {
8957 // [2 x ptr] ; baseptrs
8958 // [2 x ptr] ; ptrs
8959 // [2 x i64] ; sizes
8960 // }
8961 // }
8962 // void user_code_that_offloads(...) {
8963 // %.offload_baseptrs = alloca [2 x ptr], align 8
8964 // %.offload_ptrs = alloca [2 x ptr], align 8
8965 // %.offload_sizes = alloca [2 x i64], align 8
8966 //
8967 // %structArg = alloca { ptr, ptr, ptr }, align 8
8968 // %strucArg[0] = a
8969 // %strucArg[1] = b
8970 // %strucArg[2] = &n
8971 //
8972 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8973 // sizeof(kmp_task_ompbuilder_t),
8974 // sizeof(structArg),
8975 // @.omp_target_task_proxy_func,
8976 // ...)
8977 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8978 // sizeof(structArg))
8979 // memcpy(target_task_with_privates->privates->baseptrs,
8980 // offload_baseptrs, sizeof(offload_baseptrs)
8981 // memcpy(target_task_with_privates->privates->ptrs,
8982 // offload_ptrs, sizeof(offload_ptrs)
8983 // memcpy(target_task_with_privates->privates->sizes,
8984 // offload_sizes, sizeof(offload_sizes)
8985 // dependencies_array = ...
8986 // ;; if nowait not present
8987 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8988 // call @__kmpc_omp_task_begin_if0(...)
8989 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8990 // %target_task_with_privates)
8991 // call @__kmpc_omp_task_complete_if0(...)
8992 // }
8993 //
8994 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8995 // ptr %task) {
8996 // %structArg = alloca {ptr, ptr, ptr}
8997 // %task_ptr = getelementptr(%task, 0, 0)
8998 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8999 // mempcy(%structArg, %shared_data, sizeof(%structArg))
9000 //
9001 // %offloading_arrays = getelementptr(%task, 0, 1)
9002 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
9003 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
9004 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
9005 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
9006 // %offload_sizes, %structArg)
9007 // }
9008 //
9009 // We need the proxy function because the signature of the task entry point
9010 // expected by kmpc_omp_task is always the same and will be different from
9011 // that of the kernel_launch function.
9012 //
9013 // kernel_launch_function is generated by emitKernelLaunch and has the
9014 // always_inline attribute. For this example, it'll look like so:
9015 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
9016 // %offload_sizes, %structArg) alwaysinline {
9017 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
9018 // ; load aggregated data from %structArg
9019 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
9020 // ; offload_sizes
9021 // call i32 @__tgt_target_kernel(...,
9022 // outlined_device_function,
9023 // ptr %kernel_args)
9024 // }
9025 // void outlined_device_function(ptr a, ptr b, ptr n) {
9026 // n = *n_ptr;
9027 // do i = 1, 10
9028 // a(i) = b(i) + n
9029 // }
9030 //
9031 BasicBlock *TargetTaskBodyBB =
9032 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
9033 BasicBlock *TargetTaskAllocaBB =
9034 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
9035
9036 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
9037 TargetTaskAllocaBB->begin());
9038 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
9039
9040 OutlineInfo OI;
9041 OI.EntryBB = TargetTaskAllocaBB;
9042 OI.OuterAllocaBB = AllocaIP.getBlock();
9043
9044 // Add the thread ID argument.
9047 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
9048
9049 // Generate the task body which will subsequently be outlined.
9050 Builder.restoreIP(TargetTaskBodyIP);
9051 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
9052 return Err;
9053
9054 // The outliner (CodeExtractor) extract a sequence or vector of blocks that
9055 // it is given. These blocks are enumerated by
9056 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
9057 // to be outside the region. In other words, OI.ExitBlock is expected to be
9058 // the start of the region after the outlining. We used to set OI.ExitBlock
9059 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
9060 // except when the task body is a single basic block. In that case,
9061 // OI.ExitBlock is set to the single task body block and will get left out of
9062 // the outlining process. So, simply create a new empty block to which we
9063 // unconditionally branch from where TaskBodyCB left off
9064 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9065 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
9066 /*IsFinished=*/true);
9067
9068 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9069 bool NeedsTargetTask = HasNoWait && DeviceID;
9070 if (NeedsTargetTask) {
9071 for (auto *V :
9072 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9073 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9074 RTArgs.SizesArray}) {
9076 OffloadingArraysToPrivatize.push_back(V);
9078 }
9079 }
9080 }
9081 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9082 DeviceID, OffloadingArraysToPrivatize](
9083 Function &OutlinedFn) mutable {
9084 assert(OutlinedFn.hasOneUse() &&
9085 "there must be a single user for the outlined function");
9086
9087 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9088
9089 // The first argument of StaleCI is always the thread id.
9090 // The next few arguments are the pointers to offloading arrays
9091 // if any. (see OffloadingArraysToPrivatize)
9092 // Finally, all other local values that are live-in into the outlined region
9093 // end up in a structure whose pointer is passed as the last argument. This
9094 // piece of data is passed in the "shared" field of the task structure. So,
9095 // we know we have to pass shareds to the task if the number of arguments is
9096 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
9097 // thread id. Further, for safety, we assert that the number of arguments of
9098 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
9099 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9100 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9101 assert((!HasShareds ||
9102 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9103 "Wrong number of arguments for StaleCI when shareds are present");
9104 int SharedArgOperandNo =
9105 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9106
9107 StructType *TaskWithPrivatesTy =
9108 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9109 StructType *PrivatesTy = nullptr;
9110
9111 if (!OffloadingArraysToPrivatize.empty())
9112 PrivatesTy =
9113 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9114
9116 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9117 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9118
9119 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9120 << "\n");
9121
9122 Builder.SetInsertPoint(StaleCI);
9123
9124 // Gather the arguments for emitting the runtime call.
9125 uint32_t SrcLocStrSize;
9126 Constant *SrcLocStr =
9128 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9129
9130 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9131 //
9132 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9133 // the DeviceID to the deferred task and also since
9134 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9135 Function *TaskAllocFn =
9136 !NeedsTargetTask
9137 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9139 OMPRTL___kmpc_omp_target_task_alloc);
9140
9141 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9142 // call.
9143 Value *ThreadID = getOrCreateThreadID(Ident);
9144
9145 // Argument - `sizeof_kmp_task_t` (TaskSize)
9146 // Tasksize refers to the size in bytes of kmp_task_t data structure
9147 // plus any other data to be passed to the target task, if any, which
9148 // is packed into a struct. kmp_task_t and the struct so created are
9149 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9150 Value *TaskSize = Builder.getInt64(
9151 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9152
9153 // Argument - `sizeof_shareds` (SharedsSize)
9154 // SharedsSize refers to the shareds array size in the kmp_task_t data
9155 // structure.
9156 Value *SharedsSize = Builder.getInt64(0);
9157 if (HasShareds) {
9158 auto *ArgStructAlloca =
9159 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9160 assert(ArgStructAlloca &&
9161 "Unable to find the alloca instruction corresponding to arguments "
9162 "for extracted function");
9163 std::optional<TypeSize> ArgAllocSize =
9164 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9165 assert(ArgAllocSize &&
9166 "Unable to determine size of arguments for extracted function");
9167 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9168 }
9169
9170 // Argument - `flags`
9171 // Task is tied iff (Flags & 1) == 1.
9172 // Task is untied iff (Flags & 1) == 0.
9173 // Task is final iff (Flags & 2) == 2.
9174 // Task is not final iff (Flags & 2) == 0.
9175 // A target task is not final and is untied.
9176 Value *Flags = Builder.getInt32(0);
9177
9178 // Emit the @__kmpc_omp_task_alloc runtime call
9179 // The runtime call returns a pointer to an area where the task captured
9180 // variables must be copied before the task is run (TaskData)
9181 CallInst *TaskData = nullptr;
9182
9183 SmallVector<llvm::Value *> TaskAllocArgs = {
9184 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9185 /*flags=*/Flags,
9186 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9187 /*task_func=*/ProxyFn};
9188
9189 if (NeedsTargetTask) {
9190 assert(DeviceID && "Expected non-empty device ID.");
9191 TaskAllocArgs.push_back(DeviceID);
9192 }
9193
9194 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9195
9196 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9197 if (HasShareds) {
9198 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9200 *this, Builder, TaskData, TaskWithPrivatesTy);
9201 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9202 SharedsSize);
9203 }
9204 if (!OffloadingArraysToPrivatize.empty()) {
9205 Value *Privates =
9206 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9207 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9208 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9209 [[maybe_unused]] Type *ArrayType =
9210 getOffloadingArrayType(PtrToPrivatize);
9211 assert(ArrayType && "ArrayType cannot be nullptr");
9212
9213 Type *ElementType = PrivatesTy->getElementType(i);
9214 assert(ElementType == ArrayType &&
9215 "ElementType should match ArrayType");
9216 (void)ArrayType;
9217
9218 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9219 Builder.CreateMemCpy(
9220 Dst, Alignment, PtrToPrivatize, Alignment,
9221 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9222 }
9223 }
9224
9225 Value *DepArray = nullptr;
9226 Value *NumDeps = nullptr;
9227 if (Dependencies.DepArray) {
9228 DepArray = Dependencies.DepArray;
9229 NumDeps = Dependencies.NumDeps;
9230 } else if (!Dependencies.Deps.empty()) {
9231 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
9232 NumDeps = Builder.getInt32(Dependencies.Deps.size());
9233 }
9234
9235 // ---------------------------------------------------------------
9236 // V5.2 13.8 target construct
9237 // If the nowait clause is present, execution of the target task
9238 // may be deferred. If the nowait clause is not present, the target task is
9239 // an included task.
9240 // ---------------------------------------------------------------
9241 // The above means that the lack of a nowait on the target construct
9242 // translates to '#pragma omp task if(0)'
9243 if (!NeedsTargetTask) {
9244 if (DepArray) {
9245 Function *TaskWaitFn =
9246 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9248 TaskWaitFn,
9249 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9250 /*ndeps=*/NumDeps,
9251 /*dep_list=*/DepArray,
9252 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9253 /*noalias_dep_list=*/
9255 }
9256 // Included task.
9257 Function *TaskBeginFn =
9258 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9259 Function *TaskCompleteFn =
9260 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9261 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9262 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9263 CI->setDebugLoc(StaleCI->getDebugLoc());
9264 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9265 } else if (DepArray) {
9266 // HasNoWait - meaning the task may be deferred. Call
9267 // __kmpc_omp_task_with_deps if there are dependencies,
9268 // else call __kmpc_omp_task
9269 Function *TaskFn =
9270 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9272 TaskFn,
9273 {Ident, ThreadID, TaskData, NumDeps, DepArray,
9274 ConstantInt::get(Builder.getInt32Ty(), 0),
9276 } else {
9277 // Emit the @__kmpc_omp_task runtime call to spawn the task
9278 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9279 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9280 }
9281
9282 StaleCI->eraseFromParent();
9283 for (Instruction *I : llvm::reverse(ToBeDeleted))
9284 I->eraseFromParent();
9285 };
9286 addOutlineInfo(std::move(OI));
9287
9288 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9289 << *(Builder.GetInsertBlock()) << "\n");
9290 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9291 << *(Builder.GetInsertBlock()->getParent()->getParent())
9292 << "\n");
9293 return Builder.saveIP();
9294}
9295
9297 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9298 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9299 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9300 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9301 if (Error Err =
9302 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9303 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9304 return Err;
9305 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9306 return Error::success();
9307}
9308
9309static void
9315 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9319 const OpenMPIRBuilder::DependenciesInfo &Dependencies,
9320 bool HasNoWait, Value *DynCGroupMem,
9321 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9322 // Generate a function call to the host fallback implementation of the target
9323 // region. This is called by the host when no offload entry was generated for
9324 // the target region and when the offloading call fails at runtime.
9325 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9327 Builder.restoreIP(IP);
9328 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9329 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9330 FallbackArgs.push_back(
9331 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9332 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9333 return Builder.saveIP();
9334 };
9335
9336 bool HasDependencies = !Dependencies.empty();
9337 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9338
9340
9341 auto TaskBodyCB =
9342 [&](Value *DeviceID, Value *RTLoc,
9343 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9344 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9345 // produce any.
9347 // emitKernelLaunch makes the necessary runtime call to offload the
9348 // kernel. We then outline all that code into a separate function
9349 // ('kernel_launch_function' in the pseudo code above). This function is
9350 // then called by the target task proxy function (see
9351 // '@.omp_target_task_proxy_func' in the pseudo code above)
9352 // "@.omp_target_task_proxy_func' is generated by
9353 // emitTargetTaskProxyFunction.
9354 if (OutlinedFnID && DeviceID)
9355 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9356 EmitTargetCallFallbackCB, KArgs,
9357 DeviceID, RTLoc, TargetTaskAllocaIP);
9358
9359 // We only need to do the outlining if `DeviceID` is set to avoid calling
9360 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9361 // generating the `else` branch of an `if` clause.
9362 //
9363 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9364 // In this case, we execute the host implementation directly.
9365 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9366 }());
9367
9368 OMPBuilder.Builder.restoreIP(AfterIP);
9369 return Error::success();
9370 };
9371
9372 auto &&EmitTargetCallElse =
9373 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9375 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9376 // produce any.
9378 if (RequiresOuterTargetTask) {
9379 // Arguments that are intended to be directly forwarded to an
9380 // emitKernelLaunch call are pased as nullptr, since
9381 // OutlinedFnID=nullptr results in that call not being done.
9383 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9384 /*RTLoc=*/nullptr, AllocaIP,
9385 Dependencies, EmptyRTArgs, HasNoWait);
9386 }
9387 return EmitTargetCallFallbackCB(Builder.saveIP());
9388 }());
9389
9390 Builder.restoreIP(AfterIP);
9391 return Error::success();
9392 };
9393
9394 auto &&EmitTargetCallThen =
9395 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9397 Info.HasNoWait = HasNoWait;
9398 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9399
9401 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9402 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9403 /*IsNonContiguous=*/true,
9404 /*ForEndCall=*/false))
9405 return Err;
9406
9407 SmallVector<Value *, 3> NumTeamsC;
9408 for (auto [DefaultVal, RuntimeVal] :
9409 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9410 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9411 : Builder.getInt32(DefaultVal));
9412
9413 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9414 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9415 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9416 if (Clause)
9417 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9418 /*isSigned=*/false);
9419 return Clause;
9420 };
9421 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9422 if (Clause)
9423 Result =
9424 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9425 Result, Clause)
9426 : Clause;
9427 };
9428
9429 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9430 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
9431 SmallVector<Value *, 3> NumThreadsC;
9432 Value *MaxThreadsClause =
9433 RuntimeAttrs.TeamsThreadLimit.size() == 1
9434 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9435 : nullptr;
9436
9437 for (auto [TeamsVal, TargetVal] : zip_equal(
9438 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9439 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9440 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9441
9442 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9443 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9444
9445 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9446 }
9447
9448 unsigned NumTargetItems = Info.NumberOfPtrs;
9449 uint32_t SrcLocStrSize;
9450 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9451 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9452 llvm::omp::IdentFlag(0), 0);
9453
9454 Value *TripCount = RuntimeAttrs.LoopTripCount
9455 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9456 Builder.getInt64Ty(),
9457 /*isSigned=*/false)
9458 : Builder.getInt64(0);
9459
9460 // Request zero groupprivate bytes by default.
9461 if (!DynCGroupMem)
9462 DynCGroupMem = Builder.getInt32(0);
9463
9465 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9466 HasNoWait, DynCGroupMemFallback);
9467
9468 // Assume no error was returned because TaskBodyCB and
9469 // EmitTargetCallFallbackCB don't produce any.
9471 // The presence of certain clauses on the target directive require the
9472 // explicit generation of the target task.
9473 if (RequiresOuterTargetTask)
9474 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9475 RTLoc, AllocaIP, Dependencies,
9476 KArgs.RTArgs, Info.HasNoWait);
9477
9478 return OMPBuilder.emitKernelLaunch(
9479 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9480 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9481 }());
9482
9483 Builder.restoreIP(AfterIP);
9484 return Error::success();
9485 };
9486
9487 // If we don't have an ID for the target region, it means an offload entry
9488 // wasn't created. In this case we just run the host fallback directly and
9489 // ignore any potential 'if' clauses.
9490 if (!OutlinedFnID) {
9491 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9492 return;
9493 }
9494
9495 // If there's no 'if' clause, only generate the kernel launch code path.
9496 if (!IfCond) {
9497 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9498 return;
9499 }
9500
9501 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9502 EmitTargetCallElse, AllocaIP));
9503}
9504
9506 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9507 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9508 TargetRegionEntryInfo &EntryInfo,
9509 const TargetKernelDefaultAttrs &DefaultAttrs,
9510 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9511 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9514 CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies,
9515 bool HasNowait, Value *DynCGroupMem,
9516 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9517
9518 if (!updateToLocation(Loc))
9519 return InsertPointTy();
9520
9521 Builder.restoreIP(CodeGenIP);
9522
9523 Function *OutlinedFn;
9524 Constant *OutlinedFnID = nullptr;
9525 // The target region is outlined into its own function. The LLVM IR for
9526 // the target region itself is generated using the callbacks CBFunc
9527 // and ArgAccessorFuncCB
9529 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9530 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9531 return Err;
9532
9533 // If we are not on the target device, then we need to generate code
9534 // to make a remote call (offload) to the previously outlined function
9535 // that represents the target region. Do that now.
9536 if (!Config.isTargetDevice())
9537 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9538 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9539 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9540 DynCGroupMemFallback);
9541 return Builder.saveIP();
9542}
9543
9544std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9545 StringRef FirstSeparator,
9546 StringRef Separator) {
9547 SmallString<128> Buffer;
9548 llvm::raw_svector_ostream OS(Buffer);
9549 StringRef Sep = FirstSeparator;
9550 for (StringRef Part : Parts) {
9551 OS << Sep << Part;
9552 Sep = Separator;
9553 }
9554 return OS.str().str();
9555}
9556
9557std::string
9559 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9560 Config.separator());
9561}
9562
9564 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9565 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9566 if (Elem.second) {
9567 assert(Elem.second->getValueType() == Ty &&
9568 "OMP internal variable has different type than requested");
9569 } else {
9570 // TODO: investigate the appropriate linkage type used for the global
9571 // variable for possibly changing that to internal or private, or maybe
9572 // create different versions of the function for different OMP internal
9573 // variables.
9574 const DataLayout &DL = M.getDataLayout();
9575 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9576 // default global AS is 1.
9577 // See double-target-call-with-declare-target.f90 and
9578 // declare-target-vars-in-target-region.f90 libomptarget
9579 // tests.
9580 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9581 : M.getTargetTriple().isAMDGPU()
9582 ? 0
9583 : DL.getDefaultGlobalsAddressSpace();
9584 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9587 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9588 Constant::getNullValue(Ty), Elem.first(),
9589 /*InsertBefore=*/nullptr,
9590 GlobalValue::NotThreadLocal, AddressSpaceVal);
9591 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9592 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9593 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9594 Elem.second = GV;
9595 }
9596
9597 return Elem.second;
9598}
9599
9600Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9601 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9602 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9603 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9604}
9605
9607 LLVMContext &Ctx = Builder.getContext();
9608 Value *Null =
9609 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9610 Value *SizeGep =
9611 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9612 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9613 return SizePtrToInt;
9614}
9615
9618 std::string VarName) {
9619 llvm::Constant *MaptypesArrayInit =
9620 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9621 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9622 M, MaptypesArrayInit->getType(),
9623 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9624 VarName);
9625 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9626 return MaptypesArrayGlobal;
9627}
9628
9630 InsertPointTy AllocaIP,
9631 unsigned NumOperands,
9632 struct MapperAllocas &MapperAllocas) {
9633 if (!updateToLocation(Loc))
9634 return;
9635
9636 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9637 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9638 Builder.restoreIP(AllocaIP);
9639 AllocaInst *ArgsBase = Builder.CreateAlloca(
9640 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9641 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9642 ".offload_ptrs");
9643 AllocaInst *ArgSizes = Builder.CreateAlloca(
9644 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9646 MapperAllocas.ArgsBase = ArgsBase;
9647 MapperAllocas.Args = Args;
9648 MapperAllocas.ArgSizes = ArgSizes;
9649}
9650
9652 Function *MapperFunc, Value *SrcLocInfo,
9653 Value *MaptypesArg, Value *MapnamesArg,
9655 int64_t DeviceID, unsigned NumOperands) {
9656 if (!updateToLocation(Loc))
9657 return;
9658
9659 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9660 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9661 Value *ArgsBaseGEP =
9662 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9663 {Builder.getInt32(0), Builder.getInt32(0)});
9664 Value *ArgsGEP =
9665 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9666 {Builder.getInt32(0), Builder.getInt32(0)});
9667 Value *ArgSizesGEP =
9668 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9669 {Builder.getInt32(0), Builder.getInt32(0)});
9670 Value *NullPtr =
9671 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9672 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9673 Builder.getInt32(NumOperands),
9674 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9675 MaptypesArg, MapnamesArg, NullPtr});
9676}
9677
9679 TargetDataRTArgs &RTArgs,
9680 TargetDataInfo &Info,
9681 bool ForEndCall) {
9682 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9683 "expected region end call to runtime only when end call is separate");
9684 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9685 auto VoidPtrTy = UnqualPtrTy;
9686 auto VoidPtrPtrTy = UnqualPtrTy;
9687 auto Int64Ty = Type::getInt64Ty(M.getContext());
9688 auto Int64PtrTy = UnqualPtrTy;
9689
9690 if (!Info.NumberOfPtrs) {
9691 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9692 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9693 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9694 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9695 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9696 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9697 return;
9698 }
9699
9700 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9701 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9702 Info.RTArgs.BasePointersArray,
9703 /*Idx0=*/0, /*Idx1=*/0);
9704 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9705 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9706 /*Idx0=*/0,
9707 /*Idx1=*/0);
9708 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9709 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9710 /*Idx0=*/0, /*Idx1=*/0);
9711 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9712 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9713 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9714 : Info.RTArgs.MapTypesArray,
9715 /*Idx0=*/0,
9716 /*Idx1=*/0);
9717
9718 // Only emit the mapper information arrays if debug information is
9719 // requested.
9720 if (!Info.EmitDebug)
9721 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9722 else
9723 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9724 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9725 /*Idx0=*/0,
9726 /*Idx1=*/0);
9727 // If there is no user-defined mapper, set the mapper array to nullptr to
9728 // avoid an unnecessary data privatization
9729 if (!Info.HasMapper)
9730 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9731 else
9732 RTArgs.MappersArray =
9733 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9734}
9735
9737 InsertPointTy CodeGenIP,
9738 MapInfosTy &CombinedInfo,
9739 TargetDataInfo &Info) {
9741 CombinedInfo.NonContigInfo;
9742
9743 // Build an array of struct descriptor_dim and then assign it to
9744 // offload_args.
9745 //
9746 // struct descriptor_dim {
9747 // uint64_t offset;
9748 // uint64_t count;
9749 // uint64_t stride
9750 // };
9751 Type *Int64Ty = Builder.getInt64Ty();
9753 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9754 "struct.descriptor_dim");
9755
9756 enum { OffsetFD = 0, CountFD, StrideFD };
9757 // We need two index variable here since the size of "Dims" is the same as
9758 // the size of Components, however, the size of offset, count, and stride is
9759 // equal to the size of base declaration that is non-contiguous.
9760 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9761 // Skip emitting ir if dimension size is 1 since it cannot be
9762 // non-contiguous.
9763 if (NonContigInfo.Dims[I] == 1)
9764 continue;
9765 Builder.restoreIP(AllocaIP);
9766 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9767 AllocaInst *DimsAddr =
9768 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9769 Builder.restoreIP(CodeGenIP);
9770 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9771 unsigned RevIdx = EE - II - 1;
9772 Value *DimsLVal = Builder.CreateInBoundsGEP(
9773 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
9774 // Offset
9775 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9776 Builder.CreateAlignedStore(
9777 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9778 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9779 // Count
9780 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9781 Builder.CreateAlignedStore(
9782 NonContigInfo.Counts[L][RevIdx], CountLVal,
9783 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9784 // Stride
9785 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9786 Builder.CreateAlignedStore(
9787 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9788 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9789 }
9790 // args[I] = &dims
9791 Builder.restoreIP(CodeGenIP);
9792 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9793 DimsAddr, Builder.getPtrTy());
9794 Value *P = Builder.CreateConstInBoundsGEP2_32(
9795 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9796 Info.RTArgs.PointersArray, 0, I);
9797 Builder.CreateAlignedStore(
9798 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9799 ++L;
9800 }
9801}
9802
9803void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9804 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9805 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9806 BasicBlock *ExitBB, bool IsInit) {
9807 StringRef Prefix = IsInit ? ".init" : ".del";
9808
9809 // Evaluate if this is an array section.
9811 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9812 Value *IsArray =
9813 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9814 Value *DeleteBit = Builder.CreateAnd(
9815 MapType,
9816 Builder.getInt64(
9817 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9818 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9819 Value *DeleteCond;
9820 Value *Cond;
9821 if (IsInit) {
9822 // base != begin?
9823 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9824 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9825 DeleteCond = Builder.CreateIsNull(
9826 DeleteBit,
9827 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9828 } else {
9829 Cond = IsArray;
9830 DeleteCond = Builder.CreateIsNotNull(
9831 DeleteBit,
9832 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9833 }
9834 Cond = Builder.CreateAnd(Cond, DeleteCond);
9835 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9836
9837 emitBlock(BodyBB, MapperFn);
9838 // Get the array size by multiplying element size and element number (i.e., \p
9839 // Size).
9840 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9841 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9842 // memory allocation/deletion purpose only.
9843 Value *MapTypeArg = Builder.CreateAnd(
9844 MapType,
9845 Builder.getInt64(
9846 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9847 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9848 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9849 MapTypeArg = Builder.CreateOr(
9850 MapTypeArg,
9851 Builder.getInt64(
9852 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9853 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9854
9855 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9856 // data structure.
9857 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9858 ArraySize, MapTypeArg, MapName};
9860 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9861 OffloadingArgs);
9862}
9863
9866 llvm::Value *BeginArg)>
9867 GenMapInfoCB,
9868 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9869 SmallVector<Type *> Params;
9870 Params.emplace_back(Builder.getPtrTy());
9871 Params.emplace_back(Builder.getPtrTy());
9872 Params.emplace_back(Builder.getPtrTy());
9873 Params.emplace_back(Builder.getInt64Ty());
9874 Params.emplace_back(Builder.getInt64Ty());
9875 Params.emplace_back(Builder.getPtrTy());
9876
9877 auto *FnTy =
9878 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9879
9880 SmallString<64> TyStr;
9881 raw_svector_ostream Out(TyStr);
9882 Function *MapperFn =
9884 MapperFn->addFnAttr(Attribute::NoInline);
9885 MapperFn->addFnAttr(Attribute::NoUnwind);
9886 MapperFn->addParamAttr(0, Attribute::NoUndef);
9887 MapperFn->addParamAttr(1, Attribute::NoUndef);
9888 MapperFn->addParamAttr(2, Attribute::NoUndef);
9889 MapperFn->addParamAttr(3, Attribute::NoUndef);
9890 MapperFn->addParamAttr(4, Attribute::NoUndef);
9891 MapperFn->addParamAttr(5, Attribute::NoUndef);
9892
9893 // Start the mapper function code generation.
9894 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9895 auto SavedIP = Builder.saveIP();
9896 Builder.SetInsertPoint(EntryBB);
9897
9898 Value *MapperHandle = MapperFn->getArg(0);
9899 Value *BaseIn = MapperFn->getArg(1);
9900 Value *BeginIn = MapperFn->getArg(2);
9901 Value *Size = MapperFn->getArg(3);
9902 Value *MapType = MapperFn->getArg(4);
9903 Value *MapName = MapperFn->getArg(5);
9904
9905 // Compute the starting and end addresses of array elements.
9906 // Prepare common arguments for array initiation and deletion.
9907 // Convert the size in bytes into the number of array elements.
9908 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9909 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9910 Value *PtrBegin = BeginIn;
9911 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9912
9913 // Emit array initiation if this is an array section and \p MapType indicates
9914 // that memory allocation is required.
9915 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9916 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9917 MapType, MapName, ElementSize, HeadBB,
9918 /*IsInit=*/true);
9919
9920 // Emit a for loop to iterate through SizeArg of elements and map all of them.
9921
9922 // Emit the loop header block.
9923 emitBlock(HeadBB, MapperFn);
9924 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9925 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9926 // Evaluate whether the initial condition is satisfied.
9927 Value *IsEmpty =
9928 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9929 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9930
9931 // Emit the loop body block.
9932 emitBlock(BodyBB, MapperFn);
9933 BasicBlock *LastBB = BodyBB;
9934 PHINode *PtrPHI =
9935 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9936 PtrPHI->addIncoming(PtrBegin, HeadBB);
9937
9938 // Get map clause information. Fill up the arrays with all mapped variables.
9939 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9940 if (!Info)
9941 return Info.takeError();
9942
9943 // Call the runtime API __tgt_mapper_num_components to get the number of
9944 // pre-existing components.
9945 Value *OffloadingArgs[] = {MapperHandle};
9946 Value *PreviousSize = createRuntimeFunctionCall(
9947 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9948 OffloadingArgs);
9949 Value *ShiftedPreviousSize =
9950 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9951
9952 // Fill up the runtime mapper handle for all components.
9953 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9954 Value *CurBaseArg = Info->BasePointers[I];
9955 Value *CurBeginArg = Info->Pointers[I];
9956 Value *CurSizeArg = Info->Sizes[I];
9957 Value *CurNameArg = Info->Names.size()
9958 ? Info->Names[I]
9959 : Constant::getNullValue(Builder.getPtrTy());
9960
9961 // Extract the MEMBER_OF field from the map type.
9962 Value *OriMapType = Builder.getInt64(
9963 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9964 Info->Types[I]));
9965 Value *MemberMapType =
9966 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9967
9968 // Combine the map type inherited from user-defined mapper with that
9969 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9970 // bits of the \a MapType, which is the input argument of the mapper
9971 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9972 // bits of MemberMapType.
9973 // [OpenMP 5.0], 1.2.6. map-type decay.
9974 // | alloc | to | from | tofrom | release | delete
9975 // ----------------------------------------------------------
9976 // alloc | alloc | alloc | alloc | alloc | release | delete
9977 // to | alloc | to | alloc | to | release | delete
9978 // from | alloc | alloc | from | from | release | delete
9979 // tofrom | alloc | to | from | tofrom | release | delete
9980 Value *LeftToFrom = Builder.CreateAnd(
9981 MapType,
9982 Builder.getInt64(
9983 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9984 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9985 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9986 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9987 BasicBlock *AllocElseBB =
9988 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9989 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9990 BasicBlock *ToElseBB =
9991 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9992 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9993 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9994 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9995 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9996 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9997 emitBlock(AllocBB, MapperFn);
9998 Value *AllocMapType = Builder.CreateAnd(
9999 MemberMapType,
10000 Builder.getInt64(
10001 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10002 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10003 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10004 Builder.CreateBr(EndBB);
10005 emitBlock(AllocElseBB, MapperFn);
10006 Value *IsTo = Builder.CreateICmpEQ(
10007 LeftToFrom,
10008 Builder.getInt64(
10009 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10010 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10011 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
10012 // In case of to, clear OMP_MAP_FROM.
10013 emitBlock(ToBB, MapperFn);
10014 Value *ToMapType = Builder.CreateAnd(
10015 MemberMapType,
10016 Builder.getInt64(
10017 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10018 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10019 Builder.CreateBr(EndBB);
10020 emitBlock(ToElseBB, MapperFn);
10021 Value *IsFrom = Builder.CreateICmpEQ(
10022 LeftToFrom,
10023 Builder.getInt64(
10024 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10025 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10026 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
10027 // In case of from, clear OMP_MAP_TO.
10028 emitBlock(FromBB, MapperFn);
10029 Value *FromMapType = Builder.CreateAnd(
10030 MemberMapType,
10031 Builder.getInt64(
10032 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10033 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10034 // In case of tofrom, do nothing.
10035 emitBlock(EndBB, MapperFn);
10036 LastBB = EndBB;
10037 PHINode *CurMapType =
10038 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
10039 CurMapType->addIncoming(AllocMapType, AllocBB);
10040 CurMapType->addIncoming(ToMapType, ToBB);
10041 CurMapType->addIncoming(FromMapType, FromBB);
10042 CurMapType->addIncoming(MemberMapType, ToElseBB);
10043
10044 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
10045 CurSizeArg, CurMapType, CurNameArg};
10046
10047 auto ChildMapperFn = CustomMapperCB(I);
10048 if (!ChildMapperFn)
10049 return ChildMapperFn.takeError();
10050 if (*ChildMapperFn) {
10051 // Call the corresponding mapper function.
10052 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
10053 ->setDoesNotThrow();
10054 } else {
10055 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10056 // data structure.
10058 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10059 OffloadingArgs);
10060 }
10061 }
10062
10063 // Update the pointer to point to the next element that needs to be mapped,
10064 // and check whether we have mapped all elements.
10065 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
10066 "omp.arraymap.next");
10067 PtrPHI->addIncoming(PtrNext, LastBB);
10068 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
10069 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
10070 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
10071
10072 emitBlock(ExitBB, MapperFn);
10073 // Emit array deletion if this is an array section and \p MapType indicates
10074 // that deletion is required.
10075 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10076 MapType, MapName, ElementSize, DoneBB,
10077 /*IsInit=*/false);
10078
10079 // Emit the function exit block.
10080 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10081
10082 Builder.CreateRetVoid();
10083 Builder.restoreIP(SavedIP);
10084 return MapperFn;
10085}
10086
// NOTE(review): this is the body of OpenMPIRBuilder::emitOffloadingArrays; its
// signature line (10087) was dropped by the documentation extractor — confirm
// against upstream OMPIRBuilder.cpp before relying on this text.
// Visible behavior: materializes the runtime argument arrays
// (.offload_baseptrs, .offload_ptrs, .offload_sizes, .offload_mappers) plus
// constant map-type / map-name globals from \p CombinedInfo into Info.RTArgs.
// Allocas are emitted at \p AllocaIP; stores/memcpys at \p CodeGenIP.
10088 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10089 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10090 bool IsNonContiguous,
10091 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10092 
10093 // Reset the array information.
10094 Info.clearArrayInfo();
10095 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10096 
// Nothing to map: succeed without emitting any IR.
10097 if (Info.NumberOfPtrs == 0)
10098 return Error::success();
10099 
10100 Builder.restoreIP(AllocaIP);
10101 // Detect if we have any capture size requiring runtime evaluation of the
10102 // size so that a constant array could be eventually used.
10103 ArrayType *PointerArrayType =
10104 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10105 
10106 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10107 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10108 
10109 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10110 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10111 AllocaInst *MappersArray = Builder.CreateAlloca(
10112 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10113 Info.RTArgs.MappersArray = MappersArray;
10114 
10115 // If we don't have any VLA types or other types that require runtime
10116 // evaluation, we can use a constant array for the map sizes, otherwise we
10117 // need to fill up the arrays as we do for the pointers.
10118 Type *Int64Ty = Builder.getInt64Ty();
10119 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10120 ConstantInt::get(Int64Ty, 0));
10121 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
// Classify each entry: NON_CONTIG dimension count, compile-time-constant
// size, or runtime-evaluated size (recorded in RuntimeSizes).
10122 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10123 bool IsNonContigEntry =
10124 IsNonContiguous &&
10125 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10126 CombinedInfo.Types[I] &
10127 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10128 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10129 // descriptor_dim records), not the byte size.
10130 if (IsNonContigEntry) {
10131 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10132 "Index must be in-bounds for NON_CONTIG Dims array");
10133 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10134 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10135 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10136 continue;
10137 }
// ConstantExpr/GlobalValue "constants" still need a runtime store, so only
// plain constants are folded into the constant sizes array.
10138 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10139 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10140 ConstSizes[I] = CI;
10141 continue;
10142 }
10143 }
10144 RuntimeSizes.set(I);
10145 }
10146 
// Three cases for the sizes array: all-runtime -> plain alloca filled later;
// all-constant -> a private constant global; mixed -> global memcpy'd into an
// alloca so the runtime entries can be overwritten below.
10147 if (RuntimeSizes.all()) {
10148 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10149 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10150 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10151 restoreIPandDebugLoc(Builder, CodeGenIP);
10152 } else {
10153 auto *SizesArrayInit = ConstantArray::get(
10154 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10155 std::string Name = createPlatformSpecificName({"offload_sizes"});
10156 auto *SizesArrayGbl =
10157 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10158 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10159 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10160 
10161 if (!RuntimeSizes.any()) {
10162 Info.RTArgs.SizesArray = SizesArrayGbl;
10163 } else {
10164 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10165 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10166 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10167 AllocaInst *Buffer = Builder.CreateAlloca(
10168 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10169 Buffer->setAlignment(OffloadSizeAlign);
10170 restoreIPandDebugLoc(Builder, CodeGenIP);
// Seed the buffer with the constant sizes; runtime entries are overwritten
// in the per-pointer loop below.
10171 Builder.CreateMemCpy(
10172 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10173 SizesArrayGbl, OffloadSizeAlign,
10174 Builder.getIntN(
10175 IndexSize,
10176 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10177 
10178 Info.RTArgs.SizesArray = Buffer;
10179 }
10180 restoreIPandDebugLoc(Builder, CodeGenIP);
10181 }
10182 
10183 // The map types are always constant so we don't need to generate code to
10184 // fill arrays. Instead, we create an array constant.
// NOTE(review): line 10185 is missing from this extraction — it presumably
// declares the 'Mapping' vector used below; verify against upstream.
10186 for (auto mapFlag : CombinedInfo.Types)
10187 Mapping.push_back(
10188 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10189 mapFlag));
10190 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10191 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10192 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10193 
10194 // The information types are only built if provided.
10195 if (!CombinedInfo.Names.empty()) {
10196 auto *MapNamesArrayGbl = createOffloadMapnames(
10197 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10198 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10199 Info.EmitDebug = true;
10200 } else {
10201 Info.RTArgs.MapNamesArray =
// NOTE(review): line 10202 is missing from this extraction — presumably the
// initializer (likely a null pointer constant); verify against upstream.
10203 Info.EmitDebug = false;
10204 }
10205 
10206 // If there's a present map type modifier, it must not be applied to the end
10207 // of a region, so generate a separate map type array in that case.
10208 if (Info.separateBeginEndCalls()) {
10209 bool EndMapTypesDiffer = false;
10210 for (uint64_t &Type : Mapping) {
10211 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10212 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10213 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10214 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10215 EndMapTypesDiffer = true;
10216 }
10217 }
10218 if (EndMapTypesDiffer) {
10219 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10220 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10221 }
10222 }
10223 
// Per-entry fill loop: store the base pointer, pointer, (runtime) size and
// custom-mapper function pointer into their respective argument arrays.
10224 PointerType *PtrTy = Builder.getPtrTy();
10225 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10226 Value *BPVal = CombinedInfo.BasePointers[I];
10227 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10228 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10229 0, I);
10230 Builder.CreateAlignedStore(BPVal, BP,
10231 M.getDataLayout().getPrefTypeAlign(PtrTy));
10232 
// Record device-pointer bookkeeping (use_device_ptr/use_device_addr style
// entries) and notify the caller through DeviceAddrCB if provided.
10233 if (Info.requiresDevicePointerInfo()) {
10234 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10235 CodeGenIP = Builder.saveIP();
10236 Builder.restoreIP(AllocaIP);
10237 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10238 Builder.restoreIP(CodeGenIP);
10239 if (DeviceAddrCB)
10240 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10241 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10242 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10243 if (DeviceAddrCB)
10244 DeviceAddrCB(I, BP);
10245 }
10246 }
10247 
10248 Value *PVal = CombinedInfo.Pointers[I];
10249 Value *P = Builder.CreateConstInBoundsGEP2_32(
10250 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10251 I);
10252 // TODO: Check alignment correct.
10253 Builder.CreateAlignedStore(PVal, P,
10254 M.getDataLayout().getPrefTypeAlign(PtrTy));
10255 
// Only runtime-evaluated sizes need a store; constant ones are already in
// the global/memcpy'd buffer.
10256 if (RuntimeSizes.test(I)) {
10257 Value *S = Builder.CreateConstInBoundsGEP2_32(
10258 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10259 /*Idx0=*/0,
10260 /*Idx1=*/I);
10261 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10262 Int64Ty,
10263 /*isSigned=*/true),
10264 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10265 }
10266 // Fill up the mapper array.
10267 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10268 Value *MFunc = ConstantPointerNull::get(PtrTy);
10269 
10270 auto CustomMFunc = CustomMapperCB(I);
10271 if (!CustomMFunc)
10272 return CustomMFunc.takeError();
10273 if (*CustomMFunc)
10274 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10275 
10276 Value *MAddr = Builder.CreateInBoundsGEP(
10277 PointerArrayType, MappersArray,
10278 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10279 Builder.CreateAlignedStore(
10280 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10281 }
10282 
// Emit the non-contiguous descriptor only when requested and there is data.
10283 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10284 Info.NumberOfPtrs == 0)
10285 return Error::success();
10286 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10287 return Error::success();
10288}
10289
// NOTE(review): body of OpenMPIRBuilder::emitBranch(BasicBlock *Target); the
// signature line (10290) was dropped by the doc extractor — verify upstream.
// Emits a fall-through branch to Target unless the current block is absent or
// already terminated, then clears the builder's insertion point so subsequent
// emission must explicitly re-position.
10291 BasicBlock *CurBB = Builder.GetInsertBlock();
10292 
10293 if (!CurBB || CurBB->hasTerminator()) {
10294 // If there is no insert point or the previous block is already
10295 // terminated, don't touch it.
10296 } else {
10297 // Otherwise, create a fall-through branch.
10298 Builder.CreateBr(Target);
10299 }
10300 
10301 Builder.ClearInsertionPoint();
10302}
10303
// NOTE(review): body of OpenMPIRBuilder::emitBlock; the signature line (10304,
// presumably taking the block BB and function CurFn used below) was dropped by
// the doc extractor — verify upstream.
// Terminates the current block with a branch to BB (via emitBranch), deletes
// BB instead if IsFinished and nothing references it, otherwise inserts BB
// after the current block (or at the function end) and resumes emission there.
10305 bool IsFinished) {
10306 BasicBlock *CurBB = Builder.GetInsertBlock();
10307 
10308 // Fall out of the current block (if necessary).
10309 emitBranch(BB);
10310 
// A finished, unused continuation block is dead: drop it entirely.
10311 if (IsFinished && BB->use_empty()) {
10312 BB->eraseFromParent();
10313 return;
10314 }
10315 
10316 // Place the block after the current block, if possible, or else at
10317 // the end of the function.
10318 if (CurBB && CurBB->getParent())
10319 CurFn->insert(std::next(CurBB->getIterator()), BB);
10320 else
10321 CurFn->insert(CurFn->end(), BB);
10322 Builder.SetInsertPoint(BB);
10323}
10324
// NOTE(review): body of OpenMPIRBuilder::emitIfClause; the signature line
// (10325, presumably declaring Cond and ThenGen used below) was dropped by the
// doc extractor — verify upstream.
// Emits an if/else around the two body-generation callbacks. A constant
// condition elides the branch and the dead arm entirely.
10326 BodyGenCallbackTy ElseGen,
10327 InsertPointTy AllocaIP) {
10328 // If the condition constant folds and can be elided, try to avoid emitting
10329 // the condition and the dead arm of the if/else.
10330 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10331 auto CondConstant = CI->getSExtValue();
10332 if (CondConstant)
10333 return ThenGen(AllocaIP, Builder.saveIP());
10334 
10335 return ElseGen(AllocaIP, Builder.saveIP());
10336 }
10337 
10338 Function *CurFn = Builder.GetInsertBlock()->getParent();
10339 
10340 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10341 // emit the conditional branch.
10342 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10343 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10344 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10345 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10346 // Emit the 'then' code.
10347 emitBlock(ThenBlock, CurFn);
10348 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10349 return Err;
10350 emitBranch(ContBlock);
10351 // Emit the 'else' code if present.
10352 // There is no need to emit line number for unconditional branch.
10353 emitBlock(ElseBlock, CurFn);
10354 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10355 return Err;
10356 // There is no need to emit line number for unconditional branch.
10357 emitBranch(ContBlock);
10358 // Emit the continuation block for code after the if.
10359 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10360 return Error::success();
10361}
10362
// Decides, from the atomic kind AK and ordering AO, whether an OpenMP flush
// must be emitted after an atomic construct, emits it via emitFlush if so, and
// returns whether a flush was emitted.
// NOTE(review): several lines were dropped by the doc extractor in this
// function (10365-10366: the assert condition; 10370: presumably the FlushAO
// declaration; the `if (AO == ...)` condition lines of each case; the inner
// switch's case labels). Verify against upstream before editing.
10363bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10364 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10367 "Unexpected Atomic Ordering.");
10368 
10369 bool Flush = false;
// NOTE(review): line 10370 missing — presumably declares FlushAO, assigned in
// the cases below and consumed (as a TODO) in the Flush branch.
10371 
10372 switch (AK) {
10373 case Read:
// NOTE(review): lines 10374-10375 missing — presumably the ordering test
// guarding this assignment.
10376 FlushAO = AtomicOrdering::Acquire;
10377 Flush = true;
10378 }
10379 break;
10380 case Write:
10381 case Compare:
10382 case Update:
// NOTE(review): lines 10383-10384 missing — presumably the ordering test
// guarding this assignment.
10385 FlushAO = AtomicOrdering::Release;
10386 Flush = true;
10387 }
10388 break;
10389 case Capture:
10390 switch (AO) {
// NOTE(review): case labels of this inner switch (10391, 10395, 10399-10401)
// are missing from the extraction.
10392 FlushAO = AtomicOrdering::Acquire;
10393 Flush = true;
10394 break;
10396 FlushAO = AtomicOrdering::Release;
10397 Flush = true;
10398 break;
10402 Flush = true;
10403 break;
10404 default:
10405 // do nothing - leave silently.
10406 break;
10407 }
10408 }
10409 
10410 if (Flush) {
10411 // Currently Flush RT call still doesn't take memory_ordering, so for when
10412 // that happens, this tries to do the resolution of which atomic ordering
10413 // to use with but issue the flush call
10414 // TODO: pass `FlushAO` after memory ordering support is added
10415 (void)FlushAO;
10416 emitFlush(Loc);
10417 }
10418 
10419 // for AO == AtomicOrdering::Monotonic and all other case combinations
10420 // do nothing
10421 return Flush;
10422}
10423
// NOTE(review): body of OpenMPIRBuilder::createAtomicRead; the signature lines
// (10424-10426, presumably declaring Loc, X and V used below) were dropped by
// the doc extractor — verify upstream.
// Emits an atomic load of X.Var with ordering AO and stores the value to
// V.Var. Integers load directly; structs go through the AtomicInfo libcall
// path; floats/pointers are loaded as a same-width integer and cast back.
10427 AtomicOrdering AO, InsertPointTy AllocaIP) {
10428 if (!updateToLocation(Loc))
10429 return Loc.IP;
10430 
10431 assert(X.Var->getType()->isPointerTy() &&
10432 "OMP Atomic expects a pointer to target memory");
10433 Type *XElemTy = X.ElemTy;
10434 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10435 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10436 "OMP atomic read expected a scalar type");
10437 
10438 Value *XRead = nullptr;
10439 
10440 if (XElemTy->isIntegerTy()) {
10441 LoadInst *XLD =
10442 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10443 XLD->setAtomic(AO);
10444 XRead = cast<Value>(XLD);
10445 } else if (XElemTy->isStructTy()) {
10446 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10447 // target does not support `atomicrmw` of the size of the struct
10448 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10449 OldVal->setAtomic(AO);
10450 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10451 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10452 OpenMPIRBuilder::AtomicInfo atomicInfo(
10453 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10454 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10455 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10456 XRead = AtomicLoadRes.first;
// The placeholder plain load is replaced by the libcall result.
10457 OldVal->eraseFromParent();
10458 } else {
10459 // We need to perform atomic op as integer
10460 IntegerType *IntCastTy =
10461 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10462 LoadInst *XLoad =
10463 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10464 XLoad->setAtomic(AO);
10465 if (XElemTy->isFloatingPointTy()) {
10466 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10467 } else {
10468 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10469 }
10470 }
10471 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10472 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10473 return Builder.saveIP();
10474}
10475
// NOTE(review): body of OpenMPIRBuilder::createAtomicWrite; the signature
// lines (10476-10477, presumably declaring the return type, name and Loc) were
// dropped by the doc extractor — verify upstream.
// Emits an atomic store of Expr into X.Var with ordering AO. Integers store
// directly; structs use the AtomicInfo store libcall; floats are bitcast to a
// same-width integer first.
10478 AtomicOpValue &X, Value *Expr,
10479 AtomicOrdering AO, InsertPointTy AllocaIP) {
10480 if (!updateToLocation(Loc))
10481 return Loc.IP;
10482 
10483 assert(X.Var->getType()->isPointerTy() &&
10484 "OMP Atomic expects a pointer to target memory");
10485 Type *XElemTy = X.ElemTy;
10486 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10487 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10488 "OMP atomic write expected a scalar type");
10489 
10490 if (XElemTy->isIntegerTy()) {
10491 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10492 XSt->setAtomic(AO);
10493 } else if (XElemTy->isStructTy()) {
// Placeholder load only used to derive alignment/layout; erased below.
10494 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10495 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10496 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10497 OpenMPIRBuilder::AtomicInfo atomicInfo(
10498 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10499 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10500 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10501 OldVal->eraseFromParent();
10502 } else {
10503 // We need to bitcast and perform atomic op as integers
10504 IntegerType *IntCastTy =
10505 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10506 Value *ExprCast =
10507 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10508 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10509 XSt->setAtomic(AO);
10510 }
10511 
10512 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10513 return Builder.saveIP();
10514}
10515
// NOTE(review): body of OpenMPIRBuilder::createAtomicUpdate; the signature
// lines (10516-10517, presumably declaring Loc, AllocaIP and X used below)
// were dropped by the doc extractor — verify upstream.
// Thin wrapper: validates (under LLVM_DEBUG) and delegates the actual atomic
// update emission to emitAtomicUpdate, then emits the post-atomic flush.
10518 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10519 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10520 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10521 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10522 if (!updateToLocation(Loc))
10523 return Loc.IP;
10524 
// Debug-only sanity checks on the operand types and operation.
10525 LLVM_DEBUG({
10526 Type *XTy = X.Var->getType();
10527 assert(XTy->isPointerTy() &&
10528 "OMP Atomic expects a pointer to target memory");
10529 Type *XElemTy = X.ElemTy;
10530 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10531 XElemTy->isPointerTy()) &&
10532 "OMP atomic update expected a scalar type");
10533 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10534 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10535 "OpenMP atomic does not support LT or GT operations");
10536 });
10537 
10538 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10539 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10540 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10541 if (!AtomicResult)
10542 return AtomicResult.takeError();
10543 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10544 return Builder.saveIP();
10545}
10546
10547// FIXME: Duplicating AtomicExpand
// Recomputes the result of an atomicrmw operation as a plain (non-atomic)
// instruction from the old value Src1 and operand Src2 — used by the capture
// paths to obtain the post-update value.
// NOTE(review): several case labels were dropped by the doc extractor
// (10557 — presumably AtomicRMWInst::Nand given the Neg(And) body at 10558;
// 10563-10566 and 10569-10580 — the remaining unsupported-op labels that fall
// into llvm_unreachable). Verify against upstream.
10548Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10549 AtomicRMWInst::BinOp RMWOp) {
10550 switch (RMWOp) {
10551 case AtomicRMWInst::Add:
10552 return Builder.CreateAdd(Src1, Src2);
10553 case AtomicRMWInst::Sub:
10554 return Builder.CreateSub(Src1, Src2);
10555 case AtomicRMWInst::And:
10556 return Builder.CreateAnd(Src1, Src2);
10558 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10559 case AtomicRMWInst::Or:
10560 return Builder.CreateOr(Src1, Src2);
10561 case AtomicRMWInst::Xor:
10562 return Builder.CreateXor(Src1, Src2);
10567 case AtomicRMWInst::Max:
10568 case AtomicRMWInst::Min:
10581 llvm_unreachable("Unsupported atomic update operation");
10582 }
10583 llvm_unreachable("Unsupported atomic update operation");
10584}
10585
// Emits the atomic update of *X with Expr/UpdateOp and returns the pair
// {old value, updated value}. Three strategies, chosen below:
//  1. a single atomicrmw when RMWOp maps directly to one (integer types);
//  2. a libcall-based compare-exchange loop for struct types;
//  3. a generic cmpxchg loop on a same-width integer otherwise.
// NOTE(review): line 10588 (presumably the `AtomicOrdering AO` parameter line)
// and several interior lines were dropped by the doc extractor — see the
// inline notes. Verify against upstream OMPIRBuilder.cpp.
10586Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10587 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10589 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10590 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10591 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10592 // or a complex datatype.
10593 bool emitRMWOp = false;
10594 switch (RMWOp) {
10595 case AtomicRMWInst::Add:
10596 case AtomicRMWInst::And:
// NOTE(review): case labels 10597 and 10600 are missing from this extraction.
10598 case AtomicRMWInst::Or:
10599 case AtomicRMWInst::Xor:
10601 emitRMWOp = XElemTy;
10602 break;
// Sub only maps to atomicrmw when the expression is `x binop expr` (not
// `expr binop x`, which is not commutative for subtraction).
10603 case AtomicRMWInst::Sub:
10604 emitRMWOp = (IsXBinopExpr && XElemTy);
10605 break;
10606 default:
10607 emitRMWOp = false;
10608 }
10609 emitRMWOp &= XElemTy->isIntegerTy();
10610 
10611 std::pair<Value *, Value *> Res;
// Strategy 1: direct atomicrmw.
10612 if (emitRMWOp) {
10613 AtomicRMWInst *RMWInst =
10614 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
// AMDGPU-specific metadata controlling FP-atomic lowering behavior.
10615 if (T.isAMDGPU()) {
10616 if (IsIgnoreDenormalMode)
10617 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10618 llvm::MDNode::get(Builder.getContext(), {}));
10619 if (!IsFineGrainedMemory)
10620 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10621 llvm::MDNode::get(Builder.getContext(), {}));
10622 if (!IsRemoteMemory)
10623 RMWInst->setMetadata("amdgpu.no.remote.memory",
10624 llvm::MDNode::get(Builder.getContext(), {}));
10625 }
10626 Res.first = RMWInst;
10627 // not needed except in case of postfix captures. Generate anyway for
10628 // consistency with the else part. Will be removed with any DCE pass.
10629 // AtomicRMWInst::Xchg does not have a coressponding instruction.
10630 if (RMWOp == AtomicRMWInst::Xchg)
10631 Res.second = Res.first;
10632 else
10633 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
// Strategy 2: struct types — libcall compare-exchange loop.
10634 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10635 XElemTy->isStructTy()) {
10636 LoadInst *OldVal =
10637 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10638 OldVal->setAtomic(AO);
10639 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10640 unsigned LoadSize =
10641 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10642 
10643 OpenMPIRBuilder::AtomicInfo atomicInfo(
10644 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10645 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10646 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
// Split the current block into Cont (retry loop) and Exit blocks.
10647 BasicBlock *CurBB = Builder.GetInsertBlock();
10648 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10649 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10650 BasicBlock *ExitBB =
10651 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10652 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10653 X->getName() + ".atomic.cont");
10654 ContBB->getTerminator()->eraseFromParent();
10655 Builder.restoreIP(AllocaIP);
10656 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10657 NewAtomicAddr->setName(X->getName() + "x.new.val");
10658 Builder.SetInsertPoint(ContBB);
10659 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10660 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10661 Value *OldExprVal = PHI;
// Let the caller compute the updated value from the current value.
10662 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10663 if (!CBResult)
10664 return CBResult.takeError();
10665 Value *Upd = *CBResult;
10666 Builder.CreateStore(Upd, NewAtomicAddr);
// NOTE(review): lines 10667-10668 missing — presumably the declaration of the
// `Failure` ordering used in the libcall below. Verify upstream.
10669 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10670 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10671 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10672 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
// Loop back to Cont until the compare-exchange succeeds.
10673 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10674 OldVal->eraseFromParent();
10675 Res.first = OldExprVal;
10676 Res.second = Upd;
10677 
10678 if (UnreachableInst *ExitTI =
// NOTE(review): line 10679 missing — presumably the dyn_cast<UnreachableInst>
// completing this condition (drop the synthesized unreachable terminator).
10680 CurBBTI->eraseFromParent();
10681 Builder.SetInsertPoint(ExitBB);
10682 } else {
10683 Builder.SetInsertPoint(ExitTI);
10684 }
// Strategy 3: generic cmpxchg loop over a same-width integer.
10685 } else {
10686 IntegerType *IntCastTy =
10687 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10688 LoadInst *OldVal =
10689 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10690 OldVal->setAtomic(AO);
10691 // CurBB
10692 // | /---\
10693 // ContBB |
10694 // | \---/
10695 // ExitBB
10696 BasicBlock *CurBB = Builder.GetInsertBlock();
10697 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10698 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10699 BasicBlock *ExitBB =
10700 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10701 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10702 X->getName() + ".atomic.cont");
10703 ContBB->getTerminator()->eraseFromParent();
10704 Builder.restoreIP(AllocaIP);
10705 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10706 NewAtomicAddr->setName(X->getName() + "x.new.val");
10707 Builder.SetInsertPoint(ContBB);
10708 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10709 PHI->addIncoming(OldVal, CurBB);
10710 bool IsIntTy = XElemTy->isIntegerTy();
10711 Value *OldExprVal = PHI;
// Non-integer element types are carried as integers; cast the loop-carried
// value back before handing it to the update callback.
10712 if (!IsIntTy) {
10713 if (XElemTy->isFloatingPointTy()) {
10714 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10715 X->getName() + ".atomic.fltCast");
10716 } else {
10717 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10718 X->getName() + ".atomic.ptrCast");
10719 }
10720 }
10721 
10722 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10723 if (!CBResult)
10724 return CBResult.takeError();
10725 Value *Upd = *CBResult;
10726 Builder.CreateStore(Upd, NewAtomicAddr);
10727 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
// NOTE(review): lines 10728-10729 missing — presumably the declaration of the
// `Failure` ordering used by the cmpxchg below. Verify upstream.
10730 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10731 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10732 Result->setVolatile(VolatileX);
10733 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10734 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10735 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10736 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10737 
10738 Res.first = OldExprVal;
10739 Res.second = Upd;
10740 
10741 // set Insertion point in exit block
10742 if (UnreachableInst *ExitTI =
// NOTE(review): line 10743 missing — presumably the dyn_cast<UnreachableInst>
// completing this condition, mirroring the struct path above.
10744 CurBBTI->eraseFromParent();
10745 Builder.SetInsertPoint(ExitBB);
10746 } else {
10747 Builder.SetInsertPoint(ExitTI);
10748 }
10749 }
10750 
10751 return Res;
10752}
10753
// NOTE(review): body of OpenMPIRBuilder::createAtomicCapture; the signature
// lines (10753-10755, presumably declaring Loc, AllocaIP and X used below)
// were dropped by the doc extractor — verify upstream.
// Performs the atomic update via emitAtomicUpdate and stores the captured
// value into V.Var: the pre-update value for postfix capture (`v = x; x op=
// expr`), the post-update value otherwise.
10756 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10757 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10758 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10759 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10760 if (!updateToLocation(Loc))
10761 return Loc.IP;
10762 
// Debug-only sanity checks on the operand types and operation.
10763 LLVM_DEBUG({
10764 Type *XTy = X.Var->getType();
10765 assert(XTy->isPointerTy() &&
10766 "OMP Atomic expects a pointer to target memory");
10767 Type *XElemTy = X.ElemTy;
10768 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10769 XElemTy->isPointerTy()) &&
10770 "OMP atomic capture expected a scalar type");
10771 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10772 "OpenMP atomic does not support LT or GT operations");
10773 });
10774 
10775 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10776 // 'x' is simply atomically rewritten with 'expr'.
10777 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10778 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10779 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10780 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10781 if (!AtomicResult)
10782 return AtomicResult.takeError();
// first = old value, second = updated value (see emitAtomicUpdate).
10783 Value *CapturedVal =
10784 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10785 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10786 
10787 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10788 return Builder.saveIP();
10789}
10790
10794 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10795 bool IsFailOnly) {
10796
10798 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10799 IsPostfixUpdate, IsFailOnly, Failure);
10800}
10801
10805 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10806 bool IsFailOnly, AtomicOrdering Failure) {
10807
10808 if (!updateToLocation(Loc))
10809 return Loc.IP;
10810
10811 assert(X.Var->getType()->isPointerTy() &&
10812 "OMP atomic expects a pointer to target memory");
10813 // compare capture
10814 if (V.Var) {
10815 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10816 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10817 }
10818
10819 bool IsInteger = E->getType()->isIntegerTy();
10820
10821 if (Op == OMPAtomicCompareOp::EQ) {
10822 AtomicCmpXchgInst *Result = nullptr;
10823 if (!IsInteger) {
10824 IntegerType *IntCastTy =
10825 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10826 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10827 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10828 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10829 AO, Failure);
10830 } else {
10831 Result =
10832 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10833 }
10834
10835 if (V.Var) {
10836 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10837 if (!IsInteger)
10838 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10839 assert(OldValue->getType() == V.ElemTy &&
10840 "OldValue and V must be of same type");
10841 if (IsPostfixUpdate) {
10842 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10843 } else {
10844 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10845 if (IsFailOnly) {
10846 // CurBB----
10847 // | |
10848 // v |
10849 // ContBB |
10850 // | |
10851 // v |
10852 // ExitBB <-
10853 //
10854 // where ContBB only contains the store of old value to 'v'.
10855 BasicBlock *CurBB = Builder.GetInsertBlock();
10856 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10857 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10858 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10859 CurBBTI, X.Var->getName() + ".atomic.exit");
10860 BasicBlock *ContBB = CurBB->splitBasicBlock(
10861 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10862 ContBB->getTerminator()->eraseFromParent();
10863 CurBB->getTerminator()->eraseFromParent();
10864
10865 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10866
10867 Builder.SetInsertPoint(ContBB);
10868 Builder.CreateStore(OldValue, V.Var);
10869 Builder.CreateBr(ExitBB);
10870
10871 if (UnreachableInst *ExitTI =
10873 CurBBTI->eraseFromParent();
10874 Builder.SetInsertPoint(ExitBB);
10875 } else {
10876 Builder.SetInsertPoint(ExitTI);
10877 }
10878 } else {
10879 Value *CapturedValue =
10880 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10881 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10882 }
10883 }
10884 }
10885 // The comparison result has to be stored.
10886 if (R.Var) {
10887 assert(R.Var->getType()->isPointerTy() &&
10888 "r.var must be of pointer type");
10889 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10890
10891 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10892 Value *ResultCast = R.IsSigned
10893 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10894 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10895 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10896 }
10897 } else {
10898 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10899 "Op should be either max or min at this point");
10900 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10901
10902 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10903 // Let's take max as example.
10904 // OpenMP form:
10905 // x = x > expr ? expr : x;
10906 // LLVM form:
10907 // *ptr = *ptr > val ? *ptr : val;
10908 // We need to transform to LLVM form.
10909 // x = x <= expr ? x : expr;
10911 if (IsXBinopExpr) {
10912 if (IsInteger) {
10913 if (X.IsSigned)
10914 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10916 else
10917 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10919 } else {
10920 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10922 }
10923 } else {
10924 if (IsInteger) {
10925 if (X.IsSigned)
10926 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10928 else
10929 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10931 } else {
10932 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10934 }
10935 }
10936
10937 AtomicRMWInst *OldValue =
10938 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10939 if (V.Var) {
10940 Value *CapturedValue = nullptr;
10941 if (IsPostfixUpdate) {
10942 CapturedValue = OldValue;
10943 } else {
10944 CmpInst::Predicate Pred;
10945 switch (NewOp) {
10946 case AtomicRMWInst::Max:
10947 Pred = CmpInst::ICMP_SGT;
10948 break;
10950 Pred = CmpInst::ICMP_UGT;
10951 break;
10953 Pred = CmpInst::FCMP_OGT;
10954 break;
10955 case AtomicRMWInst::Min:
10956 Pred = CmpInst::ICMP_SLT;
10957 break;
10959 Pred = CmpInst::ICMP_ULT;
10960 break;
10962 Pred = CmpInst::FCMP_OLT;
10963 break;
10964 default:
10965 llvm_unreachable("unexpected comparison op");
10966 }
10967 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10968 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10969 }
10970 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10971 }
10972 }
10973
10974 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10975
10976 return Builder.saveIP();
10977}
10978
10981 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10982 Value *NumTeamsUpper, Value *ThreadLimit,
10983 Value *IfExpr) {
10984 if (!updateToLocation(Loc))
10985 return InsertPointTy();
10986
10987 uint32_t SrcLocStrSize;
10988 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10989 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10990 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10991
10992 // Outer allocation basicblock is the entry block of the current function.
10993 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10994 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10995 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10996 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10997 }
10998
10999 // The current basic block is split into four basic blocks. After outlining,
11000 // they will be mapped as follows:
11001 // ```
11002 // def current_fn() {
11003 // current_basic_block:
11004 // br label %teams.exit
11005 // teams.exit:
11006 // ; instructions after teams
11007 // }
11008 //
11009 // def outlined_fn() {
11010 // teams.alloca:
11011 // br label %teams.body
11012 // teams.body:
11013 // ; instructions within teams body
11014 // }
11015 // ```
11016 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
11017 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
11018 BasicBlock *AllocaBB =
11019 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
11020
11021 bool SubClausesPresent =
11022 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
11023 // Push num_teams
11024 if (!Config.isTargetDevice() && SubClausesPresent) {
11025 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
11026 "if lowerbound is non-null, then upperbound must also be non-null "
11027 "for bounds on num_teams");
11028
11029 if (NumTeamsUpper == nullptr)
11030 NumTeamsUpper = Builder.getInt32(0);
11031
11032 if (NumTeamsLower == nullptr)
11033 NumTeamsLower = NumTeamsUpper;
11034
11035 if (IfExpr) {
11036 assert(IfExpr->getType()->isIntegerTy() &&
11037 "argument to if clause must be an integer value");
11038
11039 // upper = ifexpr ? upper : 1
11040 if (IfExpr->getType() != Int1)
11041 IfExpr = Builder.CreateICmpNE(IfExpr,
11042 ConstantInt::get(IfExpr->getType(), 0));
11043 NumTeamsUpper = Builder.CreateSelect(
11044 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
11045
11046 // lower = ifexpr ? lower : 1
11047 NumTeamsLower = Builder.CreateSelect(
11048 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
11049 }
11050
11051 if (ThreadLimit == nullptr)
11052 ThreadLimit = Builder.getInt32(0);
11053
11054 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
11055 // truncate or sign extend the passed values to match the int32 parameters.
11056 Value *NumTeamsLowerInt32 =
11057 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
11058 Value *NumTeamsUpperInt32 =
11059 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
11060 Value *ThreadLimitInt32 =
11061 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
11062
11063 Value *ThreadNum = getOrCreateThreadID(Ident);
11064
11066 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
11067 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
11068 ThreadLimitInt32});
11069 }
11070 // Generate the body of teams.
11071 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11072 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11073 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11074 return Err;
11075
11076 OutlineInfo OI;
11077 OI.EntryBB = AllocaBB;
11078 OI.ExitBB = ExitBB;
11079 OI.OuterAllocaBB = &OuterAllocaBB;
11080
11081 // Insert fake values for global tid and bound tid.
11083 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11085 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11087 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11088
11089 auto HostPostOutlineCB = [this, Ident,
11090 ToBeDeleted](Function &OutlinedFn) mutable {
11091 // The stale call instruction will be replaced with a new call instruction
11092 // for runtime call with the outlined function.
11093
11094 assert(OutlinedFn.hasOneUse() &&
11095 "there must be a single user for the outlined function");
11096 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11097 ToBeDeleted.push_back(StaleCI);
11098
11099 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11100 "Outlined function must have two or three arguments only");
11101
11102 bool HasShared = OutlinedFn.arg_size() == 3;
11103
11104 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11105 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11106 if (HasShared)
11107 OutlinedFn.getArg(2)->setName("data");
11108
11109 // Call to the runtime function for teams in the current function.
11110 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11111 "outlined function.");
11112 Builder.SetInsertPoint(StaleCI);
11113 SmallVector<Value *> Args = {
11114 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11115 if (HasShared)
11116 Args.push_back(StaleCI->getArgOperand(2));
11119 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11120 Args);
11121
11122 for (Instruction *I : llvm::reverse(ToBeDeleted))
11123 I->eraseFromParent();
11124 };
11125
11126 if (!Config.isTargetDevice())
11127 OI.PostOutlineCB = HostPostOutlineCB;
11128
11129 addOutlineInfo(std::move(OI));
11130
11131 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11132
11133 return Builder.saveIP();
11134}
11135
11138 InsertPointTy OuterAllocaIP,
11139 BodyGenCallbackTy BodyGenCB) {
11140 if (!updateToLocation(Loc))
11141 return InsertPointTy();
11142
11143 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
11144
11145 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11146 BasicBlock *BodyBB =
11147 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11148 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11149 }
11150 BasicBlock *ExitBB =
11151 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11152 BasicBlock *BodyBB =
11153 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11154 BasicBlock *AllocaBB =
11155 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11156
11157 // Generate the body of distribute clause
11158 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11159 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11160 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11161 return Err;
11162
11163 // When using target we use different runtime functions which require a
11164 // callback.
11165 if (Config.isTargetDevice()) {
11166 OutlineInfo OI;
11167 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
11168 OI.EntryBB = AllocaBB;
11169 OI.ExitBB = ExitBB;
11170
11171 addOutlineInfo(std::move(OI));
11172 }
11173 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11174
11175 return Builder.saveIP();
11176}
11177
11180 std::string VarName) {
11181 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11183 Names.size()),
11184 Names);
11185 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11186 M, MapNamesArrayInit->getType(),
11187 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11188 VarName);
11189 return MapNamesArrayGlobal;
11190}
11191
11192// Create all simple and struct types exposed by the runtime and remember
11193// the llvm::PointerTypes of them for easy access later.
11194void OpenMPIRBuilder::initializeTypes(Module &M) {
11195 LLVMContext &Ctx = M.getContext();
11196 StructType *T;
11197 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
11198 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
11199#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
11200#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11201 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11202 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
11203#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11204 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11205 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
11206#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11207 T = StructType::getTypeByName(Ctx, StructName); \
11208 if (!T) \
11209 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11210 VarName = T; \
11211 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
11212#include "llvm/Frontend/OpenMP/OMPKinds.def"
11213}
11214
11217 SmallVectorImpl<BasicBlock *> &BlockVector) {
11219 BlockSet.insert(EntryBB);
11220 BlockSet.insert(ExitBB);
11221
11222 Worklist.push_back(EntryBB);
11223 while (!Worklist.empty()) {
11224 BasicBlock *BB = Worklist.pop_back_val();
11225 BlockVector.push_back(BB);
11226 for (BasicBlock *SuccBB : successors(BB))
11227 if (BlockSet.insert(SuccBB).second)
11228 Worklist.push_back(SuccBB);
11229 }
11230}
11231
11233 uint64_t Size, int32_t Flags,
11235 StringRef Name) {
11236 if (!Config.isGPU()) {
11239 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11240 return;
11241 }
11242 // TODO: Add support for global variables on the device after declare target
11243 // support.
11244 Function *Fn = dyn_cast<Function>(Addr);
11245 if (!Fn)
11246 return;
11247
11248 // Add a function attribute for the kernel.
11249 Fn->addFnAttr("kernel");
11250 if (T.isAMDGCN())
11251 Fn->addFnAttr("uniform-work-group-size");
11252 Fn->addFnAttr(Attribute::MustProgress);
11253}
11254
11255// We only generate metadata for function that contain target regions.
11258
11259 // If there are no entries, we don't need to do anything.
11260 if (OffloadInfoManager.empty())
11261 return;
11262
11263 LLVMContext &C = M.getContext();
11266 16>
11267 OrderedEntries(OffloadInfoManager.size());
11268
11269 // Auxiliary methods to create metadata values and strings.
11270 auto &&GetMDInt = [this](unsigned V) {
11271 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11272 };
11273
11274 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11275
11276 // Create the offloading info metadata node.
11277 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11278 auto &&TargetRegionMetadataEmitter =
11279 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11280 const TargetRegionEntryInfo &EntryInfo,
11282 // Generate metadata for target regions. Each entry of this metadata
11283 // contains:
11284 // - Entry 0 -> Kind of this type of metadata (0).
11285 // - Entry 1 -> Device ID of the file where the entry was identified.
11286 // - Entry 2 -> File ID of the file where the entry was identified.
11287 // - Entry 3 -> Mangled name of the function where the entry was
11288 // identified.
11289 // - Entry 4 -> Line in the file where the entry was identified.
11290 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11291 // - Entry 6 -> Order the entry was created.
11292 // The first element of the metadata node is the kind.
11293 Metadata *Ops[] = {
11294 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11295 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11296 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11297 GetMDInt(E.getOrder())};
11298
11299 // Save this entry in the right position of the ordered entries array.
11300 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11301
11302 // Add metadata to the named metadata node.
11303 MD->addOperand(MDNode::get(C, Ops));
11304 };
11305
11306 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11307
11308 // Create function that emits metadata for each device global variable entry;
11309 auto &&DeviceGlobalVarMetadataEmitter =
11310 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11311 StringRef MangledName,
11313 // Generate metadata for global variables. Each entry of this metadata
11314 // contains:
11315 // - Entry 0 -> Kind of this type of metadata (1).
11316 // - Entry 1 -> Mangled name of the variable.
11317 // - Entry 2 -> Declare target kind.
11318 // - Entry 3 -> Order the entry was created.
11319 // The first element of the metadata node is the kind.
11320 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11321 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11322
11323 // Save this entry in the right position of the ordered entries array.
11324 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11325 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11326
11327 // Add metadata to the named metadata node.
11328 MD->addOperand(MDNode::get(C, Ops));
11329 };
11330
11331 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11332 DeviceGlobalVarMetadataEmitter);
11333
11334 for (const auto &E : OrderedEntries) {
11335 assert(E.first && "All ordered entries must exist!");
11336 if (const auto *CE =
11338 E.first)) {
11339 if (!CE->getID() || !CE->getAddress()) {
11340 // Do not blame the entry if the parent funtion is not emitted.
11341 TargetRegionEntryInfo EntryInfo = E.second;
11342 StringRef FnName = EntryInfo.ParentName;
11343 if (!M.getNamedValue(FnName))
11344 continue;
11345 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11346 continue;
11347 }
11348 createOffloadEntry(CE->getID(), CE->getAddress(),
11349 /*Size=*/0, CE->getFlags(),
11351 } else if (const auto *CE = dyn_cast<
11353 E.first)) {
11356 CE->getFlags());
11357 switch (Flags) {
11360 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11361 continue;
11362 if (!CE->getAddress()) {
11363 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11364 continue;
11365 }
11366 // The vaiable has no definition - no need to add the entry.
11367 if (CE->getVarSize() == 0)
11368 continue;
11369 break;
11371 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11372 (!Config.isTargetDevice() && CE->getAddress())) &&
11373 "Declaret target link address is set.");
11374 if (Config.isTargetDevice())
11375 continue;
11376 if (!CE->getAddress()) {
11378 continue;
11379 }
11380 break;
11383 if (!CE->getAddress()) {
11384 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11385 continue;
11386 }
11387 break;
11388 default:
11389 break;
11390 }
11391
11392 // Hidden or internal symbols on the device are not externally visible.
11393 // We should not attempt to register them by creating an offloading
11394 // entry. Indirect variables are handled separately on the device.
11395 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11396 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11397 (Flags !=
11399 Flags != OffloadEntriesInfoManager::
11400 OMPTargetGlobalVarEntryIndirectVTable))
11401 continue;
11402
11403 // Indirect globals need to use a special name that doesn't match the name
11404 // of the associated host global.
11406 Flags ==
11408 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11409 Flags, CE->getLinkage(), CE->getVarName());
11410 else
11411 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11412 Flags, CE->getLinkage());
11413
11414 } else {
11415 llvm_unreachable("Unsupported entry kind.");
11416 }
11417 }
11418
11419 // Emit requires directive globals to a special entry so the runtime can
11420 // register them when the device image is loaded.
11421 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11422 // entries should be redesigned to better suit this use-case.
11423 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11427 ".requires", /*Size=*/0,
11429 Config.getRequiresFlags());
11430}
11431
11434 unsigned FileID, unsigned Line, unsigned Count) {
11435 raw_svector_ostream OS(Name);
11436 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11437 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11438 if (Count)
11439 OS << "_" << Count;
11440}
11441
11443 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11444 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11446 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11447 EntryInfo.Line, NewCount);
11448}
11449
11452 vfs::FileSystem &VFS,
11453 StringRef ParentName) {
11454 sys::fs::UniqueID ID(0xdeadf17e, 0);
11455 auto FileIDInfo = CallBack();
11456 uint64_t FileID = 0;
11457 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11458 ID = Status->getUniqueID();
11459 FileID = Status->getUniqueID().getFile();
11460 } else {
11461 // If the inode ID could not be determined, create a hash value
11462 // the current file name and use that as an ID.
11463 FileID = hash_value(std::get<0>(FileIDInfo));
11464 }
11465
11466 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11467 std::get<1>(FileIDInfo));
11468}
11469
11471 unsigned Offset = 0;
11472 for (uint64_t Remain =
11473 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11475 !(Remain & 1); Remain = Remain >> 1)
11476 Offset++;
11477 return Offset;
11478}
11479
11482 // Rotate by getFlagMemberOffset() bits.
11483 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11484 << getFlagMemberOffset());
11485}
11486
11489 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11490 // If the entry is PTR_AND_OBJ but has not been marked with the special
11491 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11492 // marked as MEMBER_OF.
11493 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11495 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11498 return;
11499
11500 // Entries with ATTACH are not members-of anything. They are handled
11501 // separately by the runtime after other maps have been handled.
11502 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11504 return;
11505
11506 // Reset the placeholder value to prepare the flag for the assignment of the
11507 // proper MEMBER_OF value.
11508 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11509 Flags |= MemberOfFlag;
11510}
11511
11515 bool IsDeclaration, bool IsExternallyVisible,
11516 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11517 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11518 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11519 std::function<Constant *()> GlobalInitializer,
11520 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11521 // TODO: convert this to utilise the IRBuilder Config rather than
11522 // a passed down argument.
11523 if (OpenMPSIMD)
11524 return nullptr;
11525
11528 CaptureClause ==
11530 Config.hasRequiresUnifiedSharedMemory())) {
11531 SmallString<64> PtrName;
11532 {
11533 raw_svector_ostream OS(PtrName);
11534 OS << MangledName;
11535 if (!IsExternallyVisible)
11536 OS << format("_%x", EntryInfo.FileID);
11537 OS << "_decl_tgt_ref_ptr";
11538 }
11539
11540 Value *Ptr = M.getNamedValue(PtrName);
11541
11542 if (!Ptr) {
11543 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11544 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11545
11546 auto *GV = cast<GlobalVariable>(Ptr);
11547 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11548
11549 if (!Config.isTargetDevice()) {
11550 if (GlobalInitializer)
11551 GV->setInitializer(GlobalInitializer());
11552 else
11553 GV->setInitializer(GlobalValue);
11554 }
11555
11557 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11558 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11559 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11560 }
11561
11562 return cast<Constant>(Ptr);
11563 }
11564
11565 return nullptr;
11566}
11567
11571 bool IsDeclaration, bool IsExternallyVisible,
11572 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11573 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11574 std::vector<Triple> TargetTriple,
11575 std::function<Constant *()> GlobalInitializer,
11576 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11577 Constant *Addr) {
11579 (TargetTriple.empty() && !Config.isTargetDevice()))
11580 return;
11581
11583 StringRef VarName;
11584 int64_t VarSize;
11586
11588 CaptureClause ==
11590 !Config.hasRequiresUnifiedSharedMemory()) {
11592 VarName = MangledName;
11593 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11594
11595 if (!IsDeclaration)
11596 VarSize = divideCeil(
11597 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11598 else
11599 VarSize = 0;
11600 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11601
11602 // This is a workaround carried over from Clang which prevents undesired
11603 // optimisation of internal variables.
11604 if (Config.isTargetDevice() &&
11605 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11606 // Do not create a "ref-variable" if the original is not also available
11607 // on the host.
11608 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11609 return;
11610
11611 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11612
11613 if (!M.getNamedValue(RefName)) {
11614 Constant *AddrRef =
11615 getOrCreateInternalVariable(Addr->getType(), RefName);
11616 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11617 GvAddrRef->setConstant(true);
11618 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11619 GvAddrRef->setInitializer(Addr);
11620 GeneratedRefs.push_back(GvAddrRef);
11621 }
11622 }
11623 } else {
11626 else
11628
11629 if (Config.isTargetDevice()) {
11630 VarName = (Addr) ? Addr->getName() : "";
11631 Addr = nullptr;
11632 } else {
11634 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11635 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11636 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11637 VarName = (Addr) ? Addr->getName() : "";
11638 }
11639 VarSize = M.getDataLayout().getPointerSize();
11641 }
11642
11643 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11644 Flags, Linkage);
11645}
11646
11647/// Loads all the offload entries information from the host IR
11648/// metadata.
11650 // If we are in target mode, load the metadata from the host IR. This code has
11651 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11652
11653 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11654 if (!MD)
11655 return;
11656
11657 for (MDNode *MN : MD->operands()) {
11658 auto &&GetMDInt = [MN](unsigned Idx) {
11659 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11660 return cast<ConstantInt>(V->getValue())->getZExtValue();
11661 };
11662
11663 auto &&GetMDString = [MN](unsigned Idx) {
11664 auto *V = cast<MDString>(MN->getOperand(Idx));
11665 return V->getString();
11666 };
11667
11668 switch (GetMDInt(0)) {
11669 default:
11670 llvm_unreachable("Unexpected metadata!");
11671 break;
11672 case OffloadEntriesInfoManager::OffloadEntryInfo::
11673 OffloadingEntryInfoTargetRegion: {
11674 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11675 /*DeviceID=*/GetMDInt(1),
11676 /*FileID=*/GetMDInt(2),
11677 /*Line=*/GetMDInt(4),
11678 /*Count=*/GetMDInt(5));
11679 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11680 /*Order=*/GetMDInt(6));
11681 break;
11682 }
11683 case OffloadEntriesInfoManager::OffloadEntryInfo::
11684 OffloadingEntryInfoDeviceGlobalVar:
11685 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11686 /*MangledName=*/GetMDString(1),
11688 /*Flags=*/GetMDInt(2)),
11689 /*Order=*/GetMDInt(3));
11690 break;
11691 }
11692 }
11693}
11694
11696 StringRef HostFilePath) {
11697 if (HostFilePath.empty())
11698 return;
11699
11700 auto Buf = VFS.getBufferForFile(HostFilePath);
11701 if (std::error_code Err = Buf.getError()) {
11702 report_fatal_error(("error opening host file from host file path inside of "
11703 "OpenMPIRBuilder: " +
11704 Err.message())
11705 .c_str());
11706 }
11707
11708 LLVMContext Ctx;
11710 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11711 if (std::error_code Err = M.getError()) {
11713 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11714 .c_str());
11715 }
11716
11717 loadOffloadInfoMetadata(*M.get());
11718}
11719
11722 llvm::StringRef Name) {
11723 Builder.restoreIP(Loc.IP);
11724
11725 BasicBlock *CurBB = Builder.GetInsertBlock();
11726 assert(CurBB &&
11727 "expected a valid insertion block for creating an iterator loop");
11728 Function *F = CurBB->getParent();
11729
11730 InsertPointTy SplitIP = Builder.saveIP();
11731 if (SplitIP.getPoint() == CurBB->end())
11732 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
11733 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
11734
11735 BasicBlock *ContBB =
11736 splitBB(SplitIP, /*CreateBranch=*/false,
11737 Builder.getCurrentDebugLocation(), "omp.it.cont");
11738
11739 CanonicalLoopInfo *CLI =
11740 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
11741 /*PreInsertBefore=*/ContBB,
11742 /*PostInsertBefore=*/ContBB, Name);
11743
11744 // Enter loop from original block.
11745 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
11746
11747 // Remove the unconditional branch inserted by createLoopSkeleton in the body
11748 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
11749 T->eraseFromParent();
11750
11751 InsertPointTy BodyIP = CLI->getBodyIP();
11752 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
11753 return Err;
11754
11755 // Body must either fallthrough to the latch or branch directly to it.
11756 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
11757 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
11758 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
11760 "iterator bodygen must terminate the canonical body with an "
11761 "unconditional branch to the loop latch",
11763 }
11764 } else {
11765 // Ensure we end the loop body by jumping to the latch.
11766 Builder.SetInsertPoint(CLI->getBody());
11767 Builder.CreateBr(CLI->getLatch());
11768 }
11769
11770 // Link After -> ContBB
11771 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
11772 if (!CLI->getAfter()->hasTerminator())
11773 Builder.CreateBr(ContBB);
11774
11775 return InsertPointTy{ContBB, ContBB->begin()};
11776}
11777
11778/// Mangle the parameter part of the vector function name according to
11779/// their OpenMP classification. The mangling function is defined in
11780/// section 4.5 of the AAVFABI(2021Q1).
11781static std::string mangleVectorParameters(
11783 SmallString<256> Buffer;
11784 llvm::raw_svector_ostream Out(Buffer);
11785 for (const auto &ParamAttr : ParamAttrs) {
11786 switch (ParamAttr.Kind) {
11788 Out << 'l';
11789 break;
11791 Out << 'R';
11792 break;
11794 Out << 'U';
11795 break;
11797 Out << 'L';
11798 break;
11800 Out << 'u';
11801 break;
11803 Out << 'v';
11804 break;
11805 }
11806 if (ParamAttr.HasVarStride)
11807 Out << "s" << ParamAttr.StrideOrArg;
11808 else if (ParamAttr.Kind ==
11810 ParamAttr.Kind ==
11812 ParamAttr.Kind ==
11814 ParamAttr.Kind ==
11816 // Don't print the step value if it is not present or if it is
11817 // equal to 1.
11818 if (ParamAttr.StrideOrArg < 0)
11819 Out << 'n' << -ParamAttr.StrideOrArg;
11820 else if (ParamAttr.StrideOrArg != 1)
11821 Out << ParamAttr.StrideOrArg;
11822 }
11823
11824 if (!!ParamAttr.Alignment)
11825 Out << 'a' << ParamAttr.Alignment;
11826 }
11827
11828 return std::string(Out.str());
11829}
11830
11832 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
11834 struct ISADataTy {
11835 char ISA;
11836 unsigned VecRegSize;
11837 };
11838 ISADataTy ISAData[] = {
11839 {'b', 128}, // SSE
11840 {'c', 256}, // AVX
11841 {'d', 256}, // AVX2
11842 {'e', 512}, // AVX512
11843 };
11845 switch (Branch) {
11847 Masked.push_back('N');
11848 Masked.push_back('M');
11849 break;
11851 Masked.push_back('N');
11852 break;
11854 Masked.push_back('M');
11855 break;
11856 }
11857 for (char Mask : Masked) {
11858 for (const ISADataTy &Data : ISAData) {
11860 llvm::raw_svector_ostream Out(Buffer);
11861 Out << "_ZGV" << Data.ISA << Mask;
11862 if (!VLENVal) {
11863 assert(NumElts && "Non-zero simdlen/cdtsize expected");
11864 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
11865 } else {
11866 Out << VLENVal;
11867 }
11868 Out << mangleVectorParameters(ParamAttrs);
11869 Out << '_' << Fn->getName();
11870 Fn->addFnAttr(Out.str());
11871 }
11872 }
11873}
11874
11875// Function used to add the attribute. The parameter `VLEN` is templated to
11876// allow the use of `x` when targeting scalable functions for SVE.
11877template <typename T>
11878static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
11879 char ISA, StringRef ParSeq,
11880 StringRef MangledName, bool OutputBecomesInput,
11881 llvm::Function *Fn) {
11882 SmallString<256> Buffer;
11883 llvm::raw_svector_ostream Out(Buffer);
11884 Out << Prefix << ISA << LMask << VLEN;
11885 if (OutputBecomesInput)
11886 Out << 'v';
11887 Out << ParSeq << '_' << MangledName;
11888 Fn->addFnAttr(Out.str());
11889}
11890
11891// Helper function to generate the Advanced SIMD names depending on the value
11892// of the NDS when simdlen is not present.
11893static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
11894 StringRef Prefix, char ISA,
11895 StringRef ParSeq, StringRef MangledName,
11896 bool OutputBecomesInput,
11897 llvm::Function *Fn) {
11898 switch (NDS) {
11899 case 8:
11900 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11901 OutputBecomesInput, Fn);
11902 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
11903 OutputBecomesInput, Fn);
11904 break;
11905 case 16:
11906 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11907 OutputBecomesInput, Fn);
11908 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11909 OutputBecomesInput, Fn);
11910 break;
11911 case 32:
11912 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11913 OutputBecomesInput, Fn);
11914 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11915 OutputBecomesInput, Fn);
11916 break;
11917 case 64:
11918 case 128:
11919 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11920 OutputBecomesInput, Fn);
11921 break;
11922 default:
11923 llvm_unreachable("Scalar type is too wide.");
11924 }
11925}
11926
11927/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
11929 llvm::Function *Fn, unsigned UserVLEN,
11931 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
11932 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
11933
11934 // Sort out parameter sequence.
11935 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
11936 StringRef Prefix = "_ZGV";
11937 StringRef MangledName = Fn->getName();
11938
11939 // Generate simdlen from user input (if any).
11940 if (UserVLEN) {
11941 if (ISA == 's') {
11942 // SVE generates only a masked function.
11943 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11944 OutputBecomesInput, Fn);
11945 return;
11946 }
11947
11948 switch (Branch) {
11950 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11951 OutputBecomesInput, Fn);
11952 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11953 OutputBecomesInput, Fn);
11954 break;
11956 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11957 OutputBecomesInput, Fn);
11958 break;
11960 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11961 OutputBecomesInput, Fn);
11962 break;
11963 }
11964 return;
11965 }
11966
11967 if (ISA == 's') {
11968 // SVE, section 3.4.1, item 1.
11969 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
11970 OutputBecomesInput, Fn);
11971 return;
11972 }
11973
11974 switch (Branch) {
11976 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11977 MangledName, OutputBecomesInput, Fn);
11978 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11979 MangledName, OutputBecomesInput, Fn);
11980 break;
11982 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11983 MangledName, OutputBecomesInput, Fn);
11984 break;
11986 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11987 MangledName, OutputBecomesInput, Fn);
11988 break;
11989 }
11990}
11991
11992//===----------------------------------------------------------------------===//
11993// OffloadEntriesInfoManager
11994//===----------------------------------------------------------------------===//
11995
11997 return OffloadEntriesTargetRegion.empty() &&
11998 OffloadEntriesDeviceGlobalVar.empty();
11999}
12000
12001unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
12002 const TargetRegionEntryInfo &EntryInfo) const {
12003 auto It = OffloadEntriesTargetRegionCount.find(
12004 getTargetRegionEntryCountKey(EntryInfo));
12005 if (It == OffloadEntriesTargetRegionCount.end())
12006 return 0;
12007 return It->second;
12008}
12009
12010void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
12011 const TargetRegionEntryInfo &EntryInfo) {
12012 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
12013 EntryInfo.Count + 1;
12014}
12015
12016/// Initialize target region entry.
12018 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
12019 OffloadEntriesTargetRegion[EntryInfo] =
12020 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
12022 ++OffloadingEntriesNum;
12023}
12024
12026 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
12028 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
12029
12030 // Update the EntryInfo with the next available count for this location.
12031 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12032
12033 // If we are emitting code for a target, the entry is already initialized,
12034 // only has to be registered.
12035 if (OMPBuilder->Config.isTargetDevice()) {
12036 // This could happen if the device compilation is invoked standalone.
12037 if (!hasTargetRegionEntryInfo(EntryInfo)) {
12038 return;
12039 }
12040 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
12041 Entry.setAddress(Addr);
12042 Entry.setID(ID);
12043 Entry.setFlags(Flags);
12044 } else {
12046 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
12047 return;
12048 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
12049 "Target region entry already registered!");
12050 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
12051 OffloadEntriesTargetRegion[EntryInfo] = Entry;
12052 ++OffloadingEntriesNum;
12053 }
12054 incrementTargetRegionEntryInfoCount(EntryInfo);
12055}
12056
12058 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
12059
12060 // Update the EntryInfo with the next available count for this location.
12061 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12062
12063 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
12064 if (It == OffloadEntriesTargetRegion.end()) {
12065 return false;
12066 }
12067 // Fail if this entry is already registered.
12068 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
12069 return false;
12070 return true;
12071}
12072
12074 const OffloadTargetRegionEntryInfoActTy &Action) {
12075 // Scan all target region entries and perform the provided action.
12076 for (const auto &It : OffloadEntriesTargetRegion) {
12077 Action(It.first, It.second);
12078 }
12079}
12080
12082 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
12083 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12084 ++OffloadingEntriesNum;
12085}
12086
12088 StringRef VarName, Constant *Addr, int64_t VarSize,
12090 if (OMPBuilder->Config.isTargetDevice()) {
12091 // This could happen if the device compilation is invoked standalone.
12092 if (!hasDeviceGlobalVarEntryInfo(VarName))
12093 return;
12094 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12095 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12096 if (Entry.getVarSize() == 0) {
12097 Entry.setVarSize(VarSize);
12098 Entry.setLinkage(Linkage);
12099 }
12100 return;
12101 }
12102 Entry.setVarSize(VarSize);
12103 Entry.setLinkage(Linkage);
12104 Entry.setAddress(Addr);
12105 } else {
12106 if (hasDeviceGlobalVarEntryInfo(VarName)) {
12107 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12108 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12109 "Entry not initialized!");
12110 if (Entry.getVarSize() == 0) {
12111 Entry.setVarSize(VarSize);
12112 Entry.setLinkage(Linkage);
12113 }
12114 return;
12115 }
12117 Flags ==
12119 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12120 Addr, VarSize, Flags, Linkage,
12121 VarName.str());
12122 else
12123 OffloadEntriesDeviceGlobalVar.try_emplace(
12124 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12125 ++OffloadingEntriesNum;
12126 }
12127}
12128
12131 // Scan all target region entries and perform the provided action.
12132 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12133 Action(E.getKey(), E.getValue());
12134}
12135
12136//===----------------------------------------------------------------------===//
12137// CanonicalLoopInfo
12138//===----------------------------------------------------------------------===//
12139
12140void CanonicalLoopInfo::collectControlBlocks(
12142 // We only count those BBs as control block for which we do not need to
12143 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12144 // flow. For consistency, this also means we do not add the Body block, which
12145 // is just the entry to the body code.
12146 BBs.reserve(BBs.size() + 6);
12147 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12148}
12149
12151 assert(isValid() && "Requires a valid canonical loop");
12152 for (BasicBlock *Pred : predecessors(Header)) {
12153 if (Pred != Latch)
12154 return Pred;
12155 }
12156 llvm_unreachable("Missing preheader");
12157}
12158
12159void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12160 assert(isValid() && "Requires a valid canonical loop");
12161
12162 Instruction *CmpI = &getCond()->front();
12163 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12164 CmpI->setOperand(1, TripCount);
12165
12166#ifndef NDEBUG
12167 assertOK();
12168#endif
12169}
12170
12171void CanonicalLoopInfo::mapIndVar(
12172 llvm::function_ref<Value *(Instruction *)> Updater) {
12173 assert(isValid() && "Requires a valid canonical loop");
12174
12175 Instruction *OldIV = getIndVar();
12176
12177 // Record all uses excluding those introduced by the updater. Uses by the
12178 // CanonicalLoopInfo itself to keep track of the number of iterations are
12179 // excluded.
12180 SmallVector<Use *> ReplacableUses;
12181 for (Use &U : OldIV->uses()) {
12182 auto *User = dyn_cast<Instruction>(U.getUser());
12183 if (!User)
12184 continue;
12185 if (User->getParent() == getCond())
12186 continue;
12187 if (User->getParent() == getLatch())
12188 continue;
12189 ReplacableUses.push_back(&U);
12190 }
12191
12192 // Run the updater that may introduce new uses
12193 Value *NewIV = Updater(OldIV);
12194
12195 // Replace the old uses with the value returned by the updater.
12196 for (Use *U : ReplacableUses)
12197 U->set(NewIV);
12198
12199#ifndef NDEBUG
12200 assertOK();
12201#endif
12202}
12203
12205#ifndef NDEBUG
12206 // No constraints if this object currently does not describe a loop.
12207 if (!isValid())
12208 return;
12209
12210 BasicBlock *Preheader = getPreheader();
12211 BasicBlock *Body = getBody();
12212 BasicBlock *After = getAfter();
12213
12214 // Verify standard control-flow we use for OpenMP loops.
12215 assert(Preheader);
12216 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12217 "Preheader must terminate with unconditional branch");
12218 assert(Preheader->getSingleSuccessor() == Header &&
12219 "Preheader must jump to header");
12220
12221 assert(Header);
12222 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12223 "Header must terminate with unconditional branch");
12224 assert(Header->getSingleSuccessor() == Cond &&
12225 "Header must jump to exiting block");
12226
12227 assert(Cond);
12228 assert(Cond->getSinglePredecessor() == Header &&
12229 "Exiting block only reachable from header");
12230
12231 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12232 "Exiting block must terminate with conditional branch");
12233 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12234 "Exiting block's first successor jump to the body");
12235 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12236 "Exiting block's second successor must exit the loop");
12237
12238 assert(Body);
12239 assert(Body->getSinglePredecessor() == Cond &&
12240 "Body only reachable from exiting block");
12241 assert(!isa<PHINode>(Body->front()));
12242
12243 assert(Latch);
12244 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12245 "Latch must terminate with unconditional branch");
12246 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12247 // TODO: To support simple redirecting of the end of the body code that has
12248 // multiple; introduce another auxiliary basic block like preheader and after.
12249 assert(Latch->getSinglePredecessor() != nullptr);
12250 assert(!isa<PHINode>(Latch->front()));
12251
12252 assert(Exit);
12253 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12254 "Exit block must terminate with unconditional branch");
12255 assert(Exit->getSingleSuccessor() == After &&
12256 "Exit block must jump to after block");
12257
12258 assert(After);
12259 assert(After->getSinglePredecessor() == Exit &&
12260 "After block only reachable from exit block");
12261 assert(After->empty() || !isa<PHINode>(After->front()));
12262
12263 Instruction *IndVar = getIndVar();
12264 assert(IndVar && "Canonical induction variable not found?");
12265 assert(isa<IntegerType>(IndVar->getType()) &&
12266 "Induction variable must be an integer");
12267 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12268 "Induction variable must be a PHI in the loop header");
12269 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12270 assert(
12271 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12272 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12273
12274 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12275 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12276 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12277 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12278 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12279 ->isOne());
12280
12281 Value *TripCount = getTripCount();
12282 assert(TripCount && "Loop trip count not found?");
12283 assert(IndVar->getType() == TripCount->getType() &&
12284 "Trip count and induction variable must have the same type");
12285
12286 auto *CmpI = cast<CmpInst>(&Cond->front());
12287 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12288 "Exit condition must be a signed less-than comparison");
12289 assert(CmpI->getOperand(0) == IndVar &&
12290 "Exit condition must compare the induction variable");
12291 assert(CmpI->getOperand(1) == TripCount &&
12292 "Exit condition must compare with the trip count");
12293#endif
12294}
12295
12297 Header = nullptr;
12298 Cond = nullptr;
12299 Latch = nullptr;
12300 Exit = nullptr;
12301}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2822
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:990
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies={}, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitTaskDependency(IRBuilderBase &Builder, Value *Entry, const DependData &Dep)
Store one kmp_depend_info entry at the given Entry pointer.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI CallInst * createOMPAlignedAlloc(const LocationDescription &Loc, Value *Align, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_align_alloc.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, const DependenciesInfo &Dependencies={}, const AffinityData &Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp taskloop
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1041
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1101
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1115
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:167
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:87
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:375
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
omp::RTLDependenceKindTy DepKind
A struct to pack static and dynamic dependency information for a task.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...