LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
65
66#include <cstdint>
67#include <optional>
68
69#define DEBUG_TYPE "openmp-ir-builder"
70
71using namespace llvm;
72using namespace omp;
73
// Command-line flag: when set, runtime calls created by this builder are
// annotated with "optimistic" attributes that describe 'as-if' properties of
// the OpenMP runtime (off by default).
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));
79
81 "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
82 cl::desc("Factor for the unroll threshold to account for code "
83 "simplifications still taking place"),
84 cl::init(1.5));
85
86#ifndef NDEBUG
87/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
88/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
89/// an InsertPoint stores the instruction before something is inserted. For
90/// instance, if both point to the same instruction, two IRBuilders alternating
91/// creating instruction will cause the instructions to be interleaved.
94 if (!IP1.isSet() || !IP2.isSet())
95 return false;
96 return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
97}
98
100 // Valid ordered/unordered and base algorithm combinations.
101 switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
102 case OMPScheduleType::UnorderedStaticChunked:
103 case OMPScheduleType::UnorderedStatic:
104 case OMPScheduleType::UnorderedDynamicChunked:
105 case OMPScheduleType::UnorderedGuidedChunked:
106 case OMPScheduleType::UnorderedRuntime:
107 case OMPScheduleType::UnorderedAuto:
108 case OMPScheduleType::UnorderedTrapezoidal:
109 case OMPScheduleType::UnorderedGreedy:
110 case OMPScheduleType::UnorderedBalanced:
111 case OMPScheduleType::UnorderedGuidedIterativeChunked:
112 case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
113 case OMPScheduleType::UnorderedSteal:
114 case OMPScheduleType::UnorderedStaticBalancedChunked:
115 case OMPScheduleType::UnorderedGuidedSimd:
116 case OMPScheduleType::UnorderedRuntimeSimd:
117 case OMPScheduleType::OrderedStaticChunked:
118 case OMPScheduleType::OrderedStatic:
119 case OMPScheduleType::OrderedDynamicChunked:
120 case OMPScheduleType::OrderedGuidedChunked:
121 case OMPScheduleType::OrderedRuntime:
122 case OMPScheduleType::OrderedAuto:
123 case OMPScheduleType::OrderdTrapezoidal:
124 case OMPScheduleType::NomergeUnorderedStaticChunked:
125 case OMPScheduleType::NomergeUnorderedStatic:
126 case OMPScheduleType::NomergeUnorderedDynamicChunked:
127 case OMPScheduleType::NomergeUnorderedGuidedChunked:
128 case OMPScheduleType::NomergeUnorderedRuntime:
129 case OMPScheduleType::NomergeUnorderedAuto:
130 case OMPScheduleType::NomergeUnorderedTrapezoidal:
131 case OMPScheduleType::NomergeUnorderedGreedy:
132 case OMPScheduleType::NomergeUnorderedBalanced:
133 case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
134 case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
135 case OMPScheduleType::NomergeUnorderedSteal:
136 case OMPScheduleType::NomergeOrderedStaticChunked:
137 case OMPScheduleType::NomergeOrderedStatic:
138 case OMPScheduleType::NomergeOrderedDynamicChunked:
139 case OMPScheduleType::NomergeOrderedGuidedChunked:
140 case OMPScheduleType::NomergeOrderedRuntime:
141 case OMPScheduleType::NomergeOrderedAuto:
142 case OMPScheduleType::NomergeOrderedTrapezoidal:
143 case OMPScheduleType::OrderedDistributeChunked:
144 case OMPScheduleType::OrderedDistribute:
145 break;
146 default:
147 return false;
148 }
149
150 // Must not set both monotonicity modifiers at the same time.
151 OMPScheduleType MonotonicityFlags =
152 SchedType & OMPScheduleType::MonotonicityMask;
153 if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
154 return false;
155
156 return true;
157}
158#endif
159
/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
/// debug location to the last instruction in the specified basic block if the
/// insert point points to the end of the block.
  // Move the builder to the requested position first.
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  // When appending at the end of a non-empty block, inherit the stable debug
  // location of the block's final instruction so newly created IR gets a
  // sensible location.
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}
171
172static bool hasGridValue(const Triple &T) {
173 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
174}
175
176static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
177 if (T.isAMDGPU()) {
178 StringRef Features =
179 Kernel->getFnAttribute("target-features").getValueAsString();
180 if (Features.count("+wavefrontsize64"))
183 }
184 if (T.isNVPTX())
186 if (T.isSPIRV())
188 llvm_unreachable("No grid value available for this architecture!");
189}
190
191/// Determine which scheduling algorithm to use, determined from schedule clause
192/// arguments.
193static OMPScheduleType
194getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
195 bool HasSimdModifier, bool HasDistScheduleChunks) {
196 // Currently, the default schedule it static.
197 switch (ClauseKind) {
198 case OMP_SCHEDULE_Default:
199 case OMP_SCHEDULE_Static:
200 return HasChunks ? OMPScheduleType::BaseStaticChunked
201 : OMPScheduleType::BaseStatic;
202 case OMP_SCHEDULE_Dynamic:
203 return OMPScheduleType::BaseDynamicChunked;
204 case OMP_SCHEDULE_Guided:
205 return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
206 : OMPScheduleType::BaseGuidedChunked;
207 case OMP_SCHEDULE_Auto:
209 case OMP_SCHEDULE_Runtime:
210 return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
211 : OMPScheduleType::BaseRuntime;
212 case OMP_SCHEDULE_Distribute:
213 return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
214 : OMPScheduleType::BaseDistribute;
215 }
216 llvm_unreachable("unhandled schedule clause argument");
217}
218
219/// Adds ordering modifier flags to schedule type.
220static OMPScheduleType
222 bool HasOrderedClause) {
223 assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
224 OMPScheduleType::None &&
225 "Must not have ordering nor monotonicity flags already set");
226
227 OMPScheduleType OrderingModifier = HasOrderedClause
228 ? OMPScheduleType::ModifierOrdered
229 : OMPScheduleType::ModifierUnordered;
230 OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;
231
232 // Unsupported combinations
233 if (OrderingScheduleType ==
234 (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
235 return OMPScheduleType::OrderedGuidedChunked;
236 else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
237 OMPScheduleType::ModifierOrdered))
238 return OMPScheduleType::OrderedRuntime;
239
240 return OrderingScheduleType;
241}
242
243/// Adds monotonicity modifier flags to schedule type.
244static OMPScheduleType
246 bool HasSimdModifier, bool HasMonotonic,
247 bool HasNonmonotonic, bool HasOrderedClause) {
248 assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
249 OMPScheduleType::None &&
250 "Must not have monotonicity flags already set");
251 assert((!HasMonotonic || !HasNonmonotonic) &&
252 "Monotonic and Nonmonotonic are contradicting each other");
253
254 if (HasMonotonic) {
255 return ScheduleType | OMPScheduleType::ModifierMonotonic;
256 } else if (HasNonmonotonic) {
257 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
258 } else {
259 // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
260 // If the static schedule kind is specified or if the ordered clause is
261 // specified, and if the nonmonotonic modifier is not specified, the
262 // effect is as if the monotonic modifier is specified. Otherwise, unless
263 // the monotonic modifier is specified, the effect is as if the
264 // nonmonotonic modifier is specified.
265 OMPScheduleType BaseScheduleType =
266 ScheduleType & ~OMPScheduleType::ModifierMask;
267 if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
268 (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
269 HasOrderedClause) {
270 // The monotonic is used by default in openmp runtime library, so no need
271 // to set it.
272 return ScheduleType;
273 } else {
274 return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
275 }
276 }
277}
278
279/// Determine the schedule type using schedule and ordering clause arguments.
280static OMPScheduleType
281computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
282 bool HasSimdModifier, bool HasMonotonicModifier,
283 bool HasNonmonotonicModifier, bool HasOrderedClause,
284 bool HasDistScheduleChunks) {
286 ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
287 OMPScheduleType OrderedSchedule =
288 getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
290 OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
291 HasNonmonotonicModifier, HasOrderedClause);
292
294 return Result;
295}
296
297/// Make \p Source branch to \p Target.
298///
299/// Handles two situations:
300/// * \p Source already has an unconditional branch.
301/// * \p Source is a degenerate block (no terminator because the BB is
302/// the current head of the IR construction).
304 if (Instruction *Term = Source->getTerminatorOrNull()) {
305 auto *Br = cast<UncondBrInst>(Term);
306 BasicBlock *Succ = Br->getSuccessor();
307 Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
308 Br->setSuccessor(Target);
309 return;
310 }
311
312 auto *NewBr = UncondBrInst::Create(Target, Source);
313 NewBr->setDebugLoc(DL);
314}
315
317 bool CreateBranch, DebugLoc DL) {
318 assert(New->getFirstInsertionPt() == New->begin() &&
319 "Target BB must not have PHI nodes");
320
321 // Move instructions to new block.
322 BasicBlock *Old = IP.getBlock();
323 // If the `Old` block is empty then there are no instructions to move. But in
324 // the new debug scheme, it could have trailing debug records which will be
325 // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
326 // reasons:
327 // 1. If `New` is also empty, `BasicBlock::splice` crashes.
328 // 2. Even if `New` is not empty, the rationale to move those records to `New`
329 // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
330 // assumes that `Old` is optimized out and is going away. This is not the case
331 // here. The `Old` block is still being used e.g. a branch instruction is
332 // added to it later in this function.
333 // So we call `BasicBlock::splice` only when `Old` is not empty.
334 if (!Old->empty())
335 New->splice(New->begin(), Old, IP.getPoint(), Old->end());
336
337 if (CreateBranch) {
338 auto *NewBr = UncondBrInst::Create(New, Old);
339 NewBr->setDebugLoc(DL);
340 }
341}
342
343void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
344 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
345 BasicBlock *Old = Builder.GetInsertBlock();
346
347 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
348 if (CreateBranch)
349 Builder.SetInsertPoint(Old->getTerminator());
350 else
351 Builder.SetInsertPoint(Old);
352
353 // SetInsertPoint also updates the Builder's debug location, but we want to
354 // keep the one the Builder was configured to use.
355 Builder.SetCurrentDebugLocation(DebugLoc);
356}
357
359 DebugLoc DL, llvm::Twine Name) {
360 BasicBlock *Old = IP.getBlock();
362 Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
363 Old->getParent(), Old->getNextNode());
364 spliceBB(IP, New, CreateBranch, DL);
365 New->replaceSuccessorsPhiUsesWith(Old, New);
366 return New;
367}
368
369BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
370 llvm::Twine Name) {
371 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
372 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
373 if (CreateBranch)
374 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
375 else
376 Builder.SetInsertPoint(Builder.GetInsertBlock());
377 // SetInsertPoint also updates the Builder's debug location, but we want to
378 // keep the one the Builder was configured to use.
379 Builder.SetCurrentDebugLocation(DebugLoc);
380 return New;
381}
382
383BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
384 llvm::Twine Name) {
385 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
386 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
387 if (CreateBranch)
388 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
389 else
390 Builder.SetInsertPoint(Builder.GetInsertBlock());
391 // SetInsertPoint also updates the Builder's debug location, but we want to
392 // keep the one the Builder was configured to use.
393 Builder.SetCurrentDebugLocation(DebugLoc);
394 return New;
395}
396
398 llvm::Twine Suffix) {
399 BasicBlock *Old = Builder.GetInsertBlock();
400 return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
401}
402
403// This function creates a fake integer value and a fake use for the integer
404// value. It returns the fake value created. This is useful in modeling the
405// extra arguments to the outlined functions.
407 OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
409 OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
410 const Twine &Name = "", bool AsPtr = true,
411 bool Is64Bit = false) {
412 Builder.restoreIP(OuterAllocaIP);
413 IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
414 Instruction *FakeVal;
415 AllocaInst *FakeValAddr =
416 Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
417 ToBeDeleted.push_back(FakeValAddr);
418
419 if (AsPtr) {
420 FakeVal = FakeValAddr;
421 } else {
422 FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
423 ToBeDeleted.push_back(FakeVal);
424 }
425
426 // Generate a fake use of this value
427 Builder.restoreIP(InnerAllocaIP);
428 Instruction *UseFakeVal;
429 if (AsPtr) {
430 UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
431 } else {
432 UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
433 FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
434 }
435 ToBeDeleted.push_back(UseFakeVal);
436 return FakeVal;
437}
438
439//===----------------------------------------------------------------------===//
440// OpenMPIRBuilderConfig
441//===----------------------------------------------------------------------===//
442
namespace {
/// Values for bit flags for marking which requires clauses have been used.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  // Enables bitwise operators (|, &, ~) on this enum type.
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
463
465 : RequiresFlags(OMP_REQ_UNDEFINED) {}
466
469 bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
470 bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
473 RequiresFlags(OMP_REQ_UNDEFINED) {
474 if (HasRequiresReverseOffload)
475 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
476 if (HasRequiresUnifiedAddress)
477 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
478 if (HasRequiresUnifiedSharedMemory)
479 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
480 if (HasRequiresDynamicAllocators)
481 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
482}
483
485 return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
486}
487
489 return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
490}
491
493 return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
494}
495
497 return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
498}
499
501 return hasRequiresFlags() ? RequiresFlags
502 : static_cast<int64_t>(OMP_REQ_NONE);
503}
504
506 if (Value)
507 RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
508 else
509 RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
510}
511
513 if (Value)
514 RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
515 else
516 RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
517}
518
520 if (Value)
521 RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
522 else
523 RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
524}
525
527 if (Value)
528 RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
529 else
530 RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
531}
532
533//===----------------------------------------------------------------------===//
534// OpenMPIRBuilder
535//===----------------------------------------------------------------------===//
536
539 SmallVector<Value *> &ArgsVector) {
541 Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
542 auto Int32Ty = Type::getInt32Ty(Builder.getContext());
543 constexpr size_t MaxDim = 3;
544 Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));
545
546 Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);
547
548 Value *DynCGroupMemFallbackFlag =
549 Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
550 DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
551 Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
552
553 assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
554
555 Value *NumTeams3D =
556 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
557 Value *NumThreads3D =
558 Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
559 for (unsigned I :
560 seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
561 NumTeams3D =
562 Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
563 for (unsigned I :
564 seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
565 NumThreads3D =
566 Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});
567
568 ArgsVector = {Version,
569 PointerNum,
570 KernelArgs.RTArgs.BasePointersArray,
571 KernelArgs.RTArgs.PointersArray,
572 KernelArgs.RTArgs.SizesArray,
573 KernelArgs.RTArgs.MapTypesArray,
574 KernelArgs.RTArgs.MapNamesArray,
575 KernelArgs.RTArgs.MappersArray,
576 KernelArgs.NumIterations,
577 Flags,
578 NumTeams3D,
579 NumThreads3D,
580 KernelArgs.DynCGroupMem};
581}
582
584 LLVMContext &Ctx = Fn.getContext();
585
586 // Get the function's current attributes.
587 auto Attrs = Fn.getAttributes();
588 auto FnAttrs = Attrs.getFnAttrs();
589 auto RetAttrs = Attrs.getRetAttrs();
591 for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
592 ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));
593
594 // Add AS to FnAS while taking special care with integer extensions.
595 auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
596 bool Param = true) -> void {
597 bool HasSignExt = AS.hasAttribute(Attribute::SExt);
598 bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
599 if (HasSignExt || HasZeroExt) {
600 assert(AS.getNumAttributes() == 1 &&
601 "Currently not handling extension attr combined with others.");
602 if (Param) {
603 if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
604 FnAS = FnAS.addAttribute(Ctx, AK);
605 } else if (auto AK =
606 TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
607 FnAS = FnAS.addAttribute(Ctx, AK);
608 } else {
609 FnAS = FnAS.addAttributes(Ctx, AS);
610 }
611 };
612
613#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
614#include "llvm/Frontend/OpenMP/OMPKinds.def"
615
616 // Add attributes to the function declaration.
617 switch (FnID) {
618#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets) \
619 case Enum: \
620 FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet); \
621 addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false); \
622 for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo) \
623 addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]); \
624 Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs)); \
625 break;
626#include "llvm/Frontend/OpenMP/OMPKinds.def"
627 default:
628 // Attributes are optional.
629 break;
630 }
631}
632
635 FunctionType *FnTy = nullptr;
636 Function *Fn = nullptr;
637
638 // Try to find the declation in the module first.
639 switch (FnID) {
640#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
641 case Enum: \
642 FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
643 IsVarArg); \
644 Fn = M.getFunction(Str); \
645 break;
646#include "llvm/Frontend/OpenMP/OMPKinds.def"
647 }
648
649 if (!Fn) {
650 // Create a new declaration if we need one.
651 switch (FnID) {
652#define OMP_RTL(Enum, Str, ...) \
653 case Enum: \
654 Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
655 break;
656#include "llvm/Frontend/OpenMP/OMPKinds.def"
657 }
658 Fn->setCallingConv(Config.getRuntimeCC());
659 // Add information if the runtime function takes a callback function
660 if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
661 if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
662 LLVMContext &Ctx = Fn->getContext();
663 MDBuilder MDB(Ctx);
664 // Annotate the callback behavior of the runtime function:
665 // - The callback callee is argument number 2 (microtask).
666 // - The first two arguments of the callback callee are unknown (-1).
667 // - All variadic arguments to the runtime function are passed to the
668 // callback callee.
669 Fn->addMetadata(
670 LLVMContext::MD_callback,
672 2, {-1, -1}, /* VarArgsArePassed */ true)}));
673 }
674 }
675
676 LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
677 << " with type " << *Fn->getFunctionType() << "\n");
678 addAttributes(FnID, *Fn);
679
680 } else {
681 LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
682 << " with type " << *Fn->getFunctionType() << "\n");
683 }
684
685 assert(Fn && "Failed to create OpenMP runtime function");
686
687 return {FnTy, Fn};
688}
689
  // Lazily materialize the finalization block: if FiniBB has not been created
  // yet, create a ".fini" block in the current function and let FiniCB
  // populate it. Returns the (possibly pre-existing) finalization block.
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}

                              BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  // Keep FiniBB's terminator (if any) out of the moved range.
  auto EndIt = FiniBB->end();
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  // FiniBB is now redundant: forward its uses and delete it.
  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}
730
733 auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
734 assert(Fn && "Failed to create OpenMP runtime function pointer");
735 return Fn;
736}
737
740 StringRef Name) {
741 CallInst *Call = Builder.CreateCall(Callee, Args, Name);
742 Call->setCallingConv(Config.getRuntimeCC());
743 return Call;
744}
745
746void OpenMPIRBuilder::initialize() { initializeTypes(M); }
747
  // Walk every non-entry block of the function and move qualifying allocas to
  // the entry block's first insertion point, so later passes see them in the
  // canonical position.
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      // Advance the iterator before moving the alloca, since the move
      // invalidates the current position.
        Inst++;
        continue;
      AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}

  // Collect the allocas in `Block` that qualify for hoisting, then move them
  // in front of the entry block's terminator.
  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
    // TODO: For now, we support simple static allocations, we might need to
    // move non-static ones as well. However, this will need further analysis to
    // move the length arguments as well.
  };

  for (llvm::Instruction &Inst : Block)
      if (ShouldHoistAlloca(*AllocaInst))
        AllocasToMove.push_back(AllocaInst);

  auto InsertPoint =
      Block.getParent()->getEntryBlock().getTerminator()->getIterator();

  for (llvm::Instruction *AllocaInst : AllocasToMove)
}

  // Visit blocks that properly post-dominate the entry block (per the
  // function-level post-dominator tree).
  PostDominatorTree PostDomTree(*Func);
  for (llvm::BasicBlock &BB : *Func)
    if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
}
799
  // Drain the recorded OutlineInfos: extract each parallel region into its own
  // function, then emit offload metadata and module-level bookkeeping.
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate
    // struct for aggregate params in the device default alloca address space.
    // OpenMP runtime requires that the params of the extracted functions are
    // passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before     outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn =
        Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After      outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "   Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compability with the clang CG we move the outlined function after the
    // one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined region
      // and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that terminator may have
          if (Instruction *TI = OI.EntryBB->getTerminatorOrNull())
            TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);

    if (OI.FixUpNonEntryAllocas)
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embeds user written code into
  // the target region which may inject allocas which need to
  // be moved to the entry block of our target or risk malformed
  // optimisations by later passes, this is only relevant for
  // the device pass which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on
  // that here, it's up to the inserter to the list to do so).
  // This notbaly has to occur after the OutlinedInfo candidates
  // have been extracted so we have an end product that will not
  // be implicitly adversely affected by any raises unless
  // intentionally appended to the list.
  // NOTE: This only does so for ConstantData, it could be extended
  // to ConstantExpr's with further effort, however, they should
  // largely be folded when they get here. Extending it to runtime
  // defined/read+writeable allocation sizes would be non-trivial
  // (need to factor in movement of any stores to variables the
  // allocation size depends on, as well as the usual loads,
  // otherwise it'll yield the wrong result after movement) and
  // likely be more suitable as an LLVM optimisation pass.

  // Report any problem found while emitting offload entries/metadata.
  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  // Record that finalization ran (queried via isFinalized()).
  IsFinalized = true;
}
953
954bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
955
// Destructor body (the signature line was dropped by the page extraction —
// NOTE(review): presumably OpenMPIRBuilder::~OpenMPIRBuilder(); confirm
// against upstream). Invariant: finalize() must have consumed every pending
// outline work item before the builder is destroyed.
957 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
958}
959
// Creates and returns a constant i32 global named `Name`, initialized to
// `Value`, with weak_odr linkage and hidden visibility. (The defining
// signature line was dropped by the extraction — NOTE(review): presumably
// OpenMPIRBuilder::createGlobalFlag; confirm against upstream.)
961 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
962 auto *GV =
963 new GlobalVariable(M, I32Ty,
964 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
965 ConstantInt::get(I32Ty, Value), Name);
// Hidden visibility: the flag is module-internal and not exported.
966 GV->setVisibility(GlobalValue::HiddenVisibility);
967
968 return GV;
969}
970
// Emits an appending-linkage global array (e.g. "llvm.used" /
// "llvm.compiler.used") in section "llvm.metadata" holding pointer casts of
// the values in `List`; a no-op for an empty list. NOTE(review): the
// signature line and two interior lines (the local array declaration and the
// per-element cast assignment) were dropped by the extraction — confirm
// against upstream before relying on exact element construction.
972 if (List.empty())
973 return;
974
975 // Convert List to what ConstantArray needs.
977 UsedArray.resize(List.size());
978 for (unsigned I = 0, E = List.size(); I != E; ++I)
980 cast<Constant>(&*List[I]), Builder.getPtrTy());
981
982 if (UsedArray.empty())
983 return;
984 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
985
// Appending linkage lets the linker concatenate arrays from multiple TUs.
986 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
987 ConstantArray::get(ATy, UsedArray), Name);
988
989 GV->setSection("llvm.metadata");
990}
991
// Creates the "<KernelName>_exec_mode" constant i8 global recording the
// kernel's OpenMP target execution mode. weak_any linkage with protected
// visibility. (The first signature line was dropped by the extraction —
// NOTE(review): confirm the function name against upstream.)
994 OMPTgtExecModeFlags Mode) {
995 auto *Int8Ty = Builder.getInt8Ty();
996 auto *GVMode = new GlobalVariable(
997 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
998 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
999 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1000 return GVMode;
1001}
1002
// Returns a pointer (cast to IdentPtr) to a unique ident_t struct constant
// for the given source-location string + flags. Results are cached in
// IdentMap and additionally deduplicated against pre-existing module globals
// with an identical initializer. (Signature line and an interior comparison
// line were dropped by the extraction — NOTE(review): confirm upstream.)
1004 uint32_t SrcLocStrSize,
1005 IdentFlag LocFlags,
1006 unsigned Reserve2Flags) {
1007 // Enable "C-mode".
1008 LocFlags |= OMP_IDENT_FLAG_KMPC;
1009
// Cache key combines the string with the flag words so distinct flag
// combinations get distinct ident_t globals.
1010 Constant *&Ident =
1011 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1012 if (!Ident) {
1013 Constant *I32Null = ConstantInt::getNullValue(Int32);
1014 Constant *IdentData[] = {I32Null,
1015 ConstantInt::get(Int32, uint32_t(LocFlags)),
1016 ConstantInt::get(Int32, Reserve2Flags),
1017 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1018
// If the struct field's pointer address space differs from the string's,
// insert an explicit addrspacecast so the initializer type-checks.
1019 size_t SrcLocStrArgIdx = 4;
1020 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1022 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1023 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1024 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1025 Constant *Initializer =
1026 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1027
1028 // Look for existing encoding of the location + flags, not needed but
1029 // minimizes the difference to the existing solution while we transition.
1030 for (GlobalVariable &GV : M.globals())
1031 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1032 if (GV.getInitializer() == Initializer)
1033 Ident = &GV;
1034
1035 if (!Ident) {
1036 auto *GV = new GlobalVariable(
1037 M, OpenMPIRBuilder::Ident,
1038 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1040 M.getDataLayout().getDefaultGlobalsAddressSpace());
1041 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1042 GV->setAlignment(Align(8));
1043 Ident = GV;
1044 }
1045 }
1046
1047 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1048}
1049
// Returns a cached constant pointer to a global string holding `LocStr`,
// reusing any existing constant module global with an identical initializer.
// Also reports the string length through `SrcLocStrSize`. (Signature line
// dropped by the extraction — NOTE(review): confirm upstream.)
1051 uint32_t &SrcLocStrSize) {
1052 SrcLocStrSize = LocStr.size();
1053 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1054 if (!SrcLocStr) {
1055 Constant *Initializer =
1056 ConstantDataArray::getString(M.getContext(), LocStr);
1057
1058 // Look for existing encoding of the location, not needed but minimizes the
1059 // difference to the existing solution while we transition.
1060 for (GlobalVariable &GV : M.globals())
1061 if (GV.isConstant() && GV.hasInitializer() &&
1062 GV.getInitializer() == Initializer)
1063 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1064
1065 SrcLocStr = Builder.CreateGlobalString(
1066 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1067 &M);
1068 }
1069 return SrcLocStr;
1070}
1071
// Builds the canonical ";FileName;FunctionName;Line;Column;;" source-location
// string and interns it via getOrCreateSrcLocStr(StringRef, ...). (First
// signature line dropped by the extraction.)
1073 StringRef FileName,
1074 unsigned Line, unsigned Column,
1075 uint32_t &SrcLocStrSize) {
1076 SmallString<128> Buffer;
1077 Buffer.push_back(';');
1078 Buffer.append(FileName);
1079 Buffer.push_back(';');
1080 Buffer.append(FunctionName);
1081 Buffer.push_back(';');
1082 Buffer.append(std::to_string(Line));
1083 Buffer.push_back(';');
1084 Buffer.append(std::to_string(Column));
1085 Buffer.push_back(';');
1086 Buffer.push_back(';');
1087 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1088}
1089
// Returns the interned placeholder location string used when no real source
// location is available. (Signature line partially dropped by the
// extraction.)
1090 Constant *
1092 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1093 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1094}
1095
// Derives function name, file name, line and column from a DILocation and
// interns the resulting location string; falls back to the default string
// when the DebugLoc carries no DILocation. (Signature line dropped by the
// extraction.)
1097 uint32_t &SrcLocStrSize,
1098 Function *F) {
1099 DILocation *DIL = DL.get();
1100 if (!DIL)
1101 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
// Prefer the DIFile source if present; otherwise fall back to the module
// name as the "file" component.
1102 StringRef FileName = M.getName();
1103 if (DIFile *DIF = DIL->getFile())
1104 if (std::optional<StringRef> Source = DIF->getSource())
1105 FileName = *Source;
1106 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1107 if (Function.empty() && F)
1108 Function = F->getName();
1109 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1110 DIL->getColumn(), SrcLocStrSize);
1111}
1112
// Convenience overload: interns the location string for a LocationDescription
// using its DebugLoc and the function containing its insertion point.
// (Signature line dropped by the extraction.)
1114 uint32_t &SrcLocStrSize) {
1115 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1116 Loc.IP.getBlock()->getParent());
1117}
1118
// Tail of getOrCreateThreadID: emits a call to __kmpc_global_thread_num(ident)
// named "omp_global_thread_num". NOTE(review): the signature line and the
// line issuing the call were dropped by the extraction — confirm upstream.
1121 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1122 "omp_global_thread_num");
1123}
1124
// createBarrier: emits either __kmpc_barrier or, inside a cancellable
// parallel region, __kmpc_cancel_barrier, selecting directive-specific ident
// flags from `Kind`. Optionally follows up with the shared cancellation
// check. (The signature line and the line binding the call result were
// dropped by the extraction — NOTE(review): confirm upstream.)
1127 bool ForceSimpleCall, bool CheckCancelFlag) {
1128 if (!updateToLocation(Loc))
1129 return Loc.IP;
1130
1131 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1132 // __kmpc_barrier(loc, thread_id);
1133
// Encode which construct the (implicit or explicit) barrier belongs to in
// the ident flags so the runtime can attribute it.
1134 IdentFlag BarrierLocFlags;
1135 switch (Kind) {
1136 case OMPD_for:
1137 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1138 break;
1139 case OMPD_sections:
1140 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1141 break;
1142 case OMPD_single:
1143 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1144 break;
1145 case OMPD_barrier:
1146 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1147 break;
1148 default:
1149 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1150 break;
1151 }
1152
1153 uint32_t SrcLocStrSize;
1154 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1155 Value *Args[] = {
1156 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1157 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1158
1159 // If we are in a cancellable parallel region, barriers are cancellation
1160 // points.
1161 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1162 bool UseCancelBarrier =
1163 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1164
1166 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1167 ? OMPRTL___kmpc_cancel_barrier
1168 : OMPRTL___kmpc_barrier),
1169 Args);
1170
// The cancel-barrier's return value feeds the shared cancellation check.
1171 if (UseCancelBarrier && CheckCancelFlag)
1172 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1173 return Err;
1174
1175 return Builder.saveIP();
1176}
1177
// createCancel: emits a __kmpc_cancel runtime call for `CanceledDirective`,
// optionally guarded by `IfCondition`. Even the false branch of the guard is
// treated as a cancellation point. A temporary unreachable terminator keeps
// the blocks well-formed during splitting and is removed at the end.
// (Signature line and two call-binding lines dropped by the extraction —
// NOTE(review): confirm upstream.)
1180 Value *IfCondition,
1181 omp::Directive CanceledDirective) {
1182 if (!updateToLocation(Loc))
1183 return Loc.IP;
1184
1185 // LLVM utilities like blocks with terminators.
1186 auto *UI = Builder.CreateUnreachable();
1187
1188 Instruction *ThenTI = UI, *ElseTI = nullptr;
1189 if (IfCondition) {
1190 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1191
1192 // Even if the if condition evaluates to false, this should count as a
1193 // cancellation point
1194 Builder.SetInsertPoint(ElseTI);
1195 auto ElseIP = Builder.saveIP();
1196
1198 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1199 if (!IPOrErr)
1200 return IPOrErr;
1201 }
1202
1203 Builder.SetInsertPoint(ThenTI);
1204
// Map the directive to the runtime's cancel-kind constant via the
// OMP_CANCEL_KIND table in OMPKinds.def.
1205 Value *CancelKind = nullptr;
1206 switch (CanceledDirective) {
1207#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1208 case DirectiveEnum: \
1209 CancelKind = Builder.getInt32(Value); \
1210 break;
1211#include "llvm/Frontend/OpenMP/OMPKinds.def"
1212 default:
1213 llvm_unreachable("Unknown cancel kind!");
1214 }
1215
1216 uint32_t SrcLocStrSize;
1217 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1218 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1219 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1221 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1222
1223 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1224 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1225 return Err;
1226
1227 // Update the insertion point and remove the terminator we introduced.
1228 Builder.SetInsertPoint(UI->getParent());
1229 UI->eraseFromParent();
1230
1231 return Builder.saveIP();
1232}
1233
// createCancellationPoint: emits __kmpc_cancellationpoint for the given
// directive and routes its result through the shared cancellation check.
// Mirrors createCancel but without an if-condition guard. (Signature line and
// the call-binding line dropped by the extraction — NOTE(review): confirm
// upstream.)
1236 omp::Directive CanceledDirective) {
1237 if (!updateToLocation(Loc))
1238 return Loc.IP;
1239
1240 // LLVM utilities like blocks with terminators.
1241 auto *UI = Builder.CreateUnreachable();
1242 Builder.SetInsertPoint(UI);
1243
// Map the directive to the runtime's cancel-kind constant via OMPKinds.def.
1244 Value *CancelKind = nullptr;
1245 switch (CanceledDirective) {
1246#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1247 case DirectiveEnum: \
1248 CancelKind = Builder.getInt32(Value); \
1249 break;
1250#include "llvm/Frontend/OpenMP/OMPKinds.def"
1251 default:
1252 llvm_unreachable("Unknown cancel kind!");
1253 }
1254
1255 uint32_t SrcLocStrSize;
1256 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1257 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1258 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1260 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1261
1262 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1263 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1264 return Err;
1265
1266 // Update the insertion point and remove the terminator we introduced.
1267 Builder.SetInsertPoint(UI->getParent());
1268 UI->eraseFromParent();
1269
1270 return Builder.saveIP();
1271}
1272
// emitTargetKernel: allocates a kernel_args struct at `AllocaIP`, stores each
// kernel argument into it with its preferred alignment, and emits the
// __tgt_target_kernel runtime call. (The leading signature line and the line
// binding the call result to `Return` were dropped by the extraction —
// NOTE(review): confirm upstream.)
1274 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1275 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1276 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1277 if (!updateToLocation(Loc))
1278 return Loc.IP;
1279
// The argument struct must live in the alloca block, not at the call site.
1280 Builder.restoreIP(AllocaIP);
1281 auto *KernelArgsPtr =
1282 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1284
1285 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1286 llvm::Value *Arg =
1287 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1288 Builder.CreateAlignedStore(
1289 KernelArgs[I], Arg,
1290 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1291 }
1292
1293 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1294 NumThreads, HostPtr, KernelArgsPtr};
1295
1297 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1298 OffloadingArgs);
1299
1300 return Builder.saveIP();
1301}
1302
// emitKernelLaunch: emits the target-kernel launch and an error-checking
// diamond: on a non-zero runtime return, control branches to
// "omp_offload.failed" where EmitTargetCallFallbackCB emits the host
// fallback; both paths merge at "omp_offload.cont". (Signature line dropped
// by the extraction — NOTE(review): confirm upstream.)
1304 const LocationDescription &Loc, Value *OutlinedFnID,
1305 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1306 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1307
1308 if (!updateToLocation(Loc))
1309 return Loc.IP;
1310
1311 // On top of the arrays that were filled up, the target offloading call
1312 // takes as arguments the device id as well as the host pointer. The host
1313 // pointer is used by the runtime library to identify the current target
1314 // region, so it only has to be unique and not necessarily point to
1315 // anything. It could be the pointer to the outlined function that
1316 // implements the target region, but we aren't using that so that the
1317 // compiler doesn't need to keep that, and could therefore inline the host
1318 // function if proven worthwhile during optimization.
1319
1320 // From this point on, we need to have an ID of the target region defined.
1321 assert(OutlinedFnID && "Invalid outlined function ID!");
1322 (void)OutlinedFnID;
1323
1324 // Return value of the runtime offloading call.
1325 Value *Return = nullptr;
1326
1327 // Arguments for the target kernel.
1328 SmallVector<Value *> ArgsVector;
1329 getKernelArgsVector(Args, Builder, ArgsVector);
1330
1331 // The target region is an outlined function launched by the runtime
1332 // via calls to __tgt_target_kernel().
1333 //
1334 // Note that on the host and CPU targets, the runtime implementation of
1335 // these calls simply call the outlined function without forking threads.
1336 // The outlined functions themselves have runtime calls to
1337 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1338 // the compiler in emitTeamsCall() and emitParallelCall().
1339 //
1340 // In contrast, on the NVPTX target, the implementation of
1341 // __tgt_target_teams() launches a GPU kernel with the requested number
1342 // of teams and threads so no additional calls to the runtime are required.
1343 // Check the error code and execute the host version if required.
1344 Builder.restoreIP(emitTargetKernel(
1345 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1346 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1347
1348 BasicBlock *OffloadFailedBlock =
1349 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1350 BasicBlock *OffloadContBlock =
1351 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
// A non-null (non-zero) return code from the runtime signals launch failure.
1352 Value *Failed = Builder.CreateIsNotNull(Return);
1353 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1354
1355 auto CurFn = Builder.GetInsertBlock()->getParent();
1356 emitBlock(OffloadFailedBlock, CurFn);
1357 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1358 if (!AfterIP)
1359 return AfterIP.takeError();
1360 Builder.restoreIP(*AfterIP);
1361 emitBranch(OffloadContBlock);
1362 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1363 return Builder.saveIP();
1364}
1365
// emitCancelationCheckImpl: shared cancellation plumbing. Branches on
// `CancelFlag`: zero falls through to a ".cont" continuation block; non-zero
// jumps to a ".cncl" block that transfers to the current finalization
// region's finalization block. (Signature line and the SplitBlock follow-up
// line were dropped by the extraction — NOTE(review): confirm upstream.)
1367 Value *CancelFlag, omp::Directive CanceledDirective) {
1368 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1369 "Unexpected cancellation!");
1370
1371 // For a cancel barrier we create two new blocks.
1372 BasicBlock *BB = Builder.GetInsertBlock();
1373 BasicBlock *NonCancellationBlock;
1374 if (Builder.GetInsertPoint() == BB->end()) {
1375 // TODO: This branch will not be needed once we moved to the
1376 // OpenMPIRBuilder codegen completely.
1377 NonCancellationBlock = BasicBlock::Create(
1378 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1379 } else {
1380 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1382 Builder.SetInsertPoint(BB);
1383 }
1384 BasicBlock *CancellationBlock = BasicBlock::Create(
1385 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1386
1387 // Jump to them based on the return value.
1388 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1389 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1390 /* TODO weight */ nullptr, nullptr);
1391
1392 // From the cancellation block we finalize all variables and go to the
1393 // post finalization block that is known to the FiniCB callback.
1394 auto &FI = FinalizationStack.back();
1395 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1396 if (!FiniBBOrErr)
1397 return FiniBBOrErr.takeError();
1398 Builder.SetInsertPoint(CancellationBlock);
1399 Builder.CreateBr(*FiniBBOrErr);
1400
1401 // The continuation block is where code generation continues.
1402 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1403 return Error::success();
1404}
1405
1406// Callback used to create OpenMP runtime calls to support
1407// omp parallel clause for the device.
1408// We need to use this callback to replace call to the OutlinedFn in OuterFn
1409// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
// Post-outline callback for the device path: stores the captured variables
// into a stack array, calls __kmpc_parallel_60 with the outlined function,
// then deletes the direct call to the outlined function and all scaffolding
// instructions. (The `static void targetParallelCallback(` line was dropped
// by the extraction — NOTE(review): confirm upstream.)
1411 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1412 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1413 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1414 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1415 // Add some known attributes.
1416 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1417 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1418 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1419 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1420 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1421 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1422
1423 assert(OutlinedFn.arg_size() >= 2 &&
1424 "Expected at least tid and bounded tid as arguments");
1425 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1426
// The outliner guarantees exactly one call site; reuse its block/args.
1427 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1428 assert(CI && "Expected call instruction to outlined function");
1429 CI->getParent()->setName("omp_parallel");
1430
1431 Builder.SetInsertPoint(CI);
1432 Type *PtrTy = OMPIRBuilder->VoidPtr;
1433 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1434
1435 // Add alloca for kernel args
1436 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1437 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1438 AllocaInst *ArgsAlloca =
1439 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1440 Value *Args = ArgsAlloca;
1441 // Add address space cast if array for storing arguments is not allocated
1442 // in address space 0
1443 if (ArgsAlloca->getAddressSpace())
1444 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1445 Builder.restoreIP(CurrentIP);
1446
1447 // Store captured vars which are used by kmpc_parallel_60
1448 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1449 Value *V = *(CI->arg_begin() + 2 + Idx);
1450 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1451 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1452 Builder.CreateStore(V, StoreAddress);
1453 }
1454
// Missing if-condition means "always parallel" (constant 1).
1455 Value *Cond =
1456 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1457 : Builder.getInt32(1);
1458
1459 // Build kmpc_parallel_60 call
1460 Value *Parallel60CallArgs[] = {
1461 /* identifier*/ Ident,
1462 /* global thread num*/ ThreadID,
1463 /* if expression */ Cond,
1464 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1465 /* Proc bind */ Builder.getInt32(-1),
1466 /* outlined function */ &OutlinedFn,
1467 /* wrapper function */ NullPtrValue,
1468 /* arguments of the outlined funciton*/ Args,
1469 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1470 /* strict for number of threads */ Builder.getInt32(0)};
1471
1472 FunctionCallee RTLFn =
1473 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1474
1475 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1476
1477 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1478 << *Builder.GetInsertBlock()->getParent() << "\n");
1479
1480 // Initialize the local TID stack location with the argument value.
1481 Builder.SetInsertPoint(PrivTID);
1482 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1483 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1484 PrivTIDAddr);
1485
1486 // Remove redundant call to the outlined function.
1487 CI->eraseFromParent();
1488
1489 for (Instruction *I : ToBeDeleted) {
1490 I->eraseFromParent();
1491 }
1492}
1493
1494// Callback used to create OpenMP runtime calls to support
1495// omp parallel clause for the host.
1496// We need to use this callback to replace call to the OutlinedFn in OuterFn
1497// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
// Post-outline callback for the host path: forwards the captured variables
// as variadic arguments to __kmpc_fork_call (or __kmpc_fork_call_if when an
// if-clause is present), annotates the runtime function with !callback
// metadata, then deletes the direct call to the outlined function.
// (The second signature line was dropped by the extraction — NOTE(review):
// confirm upstream.)
1498static void
1500 Function *OuterFn, Value *Ident, Value *IfCondition,
1501 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1502 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1503 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1504 FunctionCallee RTLFn;
1505 if (IfCondition) {
1506 RTLFn =
1507 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1508 } else {
1509 RTLFn =
1510 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1511 }
1512 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1513 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1514 LLVMContext &Ctx = F->getContext();
1515 MDBuilder MDB(Ctx);
1516 // Annotate the callback behavior of the __kmpc_fork_call:
1517 // - The callback callee is argument number 2 (microtask).
1518 // - The first two arguments of the callback callee are unknown (-1).
1519 // - All variadic arguments to the __kmpc_fork_call are passed to the
1520 // callback callee.
1521 F->addMetadata(LLVMContext::MD_callback,
1523 2, {-1, -1},
1524 /* VarArgsArePassed */ true)}));
1525 }
1526 }
1527 // Add some known attributes.
1528 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1529 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1530 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1531
1532 assert(OutlinedFn.arg_size() >= 2 &&
1533 "Expected at least tid and bounded tid as arguments");
1534 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1535
// The outliner guarantees exactly one call site; reuse its block/args.
1536 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1537 CI->getParent()->setName("omp_parallel");
1538 Builder.SetInsertPoint(CI);
1539
1540 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1541 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1542 &OutlinedFn};
1543
1544 SmallVector<Value *, 16> RealArgs;
1545 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1546 if (IfCondition) {
1547 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1548 RealArgs.push_back(Cond);
1549 }
1550 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1551
1552 // __kmpc_fork_call_if always expects a void ptr as the last argument
1553 // If there are no arguments, pass a null pointer.
1554 auto PtrTy = OMPIRBuilder->VoidPtr;
1555 if (IfCondition && NumCapturedVars == 0) {
1556 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1557 RealArgs.push_back(NullPtrValue);
1558 }
1559
1560 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1561
1562 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1563 << *Builder.GetInsertBlock()->getParent() << "\n");
1564
1565 // Initialize the local TID stack location with the argument value.
1566 Builder.SetInsertPoint(PrivTID);
1567 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1568 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1569 PrivTIDAddr);
1570
1571 // Remove redundant call to the outlined function.
1572 CI->eraseFromParent();
1573
1574 for (Instruction *I : ToBeDeleted) {
1575 I->eraseFromParent();
1576 }
1577}
1578
1580 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1581 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1582 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1583 omp::ProcBindKind ProcBind, bool IsCancellable) {
1584 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1585
1586 if (!updateToLocation(Loc))
1587 return Loc.IP;
1588
1589 uint32_t SrcLocStrSize;
1590 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1591 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1592 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1593 (ProcBind != OMP_PROC_BIND_default);
1594 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1595 // If we generate code for the target device, we need to allocate
1596 // struct for aggregate params in the device default alloca address space.
1597 // OpenMP runtime requires that the params of the extracted functions are
1598 // passed as zero address space pointers. This flag ensures that extracted
1599 // function arguments are declared in zero address space
1600 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1601
1602 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1603 // only if we compile for host side.
1604 if (NumThreads && !Config.isTargetDevice()) {
1605 Value *Args[] = {
1606 Ident, ThreadID,
1607 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1609 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1610 }
1611
1612 if (ProcBind != OMP_PROC_BIND_default) {
1613 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1614 Value *Args[] = {
1615 Ident, ThreadID,
1616 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1618 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1619 }
1620
1621 BasicBlock *InsertBB = Builder.GetInsertBlock();
1622 Function *OuterFn = InsertBB->getParent();
1623
1624 // Save the outer alloca block because the insertion iterator may get
1625 // invalidated and we still need this later.
1626 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1627
1628 // Vector to remember instructions we used only during the modeling but which
1629 // we want to delete at the end.
1631
1632 // Change the location to the outer alloca insertion point to create and
1633 // initialize the allocas we pass into the parallel region.
1634 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1635 Builder.restoreIP(NewOuter);
1636 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1637 AllocaInst *ZeroAddrAlloca =
1638 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1639 Instruction *TIDAddr = TIDAddrAlloca;
1640 Instruction *ZeroAddr = ZeroAddrAlloca;
1641 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1642 // Add additional casts to enforce pointers in zero address space
1643 TIDAddr = new AddrSpaceCastInst(
1644 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1645 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1646 ToBeDeleted.push_back(TIDAddr);
1647 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1648 PointerType ::get(M.getContext(), 0),
1649 "zero.addr.ascast");
1650 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1651 ToBeDeleted.push_back(ZeroAddr);
1652 }
1653
1654 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1655 // associated arguments in the outlined function, so we delete them later.
1656 ToBeDeleted.push_back(TIDAddrAlloca);
1657 ToBeDeleted.push_back(ZeroAddrAlloca);
1658
1659 // Create an artificial insertion point that will also ensure the blocks we
1660 // are about to split are not degenerated.
1661 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1662
1663 BasicBlock *EntryBB = UI->getParent();
1664 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1665 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1666 BasicBlock *PRegPreFiniBB =
1667 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1668 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1669
1670 auto FiniCBWrapper = [&](InsertPointTy IP) {
1671 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1672 // target to the region exit block.
1673 if (IP.getBlock()->end() == IP.getPoint()) {
1675 Builder.restoreIP(IP);
1676 Instruction *I = Builder.CreateBr(PRegExitBB);
1677 IP = InsertPointTy(I->getParent(), I->getIterator());
1678 }
1679 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1680 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1681 "Unexpected insertion point for finalization call!");
1682 return FiniCB(IP);
1683 };
1684
1685 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1686
1687 // Generate the privatization allocas in the block that will become the entry
1688 // of the outlined function.
1689 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1690 InsertPointTy InnerAllocaIP = Builder.saveIP();
1691
1692 AllocaInst *PrivTIDAddr =
1693 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1694 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1695
1696 // Add some fake uses for OpenMP provided arguments.
1697 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1698 Instruction *ZeroAddrUse =
1699 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1700 ToBeDeleted.push_back(ZeroAddrUse);
1701
1702 // EntryBB
1703 // |
1704 // V
1705 // PRegionEntryBB <- Privatization allocas are placed here.
1706 // |
1707 // V
1708 // PRegionBodyBB <- BodeGen is invoked here.
1709 // |
1710 // V
1711 // PRegPreFiniBB <- The block we will start finalization from.
1712 // |
1713 // V
1714 // PRegionExitBB <- A common exit to simplify block collection.
1715 //
1716
1717 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1718
1719 // Let the caller create the body.
1720 assert(BodyGenCB && "Expected body generation callback!");
1721 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1722 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1723 return Err;
1724
1725 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1726
1727 OutlineInfo OI;
1728 if (Config.isTargetDevice()) {
1729 // Generate OpenMP target specific runtime call
1730 OI.PostOutlineCB = [=, ToBeDeletedVec =
1731 std::move(ToBeDeleted)](Function &OutlinedFn) {
1732 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1733 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1734 ThreadID, ToBeDeletedVec);
1735 };
1736 OI.FixUpNonEntryAllocas = true;
1737 } else {
1738 // Generate OpenMP host runtime call
1739 OI.PostOutlineCB = [=, ToBeDeletedVec =
1740 std::move(ToBeDeleted)](Function &OutlinedFn) {
1741 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1742 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1743 };
1744 OI.FixUpNonEntryAllocas = true;
1745 }
1746
1747 OI.OuterAllocaBB = OuterAllocaBlock;
1748 OI.EntryBB = PRegEntryBB;
1749 OI.ExitBB = PRegExitBB;
1750
1751 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1753 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1754
1755 CodeExtractorAnalysisCache CEAC(*OuterFn);
1756 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1757 /* AggregateArgs */ false,
1758 /* BlockFrequencyInfo */ nullptr,
1759 /* BranchProbabilityInfo */ nullptr,
1760 /* AssumptionCache */ nullptr,
1761 /* AllowVarArgs */ true,
1762 /* AllowAlloca */ true,
1763 /* AllocationBlock */ OuterAllocaBlock,
1764 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1765
1766 // Find inputs to, outputs from the code region.
1767 BasicBlock *CommonExit = nullptr;
1768 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1769 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1770
1771 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1772 /*CollectGlobalInputs=*/true);
1773
1774 Inputs.remove_if([&](Value *I) {
1776 return GV->getValueType() == OpenMPIRBuilder::Ident;
1777
1778 return false;
1779 });
1780
1781 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1782
1783 FunctionCallee TIDRTLFn =
1784 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1785
1786 auto PrivHelper = [&](Value &V) -> Error {
1787 if (&V == TIDAddr || &V == ZeroAddr) {
1789 return Error::success();
1790 }
1791
1793 for (Use &U : V.uses())
1794 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1795 if (ParallelRegionBlockSet.count(UserI->getParent()))
1796 Uses.insert(&U);
1797
1798 // __kmpc_fork_call expects extra arguments as pointers. If the input
1799 // already has a pointer type, everything is fine. Otherwise, store the
1800 // value onto stack and load it back inside the to-be-outlined region. This
1801 // will ensure only the pointer will be passed to the function.
1802 // FIXME: if there are more than 15 trailing arguments, they must be
1803 // additionally packed in a struct.
1804 Value *Inner = &V;
1805 if (!V.getType()->isPointerTy()) {
1807 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1808
1809 Builder.restoreIP(OuterAllocaIP);
1810 Value *Ptr =
1811 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1812
1813 // Store to stack at end of the block that currently branches to the entry
1814 // block of the to-be-outlined region.
1815 Builder.SetInsertPoint(InsertBB,
1816 InsertBB->getTerminator()->getIterator());
1817 Builder.CreateStore(&V, Ptr);
1818
1819 // Load back next to allocations in the to-be-outlined region.
1820 Builder.restoreIP(InnerAllocaIP);
1821 Inner = Builder.CreateLoad(V.getType(), Ptr);
1822 }
1823
1824 Value *ReplacementValue = nullptr;
1825 CallInst *CI = dyn_cast<CallInst>(&V);
1826 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1827 ReplacementValue = PrivTID;
1828 } else {
1829 InsertPointOrErrorTy AfterIP =
1830 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1831 if (!AfterIP)
1832 return AfterIP.takeError();
1833 Builder.restoreIP(*AfterIP);
1834 InnerAllocaIP = {
1835 InnerAllocaIP.getBlock(),
1836 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1837
1838 assert(ReplacementValue &&
1839 "Expected copy/create callback to set replacement value!");
1840 if (ReplacementValue == &V)
1841 return Error::success();
1842 }
1843
1844 for (Use *UPtr : Uses)
1845 UPtr->set(ReplacementValue);
1846
1847 return Error::success();
1848 };
1849
1850 // Reset the inner alloca insertion as it will be used for loading the values
1851 // wrapped into pointers before passing them into the to-be-outlined region.
1852 // Configure it to insert immediately after the fake use of zero address so
1853 // that they are available in the generated body and so that the
1854 // OpenMP-related values (thread ID and zero address pointers) remain leading
1855 // in the argument list.
1856 InnerAllocaIP = IRBuilder<>::InsertPoint(
1857 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1858
1859 // Reset the outer alloca insertion point to the entry of the relevant block
1860 // in case it was invalidated.
1861 OuterAllocaIP = IRBuilder<>::InsertPoint(
1862 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1863
1864 for (Value *Input : Inputs) {
1865 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1866 if (Error Err = PrivHelper(*Input))
1867 return Err;
1868 }
1869 LLVM_DEBUG({
1870 for (Value *Output : Outputs)
1871 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1872 });
1873 assert(Outputs.empty() &&
1874 "OpenMP outlining should not produce live-out values!");
1875
1876 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1877 LLVM_DEBUG({
1878 for (auto *BB : Blocks)
1879 dbgs() << " PBR: " << BB->getName() << "\n";
1880 });
1881
1882 // Adjust the finalization stack, verify the adjustment, and call the
1883 // finalize function a last time to finalize values between the pre-fini
1884 // block and the exit block if we left the parallel "the normal way".
1885 auto FiniInfo = FinalizationStack.pop_back_val();
1886 (void)FiniInfo;
1887 assert(FiniInfo.DK == OMPD_parallel &&
1888 "Unexpected finalization stack state!");
1889
1890 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1891
1892 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1893 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1894 if (!FiniBBOrErr)
1895 return FiniBBOrErr.takeError();
1896 {
1898 Builder.restoreIP(PreFiniIP);
1899 Builder.CreateBr(*FiniBBOrErr);
1900 // There's currently a branch to omp.par.exit. Delete it. We will get there
1901 // via the fini block
1902 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1903 Term->eraseFromParent();
1904 }
1905
1906 // Register the outlined info.
1907 addOutlineInfo(std::move(OI));
1908
1909 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1910 UI->eraseFromParent();
1911
1912 return AfterIP;
1913}
1914
// Body of OpenMPIRBuilder::emitFlush: emits the runtime call implementing an
// OpenMP 'flush', passing only the source-location ident.
// NOTE(review): the enclosing signature (orig line 1915) and the call line
// (orig line 1921, presumably a CreateCall of __kmpc_flush) are elided in
// this extraction of the file.
1916 // Build call void __kmpc_flush(ident_t *loc)
1917 uint32_t SrcLocStrSize;
1918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
// Single argument: the ident_t* describing the source location.
1919 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1920
1922 Args);
1923 }
1924
// Body of OpenMPIRBuilder::createFlush: public entry point for the 'omp
// flush' directive. Bails out when the insertion location is invalid,
// otherwise delegates to emitFlush.
// NOTE(review): the signature line (orig line 1925) is elided in this
// extraction of the file.
1926 if (!updateToLocation(Loc))
1927 return;
1928 emitFlush(Loc);
1929 }
1930
// Body of the taskwait emission helper: builds the
// __kmpc_omp_taskwait(loc, gtid) runtime call.
// NOTE(review): the signature line (orig line 1931) and the call line (orig
// line 1940) are elided in this extraction of the file.
1932 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1933 // global_tid);
1934 uint32_t SrcLocStrSize;
1935 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1936 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1937 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1938
1939 // Ignore return result until untied tasks are supported.
1941 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1942 }
1943
1949
// Body of the taskyield emission helper: builds
// __kmpc_omp_taskyield(loc, gtid, 0). The trailing i32 0 is the runtime's
// third parameter (presumably 'end_part' — verify against kmp.h).
// NOTE(review): the signature line (orig line 1950) and the call line (orig
// line 1958) are elided in this extraction of the file.
1951 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1952 uint32_t SrcLocStrSize;
1953 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1954 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1955 Constant *I32Null = ConstantInt::getNullValue(Int32);
1956 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1957
1959 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1960 }
1961
1967
// Populates one kmp_depend_info record (pointed to by Entry) for dependency
// Dep. Three fields are written via struct GEPs on the DependInfo type:
//   BaseAddr - the depended-on address, cast to a pointer-sized integer;
//   Len      - the store size of Dep.DepValueType in bytes;
//   Flags    - the dependence kind (in/out/inout, ...) as an i8.
// NOTE(review): the first signature line (orig line 1968, presumably
// OpenMPIRBuilder::emitTaskDependency(IRBuilderBase &Builder, Value *Entry,)
// is elided in this extraction of the file.
1969 const DependData &Dep) {
1970 // Store the pointer to the variable
1971 Value *Addr = Builder.CreateStructGEP(
1972 DependInfo, Entry,
1973 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
// The runtime expects the address as an integer, hence the ptrtoint.
1974 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, SizeTy);
1975 Builder.CreateStore(DepValPtr, Addr);
1976 // Store the size of the variable
1977 Value *Size = Builder.CreateStructGEP(
1978 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Len));
1979 Builder.CreateStore(
1980 ConstantInt::get(SizeTy,
1981 M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
1982 Size);
1983 // Store the dependency kind
1984 Value *Flags = Builder.CreateStructGEP(
1985 DependInfo, Entry, static_cast<unsigned int>(RTLDependInfoFields::Flags));
1986 Builder.CreateStore(ConstantInt::get(Builder.getInt8Ty(),
1987 static_cast<unsigned int>(Dep.DepKind)),
1988 Flags);
1989 }
1990
1991 // Processes the dependencies in Dependencies and does the following
1992 // - Allocates space on the stack of an array of DependInfo objects
1993 // - Populates each DependInfo object with relevant information of
1994 // the corresponding dependence.
1995 // - All code is inserted in the entry block of the current function.
// Returns the alloca holding the populated array, or nullptr when there are
// no dependencies.
// NOTE(review): the signature lines (orig lines 1996/1998, presumably
// 'static Value *emitTaskDependencies(' and the Dependencies parameter) are
// elided in this extraction of the file.
1997 OpenMPIRBuilder &OMPBuilder,
1999 // Early return if we have no dependencies to process
2000 if (Dependencies.empty())
2001 return nullptr;
2002
2003 // Given a vector of DependData objects, in this function we create an
2004 // array on the stack that holds kmp_depend_info objects corresponding
2005 // to each dependency. This is then passed to the OpenMP runtime.
2006 // For example, if there are 'n' dependencies then the following pseudo
2007 // code is generated. Assume the first dependence is on a variable 'a'
2008 //
2009 // \code{c}
2010 // DepArray = alloc(n x sizeof(kmp_depend_info));
2011 // idx = 0;
2012 // DepArray[idx].base_addr = ptrtoint(&a);
2013 // DepArray[idx].len = 8;
2014 // DepArray[idx].flags = Dep.DepKind; /*(See OMPConstants.h for DepKind)*/
2015 // ++idx;
2016 // DepArray[idx].base_addr = ...;
2017 // \endcode
2018
2019 IRBuilderBase &Builder = OMPBuilder.Builder;
2020 Type *DependInfo = OMPBuilder.DependInfo;
2021
2022 Value *DepArray = nullptr;
// Save the current insertion point; the alloca must go into the function's
// entry block, after which we return to where we were.
2023 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2024 Builder.SetInsertPoint(
// NOTE(review): the SetInsertPoint argument (orig line 2025, the entry-block
// insertion position) is elided in this extraction of the file.
2026
2027 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2028 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2029
2030 Builder.restoreIP(OldIP);
2031
// Fill in each array element with the dependence's address/size/kind.
2032 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2033 Value *Base =
2034 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2035 OMPBuilder.emitTaskDependency(Builder, Base, Dep);
2036 }
2037 return DepArray;
2038 }
2039
2040 /// Create the task duplication function passed to kmpc_taskloop.
/// \param PrivatesTy    Struct type appended after kmp_task_t that holds the
///                      task's private values; the context pointer lives at
///                      index \p PrivatesIndex within it.
/// \param PrivatesIndex Index of the context pointer inside \p PrivatesTy.
/// \param DupCB         Frontend callback that copies the context from the
///                      source task into the destination task; may be null.
/// \returns the generated internal function (or the no-callback fallback
///          pointer constant), or an error propagated from \p DupCB.
2041 Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
2042 Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
2043 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2044 if (!DupCB)
// NOTE(review): the return expression for the no-callback case (orig line
// 2045) is elided in this extraction; presumably a null constant of the
// pointer type below — verify against the upstream source.
2046 PointerType::get(Builder.getContext(), ProgramAddressSpace));
2047
2048 // From OpenMP Runtime p_task_dup_t:
2049 // Routine optionally generated by the compiler for setting the lastprivate
2050 // flag and calling needed constructors for private/firstprivate objects (used
2051 // to form taskloop tasks from pattern task) Parameters: dest task, src task,
2052 // lastprivate flag.
2053 // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
2054
2055 auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
2056
// void(ptr, ptr, i32) — matches p_task_dup_t above.
2057 FunctionType *DupFuncTy = FunctionType::get(
2058 Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
2059 /*isVarArg=*/false);
2060
2061 Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
2062 "omp_taskloop_dup", M);
2063 Value *DestTaskArg = DupFunction->getArg(0);
2064 Value *SrcTaskArg = DupFunction->getArg(1);
2065 Value *LastprivateFlagArg = DupFunction->getArg(2);
2066 DestTaskArg->setName("dest_task");
2067 SrcTaskArg->setName("src_task");
2068 LastprivateFlagArg->setName("lastprivate_flag");
2069
// The guard restores the caller's insertion point when this function returns.
2070 IRBuilderBase::InsertPointGuard Guard(Builder);
2071 Builder.SetInsertPoint(
2072 BasicBlock::Create(Builder.getContext(), "entry", DupFunction));
2073
// From a kmp_task_t* argument, compute the address of the context pointer:
// GEP into {Task, PrivatesTy} to reach the privates, then into PrivatesTy at
// PrivatesIndex.
2074 auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
2075 Type *TaskWithPrivatesTy =
2076 StructType::get(Builder.getContext(), {Task, PrivatesTy});
2077 Value *TaskPrivates = Builder.CreateGEP(
2078 TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
2079 Value *ContextPtr = Builder.CreateGEP(
2080 PrivatesTy, TaskPrivates,
2081 {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
2082 return ContextPtr;
2083 };
2084
2085 Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
2086 Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);
2087
2088 DestTaskContextPtr->setName("destPtr");
2089 SrcTaskContextPtr->setName("srcPtr");
2090
// Let the frontend callback generate the actual copy code; it returns the
// insertion point after the generated body (or an error we propagate).
2091 InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
2092 DupFunction->getEntryBlock().begin());
2093 InsertPointTy CodeGenIP = Builder.saveIP();
2094 Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
2095 DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
2096 if (!AfterIPOrError)
2097 return AfterIPOrError.takeError();
2098 Builder.restoreIP(*AfterIPOrError);
2099
2100 Builder.CreateRetVoid();
2101
2102 return DupFunction;
2103 }
2104
/// Emits an OpenMP 'taskloop' construct. The region between the
/// taskloop.alloca and taskloop.exit blocks is outlined into a task entry
/// function; a task is allocated with __kmpc_omp_task_alloc, the iteration
/// bounds are stored into the task's shareds area, and __kmpc_taskloop is
/// invoked — wrapped in __kmpc_taskgroup/__kmpc_end_taskgroup unless
/// NoGroup is set. Placeholder ("fake") lb/ub/step values are forced to be
/// the first three fields of the outlined aggregate so the runtime and the
/// duplication function can locate them at fixed offsets.
2105 OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
2106 const LocationDescription &Loc, InsertPointTy AllocaIP,
2107 BodyGenCallbackTy BodyGenCB,
2108 llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
2109 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
2110 Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
2111 Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
2112 Value *TaskContextStructPtrVal) {
2113
2114 if (!updateToLocation(Loc))
2115 return InsertPointTy();
2116
2117 uint32_t SrcLocStrSize;
2118 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2119 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2120
// Split the current block into alloca/body/exit so the region in between
// can be outlined by CodeExtractor later.
2121 BasicBlock *TaskloopExitBB =
2122 splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
2123 BasicBlock *TaskloopBodyBB =
2124 splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
2125 BasicBlock *TaskloopAllocaBB =
2126 splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");
2127
2128 InsertPointTy TaskloopAllocaIP =
2129 InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2130 InsertPointTy TaskloopBodyIP =
2131 InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());
2132
// Let the frontend generate the loop body inside the region.
2133 if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
2134 return Err;
2135
2136 llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
2137 if (!result) {
2138 return result.takeError();
2139 }
2140
2141 llvm::CanonicalLoopInfo *CLI = result.get();
2142 OutlineInfo OI;
2143 OI.EntryBB = TaskloopAllocaBB;
2144 OI.OuterAllocaBB = AllocaIP.getBlock();
2145 OI.ExitBB = TaskloopExitBB;
2146
2147 // Add the thread ID argument.
2148 SmallVector<Instruction *> ToBeDeleted;
2149 // dummy instruction to be used as a fake argument
2150 OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
2151 Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
// Placeholder bounds; replaced with the real (casted) values in the
// post-outline callback below, then the fakes are erased.
2152 Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2153 TaskloopAllocaIP, "lb", false, true);
2154 Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2155 TaskloopAllocaIP, "ub", false, true);
2156 Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
2157 TaskloopAllocaIP, "step", false, true);
2158 // For Taskloop, we want to force the bounds being the first 3 inputs in the
2159 // aggregate struct
2160 OI.Inputs.insert(FakeLB);
2161 OI.Inputs.insert(FakeUB);
2162 OI.Inputs.insert(FakeStep);
2163 if (TaskContextStructPtrVal)
2164 OI.Inputs.insert(TaskContextStructPtrVal);
2165 assert(((TaskContextStructPtrVal && DupCB) ||
2166 (!TaskContextStructPtrVal && !DupCB)) &&
2167 "Task context struct ptr and duplication callback must be both set "
2168 "or both null");
2169
2170 // It isn't safe to run the duplication bodygen callback inside the post
2171 // outlining callback so this has to be run now before we know the real task
2172 // shareds structure type.
2173 unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
2174 Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
// Model of the shareds layout: {lb, ub, step, context ptr}; the context
// pointer therefore lives at index 3 (see PrivatesIndex below).
2175 Type *FakeSharedsTy = StructType::get(
2176 Builder.getContext(),
2177 {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
2178 Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
2179 FakeSharedsTy,
2180 /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
2181 if (!TaskDupFnOrErr) {
2182 return TaskDupFnOrErr.takeError();
2183 }
2184 Value *TaskDupFn = *TaskDupFnOrErr;
2185
// Deferred work: after CodeExtractor produces the outlined function, replace
// the placeholder call (StaleCI) with the task-alloc/taskloop sequence and
// rewire the loop inside the outlined function to the task's bounds.
2186 OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
2187 TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
2188 IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
2189 FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
2190 NumOfCollapseLoops](Function &OutlinedFn) mutable {
2191 // Replace the Stale CI by appropriate RTL function call.
2192 assert(OutlinedFn.hasOneUse() &&
2193 "there must be a single user for the outlined function");
2194 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2195
2196 /* Create the casting for the Bounds Values that can be used when outlining
2197 * to replace the uses of the fakes with real values */
2198 BasicBlock *CodeReplBB = StaleCI->getParent();
2199 Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
2200 Value *CastedLBVal =
2201 Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
2202 Value *CastedUBVal =
2203 Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
2204 Value *CastedStepVal =
2205 Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");
2206
2207 Builder.SetInsertPoint(StaleCI);
2208
2209 // Gather the arguments for emitting the runtime call for
2210 // @__kmpc_omp_task_alloc
2211 Function *TaskAllocFn =
2212 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2213
2214 Value *ThreadID = getOrCreateThreadID(Ident);
2215
2216 if (!NoGroup) {
2217 // Emit runtime call for @__kmpc_taskgroup
2218 Function *TaskgroupFn =
2219 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2220 Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
2221 }
2222
2223 // `flags` Argument Configuration
2224 // Task is tied if (Flags & 1) == 1.
2225 // Task is untied if (Flags & 1) == 0.
2226 // Task is final if (Flags & 2) == 2.
2227 // Task is not final if (Flags & 2) == 0.
2228 // Task is mergeable if (Flags & 4) == 4.
2229 // Task is not mergeable if (Flags & 4) == 0.
2230 // Task is priority if (Flags & 32) == 32.
2231 // Task is not priority if (Flags & 32) == 0.
2232 Value *Flags = Builder.getInt32(Untied ? 0 : 1);
2233 if (Final)
2234 Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
2235 if (Mergeable)
2236 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2237 if (Priority)
2238 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2239
// sizeof(kmp_task_t) in bytes, rounded up from the bit size.
2240 Value *TaskSize = Builder.getInt64(
2241 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2242
2243 AllocaInst *ArgStructAlloca =
// NOTE(review): the initializer expression (orig line 2244, locating the
// shareds alloca from StaleCI's aggregate operand) is elided in this
// extraction of the file.
2245 assert(ArgStructAlloca &&
2246 "Unable to find the alloca instruction corresponding to arguments "
2247 "for extracted function");
2248 std::optional<TypeSize> ArgAllocSize =
2249 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2250 assert(ArgAllocSize &&
2251 "Unable to determine size of arguments for extracted function");
2252 Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2253
2254 // Emit the @__kmpc_omp_task_alloc runtime call
2255 // The runtime call returns a pointer to an area where the task captured
2256 // variables must be copied before the task is run (TaskData)
2257 CallInst *TaskData = Builder.CreateCall(
2258 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2259 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2260 /*task_func=*/&OutlinedFn});
2261
// Copy the captured variables (shareds) into the task's shareds area.
2262 Value *Shareds = StaleCI->getArgOperand(1);
2263 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2264 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2265 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2266 SharedsSize);
2267 // Get the pointer to loop lb, ub, step from task ptr
2268 // and set up the lowerbound,upperbound and step values
2269 llvm::Value *Lb = Builder.CreateGEP(
2270 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});
2271
2272 llvm::Value *Ub = Builder.CreateGEP(
2273 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});
2274
2275 llvm::Value *Step = Builder.CreateGEP(
2276 FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
2277 llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);
2278
2279 // set up the arguments for emitting kmpc_taskloop runtime call
2280 // setting values for ifval, nogroup, sched, grainsize, task_dup
2281 Value *IfCondVal =
2282 IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
2283 : Builder.getInt32(1);
2284 // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
2285 // always be 1 when calling __kmpc_taskloop to ensure it is not called again
2286 Value *NoGroupVal = Builder.getInt32(1);
2287 Value *SchedVal = Builder.getInt32(Sched);
2288 Value *GrainSizeVal =
2289 GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
2290 : Builder.getInt64(0);
2291 Value *TaskDup = TaskDupFn;
2292
2293 Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
2294 Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};
2295
2296 // taskloop runtime call
2297 Function *TaskloopFn =
2298 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
2299 Builder.CreateCall(TaskloopFn, Args);
2300
2301 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
2302 // nogroup is not defined
2303 if (!NoGroup) {
2304 Function *EndTaskgroupFn =
2305 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2306 Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
2307 }
2308
2309 StaleCI->eraseFromParent();
2310
// Inside the outlined function: arg 1 is the kmp_task_t*; replace its uses
// with a load of the shareds pointer it points to (excluding the load
// itself, which must keep using the raw argument).
2311 Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());
2312
2313 LoadInst *SharedsOutlined =
2314 Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2315 OutlinedFn.getArg(1)->replaceUsesWithIf(
2316 SharedsOutlined,
2317 [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });
2318
2319 Value *IV = CLI->getIndVar();
2320 Type *IVTy = IV->getType();
2321 Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);
2322
2323 // When outlining, CodeExtractor will create GEP's to the LowerBound and
2324 // UpperBound. These GEP's can be reused for loading the tasks respective
2325 // bounds.
2326 Value *TaskLB = nullptr;
2327 Value *TaskUB = nullptr;
2328 Value *TaskStep = nullptr;
2329 Value *LoadTaskLB = nullptr;
2330 Value *LoadTaskUB = nullptr;
2331 Value *LoadTaskStep = nullptr;
// Identify the GEPs/loads for lb (field 0), ub (field 1), step (field 2) by
// their constant struct index — relies on the forced field order above.
2332 for (Instruction &I : *TaskloopAllocaBB) {
2333 if (I.getOpcode() == Instruction::GetElementPtr) {
2334 GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
2335 if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
2336 switch (CI->getZExtValue()) {
2337 case 0:
2338 TaskLB = &I;
2339 break;
2340 case 1:
2341 TaskUB = &I;
2342 break;
2343 case 2:
2344 TaskStep = &I;
2345 break;
2346 }
2347 }
2348 } else if (I.getOpcode() == Instruction::Load) {
2349 LoadInst &Load = cast<LoadInst>(I);
2350 if (Load.getPointerOperand() == TaskLB) {
2351 assert(TaskLB != nullptr && "Expected value for TaskLB");
2352 LoadTaskLB = &I;
2353 } else if (Load.getPointerOperand() == TaskUB) {
2354 assert(TaskUB != nullptr && "Expected value for TaskUB");
2355 LoadTaskUB = &I;
2356 } else if (Load.getPointerOperand() == TaskStep) {
2357 assert(TaskStep != nullptr && "Expected value for TaskStep");
2358 LoadTaskStep = &I;
2359 }
2360 }
2361 }
2362
2363 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
2364
2365 assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
2366 assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
2367 assert(LoadTaskStep != nullptr && "Expected value for LoadTaskStep");
// Recompute the trip count from the task's chunk: (ub - lb) / step + 1.
2368 Value *TripCountMinusOne = Builder.CreateSDiv(
2369 Builder.CreateSub(LoadTaskUB, LoadTaskLB), LoadTaskStep);
2370 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
2371 Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
2372 Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
2373 // set the trip count in the CLI
2374 CLI->setTripCount(CastedTripCount);
2375
2376 Builder.SetInsertPoint(CLI->getBody(),
2377 CLI->getBody()->getFirstInsertionPt());
2378
2379 if (NumOfCollapseLoops > 1) {
2380 llvm::SmallVector<User *> UsersToReplace;
2381 // When using the collapse clause, the bounds of the loop have to be
2382 // adjusted to properly represent the iterator of the outer loop.
2383 Value *IVPlusTaskLB = Builder.CreateAdd(
2384 CLI->getIndVar(),
2385 Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
2386 // To ensure every Use is correctly captured, we first want to record
2387 // which users to replace the value in, and then replace the value.
2388 for (auto IVUse = CLI->getIndVar()->uses().begin();
2389 IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
2390 User *IVUser = IVUse->getUser();
2391 if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
2392 if (Op->getOpcode() == Instruction::URem ||
2393 Op->getOpcode() == Instruction::UDiv) {
2394 UsersToReplace.push_back(IVUser);
2395 }
2396 }
2397 }
2398 for (User *User : UsersToReplace) {
2399 User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
2400 }
2401 } else {
2402 // The canonical loop is generated with a fixed lower bound. We need to
2403 // update the index calculation code to use the task's lower bound. The
2404 // generated code looks like this:
2405 // %omp_loop.iv = phi ...
2406 // ...
2407 // %tmp = mul [type] %omp_loop.iv, step
2408 // %user_index = add [type] tmp, lb
2409 // OpenMPIRBuilder constructs canonical loops to have exactly three uses
2410 // of the normalised induction variable:
2411 // 1. This one: converting the normalised IV to the user IV
2412 // 2. The increment (add)
2413 // 3. The comparison against the trip count (icmp)
2414 // (1) is the only use that is a mul followed by an add so this cannot
2415 // match other IR.
2416 assert(CLI->getIndVar()->getNumUses() == 3 &&
2417 "Canonical loop should have exactly three uses of the ind var");
2418 for (User *IVUser : CLI->getIndVar()->users()) {
2419 if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
2420 if (Mul->getOpcode() == Instruction::Mul) {
2421 for (User *MulUser : Mul->users()) {
2422 if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
2423 if (Add->getOpcode() == Instruction::Add) {
2424 Add->setOperand(1, CastedTaskLB);
2425 }
2426 }
2427 }
2428 }
2429 }
2430 }
2431 }
2432
// Swap the placeholder bounds for the real values, then drop the fakes.
2433 FakeLB->replaceAllUsesWith(CastedLBVal);
2434 FakeUB->replaceAllUsesWith(CastedUBVal);
2435 FakeStep->replaceAllUsesWith(CastedStepVal);
2436 for (Instruction *I : llvm::reverse(ToBeDeleted)) {
2437 I->eraseFromParent();
2438 }
2439 };
2440
2441 addOutlineInfo(std::move(OI));
2442 Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
2443 return Builder.saveIP();
2444 }
2445
// NOTE(review): the signature and first statement of this accessor (orig
// lines 2446-2447) are elided in this extraction. From what remains, it
// builds and returns a struct of two pointer-sized integers and an i32 —
// presumably a runtime record type; verify against the declaration in
// OMPIRBuilder.h.
2448 M.getContext(), M.getDataLayout().getPointerSizeInBits());
2449 return llvm::StructType::get(IntPtrTy, IntPtrTy,
2450 llvm::Type::getInt32Ty(M.getContext()));
2451 }
2452
2454 const LocationDescription &Loc, InsertPointTy AllocaIP,
2455 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2456 const DependenciesInfo &Dependencies, const AffinityData &Affinities,
2457 bool Mergeable, Value *EventHandle, Value *Priority) {
2458
2459 if (!updateToLocation(Loc))
2460 return InsertPointTy();
2461
2462 uint32_t SrcLocStrSize;
2463 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2464 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2465 // The current basic block is split into four basic blocks. After outlining,
2466 // they will be mapped as follows:
2467 // ```
2468 // def current_fn() {
2469 // current_basic_block:
2470 // br label %task.exit
2471 // task.exit:
2472 // ; instructions after task
2473 // }
2474 // def outlined_fn() {
2475 // task.alloca:
2476 // br label %task.body
2477 // task.body:
2478 // ret void
2479 // }
2480 // ```
2481 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2482 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2483 BasicBlock *TaskAllocaBB =
2484 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2485
2486 InsertPointTy TaskAllocaIP =
2487 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2488 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2489 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2490 return Err;
2491
2492 OutlineInfo OI;
2493 OI.EntryBB = TaskAllocaBB;
2494 OI.OuterAllocaBB = AllocaIP.getBlock();
2495 OI.ExitBB = TaskExitBB;
2496
2497 // Add the thread ID argument.
2500 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2501
2502 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2503 Affinities, Mergeable, Priority, EventHandle,
2504 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
2505 // Replace the Stale CI by appropriate RTL function call.
2506 assert(OutlinedFn.hasOneUse() &&
2507 "there must be a single user for the outlined function");
2508 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2509
2510 // HasShareds is true if any variables are captured in the outlined region,
2511 // false otherwise.
2512 bool HasShareds = StaleCI->arg_size() > 1;
2513 Builder.SetInsertPoint(StaleCI);
2514
2515 // Gather the arguments for emitting the runtime call for
2516 // @__kmpc_omp_task_alloc
2517 Function *TaskAllocFn =
2518 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2519
2520 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2521 // call.
2522 Value *ThreadID = getOrCreateThreadID(Ident);
2523
2524 // Argument - `flags`
2525 // Task is tied iff (Flags & 1) == 1.
2526 // Task is untied iff (Flags & 1) == 0.
2527 // Task is final iff (Flags & 2) == 2.
2528 // Task is not final iff (Flags & 2) == 0.
2529 // Task is mergeable iff (Flags & 4) == 4.
2530 // Task is not mergeable iff (Flags & 4) == 0.
2531 // Task is priority iff (Flags & 32) == 32.
2532 // Task is not priority iff (Flags & 32) == 0.
2533 // TODO: Handle the other flags.
2534 Value *Flags = Builder.getInt32(Tied);
2535 if (Final) {
2536 Value *FinalFlag =
2537 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2538 Flags = Builder.CreateOr(FinalFlag, Flags);
2539 }
2540
2541 if (Mergeable)
2542 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2543 if (Priority)
2544 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2545
2546 // Argument - `sizeof_kmp_task_t` (TaskSize)
2547 // Tasksize refers to the size in bytes of kmp_task_t data structure
2548 // including private vars accessed in task.
2549 // TODO: add kmp_task_t_with_privates (privates)
2550 Value *TaskSize = Builder.getInt64(
2551 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2552
2553 // Argument - `sizeof_shareds` (SharedsSize)
2554 // SharedsSize refers to the shareds array size in the kmp_task_t data
2555 // structure.
2556 Value *SharedsSize = Builder.getInt64(0);
2557 if (HasShareds) {
2558 AllocaInst *ArgStructAlloca =
2560 assert(ArgStructAlloca &&
2561 "Unable to find the alloca instruction corresponding to arguments "
2562 "for extracted function");
2563 std::optional<TypeSize> ArgAllocSize =
2564 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2565 assert(ArgAllocSize &&
2566 "Unable to determine size of arguments for extracted function");
2567 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2568 }
2569 // Emit the @__kmpc_omp_task_alloc runtime call
2570 // The runtime call returns a pointer to an area where the task captured
2571 // variables must be copied before the task is run (TaskData)
2573 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2574 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2575 /*task_func=*/&OutlinedFn});
2576
2577 if (Affinities.Count && Affinities.Info) {
2579 OMPRTL___kmpc_omp_reg_task_with_affinity);
2580
2581 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2582 Affinities.Count, Affinities.Info});
2583 }
2584
2585 // Emit detach clause initialization.
2586 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2587 // task_descriptor);
2588 if (EventHandle) {
2590 OMPRTL___kmpc_task_allow_completion_event);
2591 llvm::Value *EventVal =
2592 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2593 llvm::Value *EventHandleAddr =
2594 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2595 Builder.getPtrTy(0));
2596 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2597 Builder.CreateStore(EventVal, EventHandleAddr);
2598 }
2599 // Copy the arguments for outlined function
2600 if (HasShareds) {
2601 Value *Shareds = StaleCI->getArgOperand(1);
2602 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2603 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2604 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2605 SharedsSize);
2606 }
2607
2608 if (Priority) {
2609 //
2610 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2611 // we populate the priority information into the "kmp_task_t" here
2612 //
2613 // The struct "kmp_task_t" definition is available in kmp.h
2614 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2615 // data2 is used for priority
2616 //
2617 Type *Int32Ty = Builder.getInt32Ty();
2618 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2619 // kmp_task_t* => { ptr }
2620 Type *TaskPtr = StructType::get(VoidPtr);
2621 Value *TaskGEP =
2622 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2623 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2624 Type *TaskStructType = StructType::get(
2625 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2626 Value *PriorityData = Builder.CreateInBoundsGEP(
2627 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2628 // kmp_cmplrdata_t => { ptr, ptr }
2629 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2630 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2631 PriorityData, {Zero, Zero});
2632 Builder.CreateStore(Priority, CmplrData);
2633 }
2634
2635 Value *DepArray = nullptr;
2636 Value *NumDeps = nullptr;
2637 if (Dependencies.DepArray) {
2638 DepArray = Dependencies.DepArray;
2639 NumDeps = Dependencies.NumDeps;
2640 } else if (!Dependencies.Deps.empty()) {
2641 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
2642 NumDeps = Builder.getInt32(Dependencies.Deps.size());
2643 }
2644
2645 // In the presence of the `if` clause, the following IR is generated:
2646 // ...
2647 // %data = call @__kmpc_omp_task_alloc(...)
2648 // br i1 %if_condition, label %then, label %else
2649 // then:
2650 // call @__kmpc_omp_task(...)
2651 // br label %exit
2652 // else:
2653 // ;; Wait for resolution of dependencies, if any, before
2654 // ;; beginning the task
2655 // call @__kmpc_omp_wait_deps(...)
2656 // call @__kmpc_omp_task_begin_if0(...)
2657 // call @outlined_fn(...)
2658 // call @__kmpc_omp_task_complete_if0(...)
2659 // br label %exit
2660 // exit:
2661 // ...
2662 if (IfCondition) {
2663 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2664 // terminator.
2665 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2666 Instruction *IfTerminator =
2667 Builder.GetInsertPoint()->getParent()->getTerminator();
2668 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2669 Builder.SetInsertPoint(IfTerminator);
2670 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2671 &ElseTI);
2672 Builder.SetInsertPoint(ElseTI);
2673
2674 if (DepArray) {
2675 Function *TaskWaitFn =
2676 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2678 TaskWaitFn,
2679 {Ident, ThreadID, NumDeps, DepArray,
2680 ConstantInt::get(Builder.getInt32Ty(), 0),
2682 }
2683 Function *TaskBeginFn =
2684 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2685 Function *TaskCompleteFn =
2686 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2687 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2688 CallInst *CI = nullptr;
2689 if (HasShareds)
2690 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2691 else
2692 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2693 CI->setDebugLoc(StaleCI->getDebugLoc());
2694 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2695 Builder.SetInsertPoint(ThenTI);
2696 }
2697
2698 if (DepArray) {
2699 Function *TaskFn =
2700 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2702 TaskFn,
2703 {Ident, ThreadID, TaskData, NumDeps, DepArray,
2704 ConstantInt::get(Builder.getInt32Ty(), 0),
2706
2707 } else {
2708 // Emit the @__kmpc_omp_task runtime call to spawn the task
2709 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2710 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2711 }
2712
2713 StaleCI->eraseFromParent();
2714
2715 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2716 if (HasShareds) {
2717 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2718 OutlinedFn.getArg(1)->replaceUsesWithIf(
2719 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2720 }
2721
2722 for (Instruction *I : llvm::reverse(ToBeDeleted))
2723 I->eraseFromParent();
2724 };
2725
2726 addOutlineInfo(std::move(OI));
2727 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2728
2729 return Builder.saveIP();
2730}
2731
// Emits a `taskgroup` region: __kmpc_taskgroup(ident, tid) before the user
// body and __kmpc_end_taskgroup(ident, tid) after it; the body itself is
// produced by the BodyGenCB callback.
// NOTE(review): the first line(s) of this signature (listing lines
// 2732-2733, return type and function name) are missing from this capture —
// presumably OpenMPIRBuilder::createTaskgroup; confirm against the header.
2734 InsertPointTy AllocaIP,
2735 BodyGenCallbackTy BodyGenCB) {
// Give up early (returning an invalid insert point) if the caller's
// location/insert point is unusable.
2736 if (!updateToLocation(Loc))
2737 return InsertPointTy();
2738
// Build the ident_t source-location argument and the OpenMP thread id;
// both are passed to every __kmpc_* entry point emitted below.
2739 uint32_t SrcLocStrSize;
2740 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2741 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2742 Value *ThreadID = getOrCreateThreadID(Ident);
2743
2744 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2745 Function *TaskgroupFn =
2746 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2747 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2748
// Split off a continuation block so the body has a well-defined exit, then
// let the callback emit the taskgroup body; errors propagate unchanged.
2749 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2750 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2751 return Err;
2752
2753 Builder.SetInsertPoint(TaskgroupExitBB);
2754 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2755 Function *EndTaskgroupFn =
2756 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2757 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2758
2759 return Builder.saveIP();
2760 }
2761
// Emits a `sections` worksharing construct by lowering it to a statically
// scheduled canonical loop whose body is a switch over the section index.
// NOTE(review): the signature's first line (listing 2762, return type and
// function name — presumably OpenMPIRBuilder::createSections) and the
// parameter line 2764 (presumably the SectionCBs callback list) are missing
// from this capture.
2763 const LocationDescription &Loc, InsertPointTy AllocaIP,
2765 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2766 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2767
2768 if (!updateToLocation(Loc))
2769 return Loc.IP;
2770
// Register the finalization callback so nested cancellation points can
// find it; popped again after the loop is built (see below).
2771 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2772
2773 // Each section is emitted as a switch case
2774 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2775 // -> OMP.createSection() which generates the IR for each section
2776 // Iterate through all sections and emit a switch construct:
2777 // switch (IV) {
2778 // case 0:
2779 // <SectionStmt[0]>;
2780 // break;
2781 // ...
2782 // case <NumSection> - 1:
2783 // <SectionStmt[<NumSection> - 1]>;
2784 // break;
2785 // }
2786 // ...
2787 // section_loop.after:
2788 // <FiniCB>;
2789 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2790 Builder.restoreIP(CodeGenIP);
// NOTE(review): listing line 2791 is missing here — presumably the LHS of
// this call, e.g. `BasicBlock *Continue =`; confirm against upstream.
2792 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2793 Function *CurFn = Continue->getParent();
2794 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2795
2796 unsigned CaseNumber = 0;
2797 for (auto SectionCB : SectionCBs) {
// NOTE(review): listing line 2798 is missing — presumably
// `BasicBlock *CaseBB = BasicBlock::Create(`.
2799 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2800 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2801 Builder.SetInsertPoint(CaseBB);
// NOTE(review): `UncondBrInst` is not an LLVM class name — upstream uses
// `Instruction *` (or `BranchInst *`) for the result of CreateBr; this
// looks like a transcription error in the listing. Confirm and fix.
2802 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
2803 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2804 CaseEndBr->getIterator()}))
2805 return Err;
2806 CaseNumber++;
2807 }
2808 // remove the existing terminator from body BB since there can be no
2809 // terminators after switch/case
2810 return Error::success();
2811 };
2812 // Loop body ends here
2813 // LowerBound, UpperBound, and Stride for createCanonicalLoop
2814 Type *I32Ty = Type::getInt32Ty(M.getContext());
2815 Value *LB = ConstantInt::get(I32Ty, 0);
2816 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2817 Value *ST = ConstantInt::get(I32Ty, 1);
// NOTE(review): listing line 2818 is missing — presumably
// `Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(`.
2819 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2820 if (!LoopInfo)
2821 return LoopInfo.takeError();
2822
// Lower the canonical loop to a statically scheduled workshare loop; a
// trailing barrier is emitted unless `nowait` was requested.
2823 InsertPointOrErrorTy WsloopIP =
2824 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2825 WorksharingLoopType::ForStaticLoop, !IsNowait);
2826 if (!WsloopIP)
2827 return WsloopIP.takeError();
2828 InsertPointTy AfterIP = *WsloopIP;
2829
2830 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2831 assert(LoopFini && "Bad structure of static workshare loop finalization");
2832
2833 // Apply the finalization callback in LoopAfterBB
2834 auto FiniInfo = FinalizationStack.pop_back_val();
2835 assert(FiniInfo.DK == OMPD_sections &&
2836 "Unexpected finalization stack state!");
2837 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2838 return Err;
2839
2840 return AfterIP;
2841 }
2842
// Emits a single `section` inside a `sections` construct as an inlined
// region, wrapping FiniCB so that cancellation exits are terminated with a
// branch to the enclosing construct's exit block.
// NOTE(review): the signature head (listing lines 2843-2844, return type,
// function name — presumably OpenMPIRBuilder::createSection — and the Loc
// parameter) is missing from this capture.
2845 BodyGenCallbackTy BodyGenCB,
2846 FinalizeCallbackTy FiniCB) {
2847 if (!updateToLocation(Loc))
2848 return Loc.IP;
2849
2850 auto FiniCBWrapper = [&](InsertPointTy IP) {
// Fast path: the finalization point already has instructions after it, so
// the block still has a terminator and FiniCB can run as-is.
2851 if (IP.getBlock()->end() != IP.getPoint())
2852 return FiniCB(IP);
2853 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2854 // will fail because that function requires the Finalization Basic Block to
2855 // have a terminator, which is already removed by EmitOMPRegionBody.
2856 // IP is currently at cancelation block.
2857 // We need to backtrack to the condition block to fetch
2858 // the exit block and create a branch from cancelation
2859 // to exit block.
// NOTE(review): listing line 2860 is missing here — upstream has an
// `IRBuilder<>::InsertPointGuard` at this spot; confirm.
2861 Builder.restoreIP(IP);
2862 auto *CaseBB = Loc.IP.getBlock();
2863 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2864 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2865 Instruction *I = Builder.CreateBr(ExitBB);
2866 IP = InsertPointTy(I->getParent(), I->getIterator());
2867 return FiniCB(IP);
2868 };
2869
2870 Directive OMPD = Directive::OMPD_sections;
2871 // Since we are using Finalization Callback here, HasFinalize
2872 // and IsCancellable have to be true
2873 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2874 /*Conditional*/ false, /*hasFinalize*/ true,
2875 /*IsCancellable*/ true);
2876 }
2877
2883
// Returns the hardware thread id within the current block/CTA by calling
// the device runtime entry __kmpc_get_hardware_thread_id_in_block.
// NOTE(review): listing lines 2885-2886 are missing from this capture —
// presumably `return createRuntimeFunctionCall(` /
// `getOrCreateRuntimeFunction(M,`; confirm against upstream.
2884Value *OpenMPIRBuilder::getGPUThreadID() {
2887 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2888 {});
2889 }
2890
// Returns the device warp size by calling the runtime entry
// __kmpc_get_warp_size.
// NOTE(review): listing line 2892 is missing from this capture —
// presumably `return createRuntimeFunctionCall(`; confirm.
2891Value *OpenMPIRBuilder::getGPUWarpSize() {
2893 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2894 }
2895
2896Value *OpenMPIRBuilder::getNVPTXWarpID() {
2897 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2898 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2899}
2900
2901Value *OpenMPIRBuilder::getNVPTXLaneID() {
2902 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2903 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2904 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2905 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2906 "nvptx_lane_id");
2907}
2908
2909Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2910 Type *ToType) {
2911 Type *FromType = From->getType();
2912 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2913 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2914 assert(FromSize > 0 && "From size must be greater than zero");
2915 assert(ToSize > 0 && "To size must be greater than zero");
2916 if (FromType == ToType)
2917 return From;
2918 if (FromSize == ToSize)
2919 return Builder.CreateBitCast(From, ToType);
2920 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2921 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2922 InsertPointTy SaveIP = Builder.saveIP();
2923 Builder.restoreIP(AllocaIP);
2924 Value *CastItem = Builder.CreateAlloca(ToType);
2925 Builder.restoreIP(SaveIP);
2926
2927 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2928 CastItem, Builder.getPtrTy(0));
2929 Builder.CreateStore(From, ValCastItem);
2930 return Builder.CreateLoad(ToType, CastItem);
2931}
2932
// Emits a call to the device shuffle runtime (__kmpc_shuffle_int32/int64)
// to move one reduction element (at most 8 bytes) across lanes at the
// given \p Offset; the value is widened to 32/64 bits before the call and
// converted back afterwards via castValueToType.
2933Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2934 Value *Element,
2935 Type *ElementType,
2936 Value *Offset) {
2937 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2938 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2939
2940 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2941 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2942 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2943 Value *WarpSize =
2944 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
// NOTE(review): listing line 2945 is missing from this capture —
// presumably `Function *ShuffleFunc = getOrCreateRuntimeFunctionPtr(`
// selecting between the 32- and 64-bit shuffle entry points; confirm.
2946 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2947 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2948 Value *WarpSizeCast =
2949 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2950 Value *ShuffleCall =
2951 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2952 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2953 }
2954
// Shuffles an element of arbitrary size from \p SrcAddr to \p DstAddr by
// decomposing it into 8/4/2/1-byte chunks; chunks repeated more than once
// are moved with an emitted loop (pre_cond/then/exit blocks with PHIs),
// single chunks inline.
2955void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
2956 Value *DstAddr, Type *ElemType,
2957 Value *Offset, Type *ReductionArrayTy,
2958 bool IsByRefElem) {
2959 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
2960 // Create the loop over the big sized data.
2961 // ptr = (void*)Elem;
2962 // ptrEnd = (void*) Elem + 1;
2963 // Step = 8;
2964 // while (ptr + Step < ptrEnd)
2965 // shuffle((int64_t)*ptr);
2966 // Step = 4;
2967 // while (ptr + Step < ptrEnd)
2968 // shuffle((int32_t)*ptr);
2969 // ...
2970 Type *IndexTy = Builder.getIndexTy(
2971 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
2972 Value *ElemPtr = DstAddr;
2973 Value *Ptr = SrcAddr;
// Try chunk sizes 8, 4, 2, 1; any remainder falls through to the next
// smaller size (Size is reduced modulo IntSize at the bottom of the loop).
2974 for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
2975 if (Size < IntSize)
2976 continue;
2977 Type *IntType = Builder.getIntNTy(IntSize * 8);
2978 Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2979 Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
2980 Value *SrcAddrGEP =
2981 Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
2982 ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
2983 ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");
2984
2985 Function *CurFunc = Builder.GetInsertBlock()->getParent();
// More than one chunk of this size: emit a loop with src/dst PHIs that
// advances both pointers until fewer than IntSize bytes remain.
2986 if ((Size / IntSize) > 1) {
2987 Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
2988 SrcAddrGEP, Builder.getPtrTy());
2989 BasicBlock *PreCondBB =
2990 BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
2991 BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
2992 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
2993 BasicBlock *CurrentBB = Builder.GetInsertBlock();
2994 emitBlock(PreCondBB, CurFunc);
2995 PHINode *PhiSrc =
2996 Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
2997 PhiSrc->addIncoming(Ptr, CurrentBB);
2998 PHINode *PhiDest =
2999 Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
3000 PhiDest->addIncoming(ElemPtr, CurrentBB);
3001 Ptr = PhiSrc;
3002 ElemPtr = PhiDest;
// Loop condition: remaining bytes (PtrEnd - Ptr) must cover a full chunk.
3003 Value *PtrDiff = Builder.CreatePtrDiff(
3004 Builder.getInt8Ty(), PtrEnd,
3005 Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
3006 Builder.CreateCondBr(
3007 Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
3008 ExitBB);
3009 emitBlock(ThenBB, CurFunc);
3010 Value *Res = createRuntimeShuffleFunction(
3011 AllocaIP,
3012 Builder.CreateAlignedLoad(
3013 IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
3014 IntType, Offset);
3015 Builder.CreateAlignedStore(Res, ElemPtr,
3016 M.getDataLayout().getPrefTypeAlign(ElemType));
3017 Value *LocalPtr =
3018 Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3019 Value *LocalElemPtr =
3020 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3021 PhiSrc->addIncoming(LocalPtr, ThenBB);
3022 PhiDest->addIncoming(LocalElemPtr, ThenBB);
3023 emitBranch(PreCondBB);
3024 emitBlock(ExitBB, CurFunc);
3025 } else {
// Exactly one chunk of this size: shuffle inline, truncating if the
// shuffle widened a narrow integer element.
3026 Value *Res = createRuntimeShuffleFunction(
3027 AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
3028 if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
3029 Res->getType()->getScalarSizeInBits())
3030 Res = Builder.CreateTrunc(Res, ElemType);
3031 Builder.CreateStore(Res, ElemPtr);
3032 Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
3033 ElemPtr =
3034 Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
3035 }
3036 Size = Size % IntSize;
3037 }
3038 }
3039
// Copies each element of a reduction list from \p SrcBase to \p DestBase,
// either by shuffling the value in from a remote lane (allocating a fresh
// destination element) or by a plain thread-local copy, honouring scalar /
// complex / aggregate evaluation kinds and by-ref elements.
3040Error OpenMPIRBuilder::emitReductionListCopy(
3041 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3042 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3043 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3044 Type *IndexTy = Builder.getIndexTy(
3045 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3046 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3047
3048 // Iterates, element-by-element, through the source Reduce list and
3049 // make a copy.
3050 for (auto En : enumerate(ReductionInfos)) {
3051 const ReductionInfo &RI = En.value();
3052 Value *SrcElementAddr = nullptr;
3053 AllocaInst *DestAlloca = nullptr;
3054 Value *DestElementAddr = nullptr;
3055 Value *DestElementPtrAddr = nullptr;
3056 // Should we shuffle in an element from a remote lane?
3057 bool ShuffleInElement = false;
3058 // Set to true to update the pointer in the dest Reduce list to a
3059 // newly created element.
3060 bool UpdateDestListPtr = false;
3061
3062 // Step 1.1: Get the address for the src element in the Reduce list.
3063 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3064 ReductionArrayTy, SrcBase,
3065 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3066 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3067
3068 // Step 1.2: Create a temporary to store the element in the destination
3069 // Reduce list.
3070 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3071 ReductionArrayTy, DestBase,
3072 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3073 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3074 switch (Action) {
// NOTE(review): listing line 3075 is missing here — presumably the first
// case label of this switch (a CopyAction enumerator, e.g.
// `case CopyAction::RemoteLaneToThread: {`); confirm against upstream.
3076 InsertPointTy CurIP = Builder.saveIP();
3077 Builder.restoreIP(AllocaIP);
3078
3079 Type *DestAllocaType =
3080 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3081 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3082 ".omp.reduction.element");
3083 DestAlloca->setAlignment(
3084 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3085 DestElementAddr = DestAlloca;
3086 DestElementAddr =
3087 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3088 DestElementAddr->getName() + ".ascast");
3089 Builder.restoreIP(CurIP);
3090 ShuffleInElement = true;
3091 UpdateDestListPtr = true;
3092 break;
3093 }
// NOTE(review): listing line 3094 is missing here — presumably the second
// case label (e.g. `case CopyAction::ThreadCopy: {`); confirm.
3095 DestElementAddr =
3096 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3097 break;
3098 }
3099 }
3100
3101 // Now that all active lanes have read the element in the
3102 // Reduce list, shuffle over the value from the remote lane.
3103 if (ShuffleInElement) {
3104 Type *ShuffleType = RI.ElementType;
3105 Value *ShuffleSrcAddr = SrcElementAddr;
3106 Value *ShuffleDestAddr = DestElementAddr;
3107 AllocaInst *LocalStorage = nullptr;
3108
3109 if (IsByRefElem) {
3110 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3111 assert(RI.ByRefAllocatedType &&
3112 "Expected by-ref allocated type to be set");
3113 // For by-ref reductions, we need to copy from the remote lane the
3114 // actual value of the partial reduction computed by that remote lane;
3115 // rather than, for example, a pointer to that data or, even worse, a
3116 // pointer to the descriptor of the by-ref reduction element.
3117 ShuffleType = RI.ByRefElementType;
3118
3119 if (RI.DataPtrPtrGen) {
3120 // Descriptor-based by-ref: extract data pointer from descriptor.
3121 InsertPointOrErrorTy GenResult = RI.DataPtrPtrGen(
3122 Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3123
3124 if (!GenResult)
3125 return GenResult.takeError();
3126
3127 ShuffleSrcAddr =
3128 Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3129
3130 {
3131 InsertPointTy OldIP = Builder.saveIP();
3132 Builder.restoreIP(AllocaIP);
3133
3134 LocalStorage = Builder.CreateAlloca(ShuffleType);
3135 Builder.restoreIP(OldIP);
3136 ShuffleDestAddr = LocalStorage;
3137 }
3138 } else {
3139 // Non-descriptor by-ref: the pointer already references data
3140 // directly. Shuffle into the destination alloca.
3141 ShuffleDestAddr = DestElementAddr;
3142 }
3143 }
3144
3145 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3146 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3147
3148 if (IsByRefElem && RI.DataPtrPtrGen) {
3149 // Copy descriptor from source and update base_ptr to shuffled data
3150 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3151 DestAlloca, Builder.getPtrTy(), ".ascast");
3152
3153 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3154 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3155 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3156
3157 if (!GenResult)
3158 return GenResult.takeError();
3159 }
3160 } else {
// Plain (non-shuffle) copy, dispatched on how the element is evaluated.
3161 switch (RI.EvaluationKind) {
3162 case EvalKind::Scalar: {
3163 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3164 // Store the source element value to the dest element address.
3165 Builder.CreateStore(Elem, DestElementAddr);
3166 break;
3167 }
3168 case EvalKind::Complex: {
3169 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3170 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3171 Value *SrcReal = Builder.CreateLoad(
3172 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3173 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3174 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3175 Value *SrcImg = Builder.CreateLoad(
3176 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3177
3178 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3179 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3180 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3181 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3182 Builder.CreateStore(SrcReal, DestRealPtr);
3183 Builder.CreateStore(SrcImg, DestImgPtr);
3184 break;
3185 }
3186 case EvalKind::Aggregate: {
3187 Value *SizeVal = Builder.getInt64(
3188 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3189 Builder.CreateMemCpy(
3190 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3191 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3192 SizeVal, false);
3193 break;
3194 }
3195 };
3196 }
3197
3198 // Step 3.1: Modify reference in dest Reduce list as needed.
3199 // Modifying the reference in Reduce list to point to the newly
3200 // created element. The element is live in the current function
3201 // scope and that of functions it invokes (i.e., reduce_function).
3202 // RemoteReduceData[i] = (void*)&RemoteElem
3203 if (UpdateDestListPtr) {
3204 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3205 DestElementAddr, Builder.getPtrTy(),
3206 DestElementAddr->getName() + ".ascast");
3207 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3208 }
3209 }
3210
3211 return Error::success();
3212 }
3213
// Builds the helper function `_omp_reduction_inter_warp_copy_func(ptr, i32)`
// which moves per-warp partial reduction values (held by lane 0 of each
// warp) through a shared-memory staging array into the lanes of warp 0, in
// 4/2/1-byte pieces, with barriers between the write and read phases.
3214Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3215 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3216 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3217 InsertPointTy SavedIP = Builder.saveIP();
3218 LLVMContext &Ctx = M.getContext();
3219 FunctionType *FuncTy = FunctionType::get(
3220 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3221 /* IsVarArg */ false);
3222 Function *WcFunc =
// NOTE(review): listing line 3223 is missing here — presumably
// `Function::Create(FuncTy, GlobalVariable::InternalLinkage,`; confirm.
3224 "_omp_reduction_inter_warp_copy_func", &M);
3225 WcFunc->setAttributes(FuncAttrs);
3226 WcFunc->addParamAttr(0, Attribute::NoUndef);
3227 WcFunc->addParamAttr(1, Attribute::NoUndef);
3228 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3229 Builder.SetInsertPoint(EntryBB);
3230
3231 // ReduceList: thread local Reduce list.
3232 // At the stage of the computation when this function is called, partially
3233 // aggregated values reside in the first lane of every active warp.
3234 Argument *ReduceListArg = WcFunc->getArg(0);
3235 // NumWarps: number of warps active in the parallel region. This could
3236 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3237 Argument *NumWarpsArg = WcFunc->getArg(1);
3238
3239 // This array is used as a medium to transfer, one reduce element at a time,
3240 // the data from the first lane of every warp to lanes in the first warp
3241 // in order to perform the final step of a reduction in a parallel region
3242 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3243 // for reduced latency, as well as to have a distinct copy for concurrently
3244 // executing target regions. The array is declared with common linkage so
3245 // as to be shared across compilation units.
3246 StringRef TransferMediumName =
3247 "__openmp_nvptx_data_transfer_temporary_storage";
3248 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3249 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3250 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3251 if (!TransferMedium) {
3252 TransferMedium = new GlobalVariable(
3253 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3254 UndefValue::get(ArrayTy), TransferMediumName,
3255 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3256 /*AddressSpace=*/3);
3257 }
3258
3259 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3260 Value *GPUThreadID = getGPUThreadID();
3261 // nvptx_lane_id = nvptx_id % warpsize
3262 Value *LaneID = getNVPTXLaneID();
3263 // nvptx_warp_id = nvptx_id / warpsize
3264 Value *WarpID = getNVPTXWarpID();
3265
// Spill the two arguments to entry-block allocas so later code can reload
// them through address-space-neutral pointers.
3266 InsertPointTy AllocaIP =
3267 InsertPointTy(Builder.GetInsertBlock(),
3268 Builder.GetInsertBlock()->getFirstInsertionPt());
3269 Type *Arg0Type = ReduceListArg->getType();
3270 Type *Arg1Type = NumWarpsArg->getType();
3271 Builder.restoreIP(AllocaIP);
3272 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3273 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3274 AllocaInst *NumWarpsAlloca =
3275 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3276 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3277 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3278 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3279 NumWarpsAlloca, Builder.getPtrTy(0),
3280 NumWarpsAlloca->getName() + ".ascast");
3281 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3282 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3283 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3284 InsertPointTy CodeGenIP =
3285 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3286 Builder.restoreIP(CodeGenIP);
3287
3288 Value *ReduceList =
3289 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3290
3291 for (auto En : enumerate(ReductionInfos)) {
3292 //
3293 // Warp master copies reduce element to transfer medium in __shared__
3294 // memory.
3295 //
3296 const ReductionInfo &RI = En.value();
3297 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3298 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3299 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
// Move the element in progressively smaller power-of-two pieces; pieces
// repeated more than once are driven by an emitted counter loop.
3300 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3301 Type *CType = Builder.getIntNTy(TySize * 8);
3302
3303 unsigned NumIters = RealTySize / TySize;
3304 if (NumIters == 0)
3305 continue;
3306 Value *Cnt = nullptr;
3307 Value *CntAddr = nullptr;
3308 BasicBlock *PrecondBB = nullptr;
3309 BasicBlock *ExitBB = nullptr;
3310 if (NumIters > 1) {
3311 CodeGenIP = Builder.saveIP();
3312 Builder.restoreIP(AllocaIP);
3313 CntAddr =
3314 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3315
3316 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3317 CntAddr->getName() + ".ascast");
3318 Builder.restoreIP(CodeGenIP);
3319 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3320 CntAddr,
3321 /*Volatile=*/false);
3322 PrecondBB = BasicBlock::Create(Ctx, "precond");
3323 ExitBB = BasicBlock::Create(Ctx, "exit");
3324 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3325 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3326 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3327 /*Volatile=*/false);
3328 Value *Cmp = Builder.CreateICmpULT(
3329 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3330 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3331 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3332 }
3333
3334 // kmpc_barrier.
3335 InsertPointOrErrorTy BarrierIP1 =
3336 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3337 omp::Directive::OMPD_unknown,
3338 /* ForceSimpleCall */ false,
3339 /* CheckCancelFlag */ true);
3340 if (!BarrierIP1)
3341 return BarrierIP1.takeError();
3342 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3343 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3344 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3345
3346 // if (lane_id == 0)
3347 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3348 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3349 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3350
3351 // Reduce element = LocalReduceList[i]
3352 auto *RedListArrayTy =
3353 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3354 Type *IndexTy = Builder.getIndexTy(
3355 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3356 Value *ElemPtrPtr =
3357 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3358 {ConstantInt::get(IndexTy, 0),
3359 ConstantInt::get(IndexTy, En.index())});
3360 // elemptr = ((CopyType*)(elemptrptr)) + I
3361 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3362
3363 if (IsByRefElem && RI.DataPtrPtrGen) {
3364 InsertPointOrErrorTy GenRes =
3365 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3366
3367 if (!GenRes)
3368 return GenRes.takeError();
3369
3370 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3371 }
3372
3373 if (NumIters > 1)
3374 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3375
3376 // Get pointer to location in transfer medium.
3377 // MediumPtr = &medium[warp_id]
3378 Value *MediumPtr = Builder.CreateInBoundsGEP(
3379 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3380 // elem = *elemptr
3381 //*MediumPtr = elem
3382 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3383 // Store the source element value to the dest element address.
3384 Builder.CreateStore(Elem, MediumPtr,
3385 /*IsVolatile*/ true);
3386 Builder.CreateBr(MergeBB);
3387
3388 // else
3389 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3390 Builder.CreateBr(MergeBB);
3391
3392 // endif
3393 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3394 InsertPointOrErrorTy BarrierIP2 =
3395 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3396 omp::Directive::OMPD_unknown,
3397 /* ForceSimpleCall */ false,
3398 /* CheckCancelFlag */ true);
3399 if (!BarrierIP2)
3400 return BarrierIP2.takeError();
3401
3402 // Warp 0 copies reduce element from transfer medium
3403 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3404 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3405 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3406
3407 Value *NumWarpsVal =
3408 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3409 // Up to 32 threads in warp 0 are active.
3410 Value *IsActiveThread =
3411 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3412 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3413
3414 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3415
3416 // SecMediumPtr = &medium[tid]
3417 // SrcMediumVal = *SrcMediumPtr
3418 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3419 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3420 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3421 Value *TargetElemPtrPtr =
3422 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3423 {ConstantInt::get(IndexTy, 0),
3424 ConstantInt::get(IndexTy, En.index())});
3425 Value *TargetElemPtrVal =
3426 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3427 Value *TargetElemPtr = TargetElemPtrVal;
3428
3429 if (IsByRefElem && RI.DataPtrPtrGen) {
3430 InsertPointOrErrorTy GenRes =
3431 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3432
3433 if (!GenRes)
3434 return GenRes.takeError();
3435
3436 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3437 }
3438
3439 if (NumIters > 1)
3440 TargetElemPtr =
3441 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3442
3443 // *TargetElemPtr = SrcMediumVal;
3444 Value *SrcMediumValue =
3445 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3446 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3447 Builder.CreateBr(W0MergeBB);
3448
3449 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3450 Builder.CreateBr(W0MergeBB);
3451
3452 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3453
// Counter-loop back edge: increment, branch to precond, then continue in
// the exit block.
3454 if (NumIters > 1) {
3455 Cnt = Builder.CreateNSWAdd(
3456 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3457 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3458
3459 auto *CurFn = Builder.GetInsertBlock()->getParent();
3460 emitBranch(PrecondBB);
3461 emitBlock(ExitBB, CurFn);
3462 }
3463 RealTySize %= TySize;
3464 }
3465 }
3466
3467 Builder.CreateRetVoid();
3468 Builder.restoreIP(SavedIP);
3469
3470 return WcFunc;
3471 }
3472
3473Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3474 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3475 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3476 LLVMContext &Ctx = M.getContext();
3477 FunctionType *FuncTy =
3478 FunctionType::get(Builder.getVoidTy(),
3479 {Builder.getPtrTy(), Builder.getInt16Ty(),
3480 Builder.getInt16Ty(), Builder.getInt16Ty()},
3481 /* IsVarArg */ false);
3482 Function *SarFunc =
3484 "_omp_reduction_shuffle_and_reduce_func", &M);
3485 SarFunc->setAttributes(FuncAttrs);
3486 SarFunc->addParamAttr(0, Attribute::NoUndef);
3487 SarFunc->addParamAttr(1, Attribute::NoUndef);
3488 SarFunc->addParamAttr(2, Attribute::NoUndef);
3489 SarFunc->addParamAttr(3, Attribute::NoUndef);
3490 SarFunc->addParamAttr(1, Attribute::SExt);
3491 SarFunc->addParamAttr(2, Attribute::SExt);
3492 SarFunc->addParamAttr(3, Attribute::SExt);
3493 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3494 Builder.SetInsertPoint(EntryBB);
3495
3496 // Thread local Reduce list used to host the values of data to be reduced.
3497 Argument *ReduceListArg = SarFunc->getArg(0);
3498 // Current lane id; could be logical.
3499 Argument *LaneIDArg = SarFunc->getArg(1);
3500 // Offset of the remote source lane relative to the current lane.
3501 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3502 // Algorithm version. This is expected to be known at compile time.
3503 Argument *AlgoVerArg = SarFunc->getArg(3);
3504
3505 Type *ReduceListArgType = ReduceListArg->getType();
3506 Type *LaneIDArgType = LaneIDArg->getType();
3507 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3508 Value *ReduceListAlloca = Builder.CreateAlloca(
3509 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3510 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3511 LaneIDArg->getName() + ".addr");
3512 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3513 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3514 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3515 AlgoVerArg->getName() + ".addr");
3516 ArrayType *RedListArrayTy =
3517 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3518
3519 // Create a local thread-private variable to host the Reduce list
3520 // from a remote lane.
3521 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3522 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3523
3524 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3525 ReduceListAlloca, ReduceListArgType,
3526 ReduceListAlloca->getName() + ".ascast");
3527 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3528 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3529 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3530 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3531 RemoteLaneOffsetAlloca->getName() + ".ascast");
3532 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3533 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3534 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3535 RemoteReductionListAlloca, Builder.getPtrTy(),
3536 RemoteReductionListAlloca->getName() + ".ascast");
3537
3538 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3539 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3540 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3541 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3542
3543 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3544 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3545 Value *RemoteLaneOffset =
3546 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3547 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3548
3549 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3550
3551 // This loop iterates through the list of reduce elements and copies,
3552 // element by element, from a remote lane in the warp to RemoteReduceList,
3553 // hosted on the thread's stack.
3554 Error EmitRedLsCpRes = emitReductionListCopy(
3555 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3556 ReduceList, RemoteListAddrCast, IsByRef,
3557 {RemoteLaneOffset, nullptr, nullptr});
3558
3559 if (EmitRedLsCpRes)
3560 return EmitRedLsCpRes;
3561
3562 // The actions to be performed on the Remote Reduce list is dependent
3563 // on the algorithm version.
3564 //
3565 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3566 // LaneId % 2 == 0 && Offset > 0):
3567 // do the reduction value aggregation
3568 //
3569 // The thread local variable Reduce list is mutated in place to host the
3570 // reduced data, which is the aggregated value produced from local and
3571 // remote lanes.
3572 //
3573 // Note that AlgoVer is expected to be a constant integer known at compile
3574 // time.
3575 // When AlgoVer==0, the first conjunction evaluates to true, making
3576 // the entire predicate true during compile time.
3577 // When AlgoVer==1, the second conjunction has only the second part to be
3578 // evaluated during runtime. Other conjunctions evaluates to false
3579 // during compile time.
3580 // When AlgoVer==2, the third conjunction has only the second part to be
3581 // evaluated during runtime. Other conjunctions evaluates to false
3582 // during compile time.
3583 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3584 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3585 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3586 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3587 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3588 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3589 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3590 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3591 Value *RemoteOffsetComp =
3592 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3593 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3594 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3595 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3596
3597 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3598 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3599 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3600
3601 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3602 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3603 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3604 ReduceList, Builder.getPtrTy());
3605 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3606 RemoteListAddrCast, Builder.getPtrTy());
3607 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3608 ->addFnAttr(Attribute::NoUnwind);
3609 Builder.CreateBr(MergeBB);
3610
3611 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3612 Builder.CreateBr(MergeBB);
3613
3614 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3615
3616 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3617 // Reduce list.
3618 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3619 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3620 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3621
3622 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3623 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3624 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3625 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3626
3627 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3628
3629 EmitRedLsCpRes = emitReductionListCopy(
3630 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3631 RemoteListAddrCast, ReduceList, IsByRef);
3632
3633 if (EmitRedLsCpRes)
3634 return EmitRedLsCpRes;
3635
3636 Builder.CreateBr(CpyMergeBB);
3637
3638 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3639 Builder.CreateBr(CpyMergeBB);
3640
3641 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3642
3643 Builder.CreateRetVoid();
3644
3645 return SarFunc;
3646}
3647
3649OpenMPIRBuilder::generateReductionDescriptor(
3650 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3651 Type *DescriptorType,
3652 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3653 DataPtrPtrGen) {
3654
3655 // Copy the source descriptor to preserve all metadata (rank, extents,
3656 // strides, etc.)
3657 Value *DescriptorSize =
3658 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3659 Builder.CreateMemCpy(
3660 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3661 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3662 DescriptorSize);
3663
3664 // Update the base pointer field to point to the local shuffled data
3665 Value *DataPtrField;
3666 InsertPointOrErrorTy GenResult =
3667 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3668
3669 if (!GenResult)
3670 return GenResult.takeError();
3671
3672 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3673 DataPtr, Builder.getPtrTy(), ".ascast"),
3674 DataPtrField);
3675
3676 return Builder.saveIP();
3677}
3678
3679Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3680 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3681 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3682 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3683 LLVMContext &Ctx = M.getContext();
3684 FunctionType *FuncTy = FunctionType::get(
3685 Builder.getVoidTy(),
3686 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3687 /* IsVarArg */ false);
3688 Function *LtGCFunc =
3690 "_omp_reduction_list_to_global_copy_func", &M);
3691 LtGCFunc->setAttributes(FuncAttrs);
3692 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3693 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3694 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3695
3696 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3697 Builder.SetInsertPoint(EntryBlock);
3698
3699 // Buffer: global reduction buffer.
3700 Argument *BufferArg = LtGCFunc->getArg(0);
3701 // Idx: index of the buffer.
3702 Argument *IdxArg = LtGCFunc->getArg(1);
3703 // ReduceList: thread local Reduce list.
3704 Argument *ReduceListArg = LtGCFunc->getArg(2);
3705
3706 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3707 BufferArg->getName() + ".addr");
3708 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3709 IdxArg->getName() + ".addr");
3710 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3711 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3712 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3713 BufferArgAlloca, Builder.getPtrTy(),
3714 BufferArgAlloca->getName() + ".ascast");
3715 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3716 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3717 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3718 ReduceListArgAlloca, Builder.getPtrTy(),
3719 ReduceListArgAlloca->getName() + ".ascast");
3720
3721 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3722 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3723 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3724
3725 Value *LocalReduceList =
3726 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3727 Value *BufferArgVal =
3728 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3729 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3730 Type *IndexTy = Builder.getIndexTy(
3731 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3732 for (auto En : enumerate(ReductionInfos)) {
3733 const ReductionInfo &RI = En.value();
3734 auto *RedListArrayTy =
3735 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3736 // Reduce element = LocalReduceList[i]
3737 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3738 RedListArrayTy, LocalReduceList,
3739 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3740 // elemptr = ((CopyType*)(elemptrptr)) + I
3741 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3742
3743 // Global = Buffer.VD[Idx];
3744 Value *BufferVD =
3745 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3746 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3747 ReductionsBufferTy, BufferVD, 0, En.index());
3748
3749 switch (RI.EvaluationKind) {
3750 case EvalKind::Scalar: {
3751 Value *TargetElement;
3752
3753 if (IsByRef.empty() || !IsByRef[En.index()]) {
3754 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3755 } else {
3756 if (RI.DataPtrPtrGen) {
3757 InsertPointOrErrorTy GenResult =
3758 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3759
3760 if (!GenResult)
3761 return GenResult.takeError();
3762
3763 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3764 }
3765 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3766 }
3767
3768 Builder.CreateStore(TargetElement, GlobVal);
3769 break;
3770 }
3771 case EvalKind::Complex: {
3772 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3773 RI.ElementType, ElemPtr, 0, 0, ".realp");
3774 Value *SrcReal = Builder.CreateLoad(
3775 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3776 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3777 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3778 Value *SrcImg = Builder.CreateLoad(
3779 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3780
3781 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3782 RI.ElementType, GlobVal, 0, 0, ".realp");
3783 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3784 RI.ElementType, GlobVal, 0, 1, ".imagp");
3785 Builder.CreateStore(SrcReal, DestRealPtr);
3786 Builder.CreateStore(SrcImg, DestImgPtr);
3787 break;
3788 }
3789 case EvalKind::Aggregate: {
3790 Value *SizeVal =
3791 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3792 Builder.CreateMemCpy(
3793 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3794 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3795 break;
3796 }
3797 }
3798 }
3799
3800 Builder.CreateRetVoid();
3801 Builder.restoreIP(OldIP);
3802 return LtGCFunc;
3803}
3804
3805Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3806 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3807 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3808 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3809 LLVMContext &Ctx = M.getContext();
3810 FunctionType *FuncTy = FunctionType::get(
3811 Builder.getVoidTy(),
3812 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3813 /* IsVarArg */ false);
3814 Function *LtGRFunc =
3816 "_omp_reduction_list_to_global_reduce_func", &M);
3817 LtGRFunc->setAttributes(FuncAttrs);
3818 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3819 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3820 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3821
3822 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3823 Builder.SetInsertPoint(EntryBlock);
3824
3825 // Buffer: global reduction buffer.
3826 Argument *BufferArg = LtGRFunc->getArg(0);
3827 // Idx: index of the buffer.
3828 Argument *IdxArg = LtGRFunc->getArg(1);
3829 // ReduceList: thread local Reduce list.
3830 Argument *ReduceListArg = LtGRFunc->getArg(2);
3831
3832 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3833 BufferArg->getName() + ".addr");
3834 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3835 IdxArg->getName() + ".addr");
3836 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3837 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3838 auto *RedListArrayTy =
3839 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3840
3841 // 1. Build a list of reduction variables.
3842 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3843 Value *LocalReduceList =
3844 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3845
3846 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3847
3848 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3849 BufferArgAlloca, Builder.getPtrTy(),
3850 BufferArgAlloca->getName() + ".ascast");
3851 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3852 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3853 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3854 ReduceListArgAlloca, Builder.getPtrTy(),
3855 ReduceListArgAlloca->getName() + ".ascast");
3856 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3857 LocalReduceList, Builder.getPtrTy(),
3858 LocalReduceList->getName() + ".ascast");
3859
3860 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3861 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3862 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3863
3864 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3865 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3866 Type *IndexTy = Builder.getIndexTy(
3867 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3868 for (auto En : enumerate(ReductionInfos)) {
3869 const ReductionInfo &RI = En.value();
3870
3871 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3872 RedListArrayTy, LocalReduceListAddrCast,
3873 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3874 Value *BufferVD =
3875 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3876 // Global = Buffer.VD[Idx];
3877 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3878 ReductionsBufferTy, BufferVD, 0, En.index());
3879
3880 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
3881 InsertPointTy OldIP = Builder.saveIP();
3882 Builder.restoreIP(AllocaIP);
3883
3884 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3885 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3886 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3887
3888 Builder.restoreIP(OldIP);
3889
3890 // Get source descriptor from the reduce list argument
3891 Value *ReduceList =
3892 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3893 Value *SrcElementPtrPtr =
3894 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3895 {ConstantInt::get(IndexTy, 0),
3896 ConstantInt::get(IndexTy, En.index())});
3897 Value *SrcDescriptorAddr =
3898 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3899
3900 // Copy descriptor from source and update base_ptr to global buffer data
3901 InsertPointOrErrorTy GenResult =
3902 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3903 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3904
3905 if (!GenResult)
3906 return GenResult.takeError();
3907
3908 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3909 } else {
3910 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3911 }
3912 }
3913
3914 // Call reduce_function(GlobalReduceList, ReduceList)
3915 Value *ReduceList =
3916 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3917 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3918 ->addFnAttr(Attribute::NoUnwind);
3919 Builder.CreateRetVoid();
3920 Builder.restoreIP(OldIP);
3921 return LtGRFunc;
3922}
3923
3924Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3925 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3926 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3927 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3928 LLVMContext &Ctx = M.getContext();
3929 FunctionType *FuncTy = FunctionType::get(
3930 Builder.getVoidTy(),
3931 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3932 /* IsVarArg */ false);
3933 Function *GtLCFunc =
3935 "_omp_reduction_global_to_list_copy_func", &M);
3936 GtLCFunc->setAttributes(FuncAttrs);
3937 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3938 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3939 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3940
3941 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3942 Builder.SetInsertPoint(EntryBlock);
3943
3944 // Buffer: global reduction buffer.
3945 Argument *BufferArg = GtLCFunc->getArg(0);
3946 // Idx: index of the buffer.
3947 Argument *IdxArg = GtLCFunc->getArg(1);
3948 // ReduceList: thread local Reduce list.
3949 Argument *ReduceListArg = GtLCFunc->getArg(2);
3950
3951 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3952 BufferArg->getName() + ".addr");
3953 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3954 IdxArg->getName() + ".addr");
3955 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3956 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3957 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3958 BufferArgAlloca, Builder.getPtrTy(),
3959 BufferArgAlloca->getName() + ".ascast");
3960 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3961 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3962 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3963 ReduceListArgAlloca, Builder.getPtrTy(),
3964 ReduceListArgAlloca->getName() + ".ascast");
3965 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3966 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3967 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3968
3969 Value *LocalReduceList =
3970 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3971 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3972 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3973 Type *IndexTy = Builder.getIndexTy(
3974 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3975 for (auto En : enumerate(ReductionInfos)) {
3976 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3977 auto *RedListArrayTy =
3978 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3979 // Reduce element = LocalReduceList[i]
3980 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3981 RedListArrayTy, LocalReduceList,
3982 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3983 // elemptr = ((CopyType*)(elemptrptr)) + I
3984 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3985 // Global = Buffer.VD[Idx];
3986 Value *BufferVD =
3987 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3988 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3989 ReductionsBufferTy, BufferVD, 0, En.index());
3990
3991 switch (RI.EvaluationKind) {
3992 case EvalKind::Scalar: {
3993 Type *ElemType = RI.ElementType;
3994
3995 if (!IsByRef.empty() && IsByRef[En.index()]) {
3996 ElemType = RI.ByRefElementType;
3997 if (RI.DataPtrPtrGen) {
3998 InsertPointOrErrorTy GenResult =
3999 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
4000
4001 if (!GenResult)
4002 return GenResult.takeError();
4003
4004 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
4005 }
4006 }
4007
4008 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
4009 Builder.CreateStore(TargetElement, ElemPtr);
4010 break;
4011 }
4012 case EvalKind::Complex: {
4013 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
4014 RI.ElementType, GlobValPtr, 0, 0, ".realp");
4015 Value *SrcReal = Builder.CreateLoad(
4016 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
4017 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
4018 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
4019 Value *SrcImg = Builder.CreateLoad(
4020 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
4021
4022 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
4023 RI.ElementType, ElemPtr, 0, 0, ".realp");
4024 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
4025 RI.ElementType, ElemPtr, 0, 1, ".imagp");
4026 Builder.CreateStore(SrcReal, DestRealPtr);
4027 Builder.CreateStore(SrcImg, DestImgPtr);
4028 break;
4029 }
4030 case EvalKind::Aggregate: {
4031 Value *SizeVal =
4032 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4033 Builder.CreateMemCpy(
4034 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4035 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4036 SizeVal, false);
4037 break;
4038 }
4039 }
4040 }
4041
4042 Builder.CreateRetVoid();
4043 Builder.restoreIP(OldIP);
4044 return GtLCFunc;
4045}
4046
4047Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4048 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4049 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4050 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4051 LLVMContext &Ctx = M.getContext();
4052 auto *FuncTy = FunctionType::get(
4053 Builder.getVoidTy(),
4054 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4055 /* IsVarArg */ false);
4056 Function *GtLRFunc =
4058 "_omp_reduction_global_to_list_reduce_func", &M);
4059 GtLRFunc->setAttributes(FuncAttrs);
4060 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4061 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4062 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4063
4064 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4065 Builder.SetInsertPoint(EntryBlock);
4066
4067 // Buffer: global reduction buffer.
4068 Argument *BufferArg = GtLRFunc->getArg(0);
4069 // Idx: index of the buffer.
4070 Argument *IdxArg = GtLRFunc->getArg(1);
4071 // ReduceList: thread local Reduce list.
4072 Argument *ReduceListArg = GtLRFunc->getArg(2);
4073
4074 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4075 BufferArg->getName() + ".addr");
4076 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4077 IdxArg->getName() + ".addr");
4078 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4079 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4080 ArrayType *RedListArrayTy =
4081 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4082
4083 // 1. Build a list of reduction variables.
4084 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4085 Value *LocalReduceList =
4086 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4087
4088 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4089
4090 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4091 BufferArgAlloca, Builder.getPtrTy(),
4092 BufferArgAlloca->getName() + ".ascast");
4093 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4094 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4095 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4096 ReduceListArgAlloca, Builder.getPtrTy(),
4097 ReduceListArgAlloca->getName() + ".ascast");
4098 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4099 LocalReduceList, Builder.getPtrTy(),
4100 LocalReduceList->getName() + ".ascast");
4101
4102 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4103 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4104 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4105
4106 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4107 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4108 Type *IndexTy = Builder.getIndexTy(
4109 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4110 for (auto En : enumerate(ReductionInfos)) {
4111 const ReductionInfo &RI = En.value();
4112
4113 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4114 RedListArrayTy, ReductionList,
4115 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4116 // Global = Buffer.VD[Idx];
4117 Value *BufferVD =
4118 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4119 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4120 ReductionsBufferTy, BufferVD, 0, En.index());
4121
4122 if (!IsByRef.empty() && IsByRef[En.index()] && RI.DataPtrPtrGen) {
4123 InsertPointTy OldIP = Builder.saveIP();
4124 Builder.restoreIP(AllocaIP);
4125
4126 Value *ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4127 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4128 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4129
4130 Builder.restoreIP(OldIP);
4131
4132 // Get source descriptor from the reduce list
4133 Value *ReduceListVal =
4134 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4135 Value *SrcElementPtrPtr =
4136 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4137 {ConstantInt::get(IndexTy, 0),
4138 ConstantInt::get(IndexTy, En.index())});
4139 Value *SrcDescriptorAddr =
4140 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4141
4142 // Copy descriptor from source and update base_ptr to global buffer data
4143 InsertPointOrErrorTy GenResult =
4144 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4145 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4146 if (!GenResult)
4147 return GenResult.takeError();
4148
4149 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4150 } else {
4151 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4152 }
4153 }
4154
4155 // Call reduce_function(ReduceList, GlobalReduceList)
4156 Value *ReduceList =
4157 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4158 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4159 ->addFnAttr(Attribute::NoUnwind);
4160 Builder.CreateRetVoid();
4161 Builder.restoreIP(OldIP);
4162 return GtLRFunc;
4163}
4164
4165std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4166 std::string Suffix =
4167 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4168 return (Name + Suffix).str();
4169}
4170
/// Build the outlined "reduction function" void(ptr, ptr): it receives two
/// type-erased arrays of pointers (LHS partials and RHS partials) and folds
/// each RHS element into the corresponding LHS element using the per-variable
/// reduction generator callbacks.
Expected<Function *> OpenMPIRBuilder::createReductionFunction(
    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
    // NOTE(review): a parameter line (the IsByRef / ReductionGenCBKind
    // parameters used below) appears to be missing from this rendering of
    // the source — verify against upstream.
    AttributeList FuncAttrs) {
  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                   {Builder.getPtrTy(), Builder.getPtrTy()},
                                   /* IsVarArg */ false);
  std::string Name = getReductionFuncName(ReducerName);
  Function *ReductionFunc =
      // NOTE(review): the Function::Create(...) initializer line appears to
      // be missing from this rendering of the source — verify upstream.
  ReductionFunc->setAttributes(FuncAttrs);
  // Both array arguments are always valid pointers.
  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
  BasicBlock *EntryBB =
      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
  Builder.SetInsertPoint(EntryBB);

  // Need to alloca memory here and deal with the pointers before getting
  // LHS/RHS pointers out
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;
  Argument *Arg0 = ReductionFunc->getArg(0);
  Argument *Arg1 = ReductionFunc->getArg(1);
  Type *Arg0Type = Arg0->getType();
  Type *Arg1Type = Arg1->getType();

  // Spill the incoming array pointers through allocas and address-space
  // casts so subsequent loads work in the generic address space.
  Value *LHSAlloca =
      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
  Value *RHSAlloca =
      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
  Builder.CreateStore(Arg0, LHSAddrCast);
  Builder.CreateStore(Arg1, RHSAddrCast);
  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);

  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  SmallVector<Value *> LHSPtrs, RHSPtrs;
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // Pull the En.index()-th element pointer out of each array and cast it
    // to the matching variable pointer type.
    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, RHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType(),
        RHSI8Ptr->getName() + ".ascast");

    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, LHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");

    // NOTE(review): the opening `if (...) {` line (presumably selecting the
    // Clang-callback code path) appears to be missing from this rendering.
      // Clang-style callbacks emit the combining code themselves later; just
      // remember the element pointers for the fixup loop below.
      LHSPtrs.emplace_back(LHSPtr);
      RHSPtrs.emplace_back(RHSPtr);
    } else {
      Value *LHS = LHSPtr;
      Value *RHS = RHSPtr;

      // By-value reductions combine loaded values; by-ref reductions hand the
      // element pointers directly to the generator.
      if (!IsByRef.empty() && !IsByRef[En.index()]) {
        LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
        RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
      }

      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      // A callback may clear the insertion point to signal early bail-out.
      if (!Builder.GetInsertBlock())
        return ReductionFunc;

      Builder.restoreIP(*AfterIP);

      // For by-ref reductions the store back happens inside the generated
      // reduction region instead.
      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, LHSPtr);
    }
  }

  // NOTE(review): the guard line introducing this fixup loop (presumably
  // testing for the Clang callback kind) appears to be missing here.
  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *LHSFixupPtr, *RHSFixupPtr;
    Builder.restoreIP(RI.ReductionGenClang(
        Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));

    // Fix the CallBack code generated to use the correct Values for the LHS
    // and RHS
    LHSFixupPtr->replaceUsesWithIf(
        LHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
    RHSFixupPtr->replaceUsesWithIf(
        RHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
  }

  Builder.CreateRetVoid();
  // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
  // to the entry block (this is done for higher opt levels by later passes in
  // the pipeline). This has caused issues because non-entry `alloca`s force the
  // function to use dynamic stack allocations and we might run out of scratch
  // memory.
  hoistNonEntryAllocasToEntryBlock(ReductionFunc);

  return ReductionFunc;
}
4290
4291static void
4293 bool IsGPU) {
4294 for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
4295 (void)RI;
4296 assert(RI.Variable && "expected non-null variable");
4297 assert(RI.PrivateVariable && "expected non-null private variable");
4298 assert((RI.ReductionGen || RI.ReductionGenClang) &&
4299 "expected non-null reduction generator callback");
4300 if (!IsGPU) {
4301 assert(
4302 RI.Variable->getType() == RI.PrivateVariable->getType() &&
4303 "expected variables and their private equivalents to have the same "
4304 "type");
4305 }
4306 assert(RI.Variable->getType()->isPointerTy() &&
4307 "expected variables to be pointers");
4308 }
4309}
4310
// NOTE(review): the function header line (return type and qualified name,
// presumably `OpenMPIRBuilder::createReductionsGPU(`) appears to be missing
// from this rendering of the source — verify against upstream.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
    ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
    unsigned ReductionBufNum, Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();
  Builder.restoreIP(CodeGenIP);
  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
  LLVMContext &Ctx = M.getContext();

  // Source location for the ident struct
  if (!SrcLocInfo) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }

  // Nothing to reduce: keep the current insertion point unchanged.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  BasicBlock *ContinuationBlock = nullptr;
  // NOTE(review): the opening `if (...) {` line guarding this block-splitting
  // code appears to be missing from this rendering.
    // Copied code from createReductions
    BasicBlock *InsertBlock = Loc.IP.getBlock();
    ContinuationBlock =
        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
    InsertBlock->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
  }

  // Clone the current function's attributes (minus OptimizeNone) onto the
  // outlined helper functions created below.
  Function *CurFunc = Builder.GetInsertBlock()->getParent();
  AttributeList FuncAttrs;
  AttrBuilder AttrBldr(Ctx);
  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
    AttrBldr.addAttribute(Attr);
  AttrBldr.removeAttribute(Attribute::OptimizeNone);
  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);

  CodeGenIP = Builder.saveIP();
  Expected<Function *> ReductionResult = createReductionFunction(
      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
      ReductionGenCBKind, FuncAttrs);
  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;
  Builder.restoreIP(CodeGenIP);

  // Set the grid value in the config needed for lowering later on
  if (GridValue.has_value())
    Config.setGridValue(GridValue.value());
  else
    Config.setGridValue(getGridValue(T, ReductionFunc));

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  Value *Res;

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = ReductionInfos.size();
  Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
  Type *FuncPtrTy =
      Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  // The array itself is emitted at the alloca insertion point; the stores
  // into it at the code-generation point.
  CodeGenIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
  Builder.restoreIP(CodeGenIP);
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});

    Value *PrivateVar = RI.PrivateVariable;
    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
    // For by-ref reductions the list holds the loaded data pointer rather
    // than the private variable slot itself.
    if (IsByRefElem)
      PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);

    Value *CastElem =
        Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
    Builder.CreateStore(CastElem, ElemPtr);
  }
  CodeGenIP = Builder.saveIP();
  Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
      ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);

  if (!SarFunc)
    return SarFunc.takeError();

  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
  if (!CopyResult)
    return CopyResult.takeError();
  Function *WcFunc = *CopyResult;
  Builder.restoreIP(CodeGenIP);

  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  // NOTE: ReductionDataSize is passed as the reduce_data_size
  // argument to __kmpc_nvptx_{parallel,teams}_reduce_nowait_v2, but
  // the runtime implementations do not currently use it. The teams
  // runtime reads ReductionDataSize from KernelEnvironmentTy instead
  // (set separately via TargetKernelDefaultAttrs). It is computed
  // here conservatively as max(element sizes) * N rather than the
  // exact sum, which over-calculates the size for mixed reduction
  // types but is harmless given the argument is unused.
  // TODO: Consider dropping this computation if the runtime API is
  // ever revised to remove the unused parameter.
  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    // Use ByRefElementType for by-ref reductions so that MaxDataSize matches
    // the actual data size stored in the global reduction buffer, consistent
    // with the ReductionsBufferTy struct used for GEP offsets below.
    Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
                           ? En.value().ByRefElementType
                           : En.value().ElementType;
    auto Size = M.getDataLayout().getTypeStoreSize(RedTypeArg);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    ReductionTypeArgs.emplace_back(RedTypeArg);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    // Parallel reduction: shuffle-and-reduce plus inter-warp copy helpers.
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
    // NOTE(review): the line declaring the runtime-function pointer used as
    // `Pv2Ptr` below appears to be missing from this rendering.
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(Pv2Ptr, Args);
  } else {
    // Teams reduction: additionally needs the global-buffer copy/reduce
    // helpers and the fixed reduction buffer from the runtime.
    CodeGenIP = Builder.saveIP();
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);

    Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!LtGCFunc)
      return LtGCFunc.takeError();

    Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!LtGRFunc)
      return LtGRFunc.takeError();

    Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!GtLCFunc)
      return GtLCFunc.takeError();

    Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!GtLRFunc)
      return GtLRFunc.takeError();

    Builder.restoreIP(CodeGenIP);

    Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
        RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,
                      Builder.getInt32(ReductionBufNum),
                      ReductionDataSize,
                      RL,
                      *SarFunc,
                      WcFunc,
                      *LtGCFunc,
                      *LtGRFunc,
                      *GtLCFunc,
                      *GtLRFunc};

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
  }

  // 5. Build if (res == 1)
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  emitBlock(ThenBB, CurFunc);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // NOTE(review): the line declaring `ValueType` (used below, presumably
    // `Type *ValueType = RI.ElementType;`) appears to be missing here.
    Value *RedValue = RI.Variable;
    Value *RHS =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

    // NOTE(review): the opening `if (...) {` line selecting the Clang
    // callback path appears to be missing from this rendering.
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      // Fix the CallBack code generated to use the correct Values for the LHS
      // and RHS
      LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
    } else {
      // By-value reductions load both operands; by-ref reductions load only
      // the private value and store back inside the reduction region.
      if (IsByRef.empty() || !IsByRef[En.index()]) {
        RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                      "red.value." + Twine(En.index()));
      }
      Value *PrivateRedValue = Builder.CreateLoad(
          ValueType, RHS, "red.private.value" + Twine(En.index()));
      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);

      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, RI.Variable);
    }
  }
  emitBlock(ExitBB, CurFunc);
  if (ContinuationBlock) {
    Builder.CreateBr(ContinuationBlock);
    Builder.SetInsertPoint(ContinuationBlock);
  }
  Config.setEmitLLVMUsed();

  return Builder.saveIP();
}
4566
4568 Type *VoidTy = Type::getVoidTy(M.getContext());
4569 Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
4570 auto *FuncTy =
4571 FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
4573 ".omp.reduction.func", &M);
4574}
4575
4577 Function *ReductionFunc,
4579 IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
4580 Module *Module = ReductionFunc->getParent();
4581 BasicBlock *ReductionFuncBlock =
4582 BasicBlock::Create(Module->getContext(), "", ReductionFunc);
4583 Builder.SetInsertPoint(ReductionFuncBlock);
4584 Value *LHSArrayPtr = nullptr;
4585 Value *RHSArrayPtr = nullptr;
4586 if (IsGPU) {
4587 // Need to alloca memory here and deal with the pointers before getting
4588 // LHS/RHS pointers out
4589 //
4590 Argument *Arg0 = ReductionFunc->getArg(0);
4591 Argument *Arg1 = ReductionFunc->getArg(1);
4592 Type *Arg0Type = Arg0->getType();
4593 Type *Arg1Type = Arg1->getType();
4594
4595 Value *LHSAlloca =
4596 Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
4597 Value *RHSAlloca =
4598 Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
4599 Value *LHSAddrCast =
4600 Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
4601 Value *RHSAddrCast =
4602 Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
4603 Builder.CreateStore(Arg0, LHSAddrCast);
4604 Builder.CreateStore(Arg1, RHSAddrCast);
4605 LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
4606 RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
4607 } else {
4608 LHSArrayPtr = ReductionFunc->getArg(0);
4609 RHSArrayPtr = ReductionFunc->getArg(1);
4610 }
4611
4612 unsigned NumReductions = ReductionInfos.size();
4613 Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
4614
4615 for (auto En : enumerate(ReductionInfos)) {
4616 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
4617 Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4618 RedArrayTy, LHSArrayPtr, 0, En.index());
4619 Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
4620 Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4621 LHSI8Ptr, RI.Variable->getType());
4622 Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
4623 Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
4624 RedArrayTy, RHSArrayPtr, 0, En.index());
4625 Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
4626 Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
4627 RHSI8Ptr, RI.PrivateVariable->getType());
4628 Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
4629 Value *Reduced;
4631 RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
4632 if (!AfterIP)
4633 return AfterIP.takeError();
4634
4635 Builder.restoreIP(*AfterIP);
4636 // TODO: Consider flagging an error.
4637 if (!Builder.GetInsertBlock())
4638 return Error::success();
4639
4640 // store is inside of the reduction region when using by-ref
4641 if (!IsByRef[En.index()])
4642 Builder.CreateStore(Reduced, LHSPtr);
4643 }
4644 Builder.CreateRetVoid();
4645 return Error::success();
4646}
4647
// NOTE(review): the function header line (return type and qualified name,
// presumably `OpenMPIRBuilder::createReductions(`) appears to be missing
// from this rendering of the source — verify against upstream.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
    bool IsNoWait, bool IsTeamsReduction) {
  assert(ReductionInfos.size() == IsByRef.size());
  // GPU targets use an entirely separate lowering path.
  if (Config.isGPU())
    return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
                               IsByRef, IsNoWait, IsTeamsReduction);

  checkReductionInfos(ReductionInfos, /*IsGPU*/ false);

  if (!updateToLocation(Loc))
    return InsertPointTy();

  // Nothing to reduce.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  // Split off everything after the insertion point into a continuation block
  // that runs once the reduction machinery is done.
  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate array of type-erased pointers to private reduction
  // values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
  Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
  }

  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Function *Func = Builder.GetInsertBlock()->getParent();
  Module *Module = Func->getParent();
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  // The atomic fallback path is only usable if every reduction provides an
  // atomic generator.
  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
    return RI.AtomicReductionGen;
  });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
  // NOTE(review): the line declaring the runtime-function pointer used as
  // `ReduceFunc` below appears to be missing from this rendering.
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      createRuntimeFunctionCall(ReduceFunc,
                                {Ident, ThreadId, NumVariables, RedArraySize,
                                 RedArray, ReductionFunc, Lock},
                                "reduce");

  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

  // Populate the non-atomic reduction using the elementwise reduction function.
  // This loads the elements from the global and private variables and reduces
  // them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // NOTE(review): the line declaring `ValueType` (used below, presumably
    // `Type *ValueType = RI.ElementType;`) appears to be missing here.
    // We have one less load for by-ref case because that load is now inside of
    // the reduction region
    Value *RedValue = RI.Variable;
    if (!IsByRef[En.index()]) {
      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                    "red.value." + Twine(En.index()));
    }
    Value *PrivateRedValue =
        Builder.CreateLoad(ValueType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    Value *Reduced;
    InsertPointOrErrorTy AfterIP =
        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);

    // A callback may clear the insertion point to signal early bail-out.
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // for by-ref case, the load is inside of the reduction region
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);

  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
    for (const ReductionInfo &RI : ReductionInfos) {
      // NOTE(review): the line declaring `AfterIP` via RI.AtomicReductionGen
      // appears to be missing from this rendering.
          Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
    // The runtime only selects case 2 when the ATOMIC_REDUCE flag was set,
    // so this path is unreachable when atomics are unavailable.
    Builder.CreateUnreachable();
  }

  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
                                        IsByRef, /*isGPU=*/false);
  if (Err)
    return Err;

  if (!Builder.GetInsertBlock())
    return InsertPointTy();

  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
4799
4802 BodyGenCallbackTy BodyGenCB,
4803 FinalizeCallbackTy FiniCB) {
4804 if (!updateToLocation(Loc))
4805 return Loc.IP;
4806
4807 Directive OMPD = Directive::OMPD_master;
4808 uint32_t SrcLocStrSize;
4809 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4810 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4811 Value *ThreadId = getOrCreateThreadID(Ident);
4812 Value *Args[] = {Ident, ThreadId};
4813
4814 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
4815 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4816
4817 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
4818 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
4819
4820 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4821 /*Conditional*/ true, /*hasFinalize*/ true);
4822}
4823
4826 BodyGenCallbackTy BodyGenCB,
4827 FinalizeCallbackTy FiniCB, Value *Filter) {
4828 if (!updateToLocation(Loc))
4829 return Loc.IP;
4830
4831 Directive OMPD = Directive::OMPD_masked;
4832 uint32_t SrcLocStrSize;
4833 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
4834 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
4835 Value *ThreadId = getOrCreateThreadID(Ident);
4836 Value *Args[] = {Ident, ThreadId, Filter};
4837 Value *ArgsEnd[] = {Ident, ThreadId};
4838
4839 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
4840 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
4841
4842 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
4843 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);
4844
4845 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
4846 /*Conditional*/ true, /*hasFinalize*/ true);
4847}
4848
4850 llvm::FunctionCallee Callee,
4852 const llvm::Twine &Name) {
4853 llvm::CallInst *Call = Builder.CreateCall(
4854 Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
4855 Call->setDoesNotThrow();
4856 return Call;
4857}
4858
// Expects the input basic block to be dominated by BeforeScanBB.
// Once the scan directive is encountered, the code after it must be
// dominated by AfterScanBB. The scan directive splits the code sequence
// into an input phase and a scan phase. Based on whether the inclusive or
// exclusive clause is used on the directive, and on whether the input loop
// or the scan loop is being lowered, it adds branches to the input and scan
// phases. The first scan loop is the input loop and the second is the scan
// loop. The generated code currently handles only inclusive scans.
// NOTE(review): the function header line (return type and qualified name,
// presumably `OpenMPIRBuilder::createScan(`) appears to be missing from this
// rendering of the source — verify against upstream.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
    bool IsInclusive, ScanInfo *ScanRedInfo) {
  // On the first (input) scan loop, set up the per-variable scan buffers.
  if (ScanRedInfo->OMPFirstScanLoop) {
    llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
                                                    ScanVarsType, ScanRedInfo);
    if (Err)
      return Err;
  }
  if (!updateToLocation(Loc))
    return Loc.IP;

  llvm::Value *IV = ScanRedInfo->IV;

  if (ScanRedInfo->OMPFirstScanLoop) {
    // Emit buffer[i] = red; at the end of the input phase.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);

      Builder.CreateStore(Src, Val);
    }
  }
  Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
  emitBlock(ScanRedInfo->OMPScanDispatch,
            Builder.GetInsertBlock()->getParent());

  if (!ScanRedInfo->OMPFirstScanLoop) {
    IV = ScanRedInfo->IV;
    // Emit red = buffer[i]; at the entrance to the scan phase.
    // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *SrcPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
      Builder.CreateStore(Src, ScanVars[i]);
    }
  }

  // TODO: Update it to CreateBr and remove dead blocks
  // The condition is a constant true; the successor order (before-scan vs
  // after-scan block) depends on which phase is lowered and on inclusivity.
  llvm::Value *CmpI = Builder.getInt1(true);
  if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
                         ScanRedInfo->OMPAfterScanBlock);
  } else {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
                         ScanRedInfo->OMPBeforeScanBlock);
  }
  emitBlock(ScanRedInfo->OMPAfterScanBlock,
            Builder.GetInsertBlock()->getParent());
  Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
  return Builder.saveIP();
}
4927
/// Set up the temporary buffers needed by a scan-based directive: allocate a
/// pointer slot per scan variable at the alloca IP, then — inside a masked
/// region executed by thread 0 only — malloc a buffer of Span+1 elements per
/// variable, followed by a barrier so all threads observe the buffers.
Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
    InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
    ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {

  Builder.restoreIP(AllocaIP);
  // Create the shared pointer at alloca IP.
  for (size_t i = 0; i < ScanVars.size(); i++) {
    llvm::Value *BuffPtr =
        Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
    (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
  }

  // Allocate temporary buffer by master thread
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    // One extra slot (Span + 1) beyond the iteration span.
    Value *AllocSpan =
        Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Type *IntPtrTy = Builder.getInt32Ty();
      Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
      Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
      Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
                                         AllocSpan, nullptr, "arr");
      Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
  // Filter value 0: only the primary thread runs the allocation body.
  llvm::Value *FilterVal = Builder.getInt32(0);
  // NOTE(review): the declaration line for `AfterIP` (presumably
  // `InsertPointOrErrorTy AfterIP =`) appears to be missing here.
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier so non-primary threads wait for the buffers to exist.
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);

  return Error::success();
}
4978
/// Finalize a scan-based directive: inside a masked region run by thread 0
/// only, copy each reduction's final value (buffer[Span]) back to the
/// original variable and free the temporary buffer, then emit a barrier.
Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
    ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    for (ReductionInfo RedInfo : ReductionInfos) {
      Value *PrivateVar = RedInfo.PrivateVariable;
      Value *OrigVar = RedInfo.Variable;
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);

      // The final scan value lives in the extra slot at index Span.
      Type *SrcTy = RedInfo.ElementType;
      Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
                                             "arrayOffset");
      Value *Src = Builder.CreateLoad(SrcTy, Val);

      Builder.CreateStore(Src, OrigVar);
      Builder.CreateFree(Buff);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Emit before the block's terminator if it already has one.
  if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
    Builder.SetInsertPoint(TI);
  else
    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);

  // Filter value 0: only the primary thread runs the copy-back/free body.
  llvm::Value *FilterVal = Builder.getInt32(0);
  // NOTE(review): the declaration line for `AfterIP` (presumably
  // `InsertPointOrErrorTy AfterIP =`) appears to be missing here.
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // Barrier so other threads do not race ahead of the copy-back/free.
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  return Error::success();
}
5025
    const LocationDescription &Loc,
    ScanInfo *ScanRedInfo) {

  if (!updateToLocation(Loc))
    return Loc.IP;
  // Body of the masked region: a logarithmic (parallel-prefix style) pass
  // over the scan buffers. The outer loop runs ceil(log2(Span)) times; the
  // inner loop combines element i with element i - 2^k for every reduction.
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    Function *CurFn = Builder.GetInsertBlock()->getParent();
    // for (int k = 0; k <= ceil(log2(n)); ++k)
    llvm::BasicBlock *LoopBB =
        BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
    llvm::BasicBlock *ExitBB =
        splitBB(Builder, false, "omp.outer.log.scan.exit");
        Builder.GetInsertBlock()->getModule(),
        (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
    llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
    // Compute ceil(log2(Span)) via the log2/ceil intrinsics on double and
    // truncate back to i32; this bounds the number of outer iterations.
    llvm::Value *Arg =
        Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
    llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
        Builder.GetInsertBlock()->getModule(),
        (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
    LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
    LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
    llvm::Value *NMin1 = Builder.CreateNUWSub(
        ScanRedInfo->Span,
        llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
    Builder.SetInsertPoint(InputBB);
    Builder.CreateBr(LoopBB);
    emitBlock(LoopBB, CurFn);
    Builder.SetInsertPoint(LoopBB);

    // Outer-loop induction: Counter counts iterations, Pow2K tracks 2^k.
    PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
    // size pow2k = 1;
    PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
    Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
                         InputBB);
    Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
                       InputBB);
    // for (size i = n - 1; i >= 2 ^ k; --i)
    //   tmp[i] op= tmp[i-pow2k];
    llvm::BasicBlock *InnerLoopBB =
        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
    llvm::BasicBlock *InnerExitBB =
        BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
    llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
    emitBlock(InnerLoopBB, CurFn);
    Builder.SetInsertPoint(InnerLoopBB);
    // Inner-loop induction variable, counting down from n-1.
    PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
    IVal->addIncoming(NMin1, LoopBB);
    for (ReductionInfo RedInfo : ReductionInfos) {
      Value *ReductionVal = RedInfo.PrivateVariable;
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = RedInfo.ElementType;
      // Buffer slots are offset by one relative to the loop index.
      Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
      Value *LHSPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
      Value *RHSPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
      Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
      Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
      llvm::Value *Result;
      // Emit the user-provided reduction operation: Result = LHS op RHS.
      InsertPointOrErrorTy AfterIP =
          RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.CreateStore(Result, LHSPtr);
    }
    llvm::Value *NextIVal = Builder.CreateNUWSub(
        IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
    IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
    CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
    Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
    emitBlock(InnerExitBB, CurFn);
    llvm::Value *Next = Builder.CreateNUWAdd(
        Counter, llvm::ConstantInt::get(Counter->getType(), 1));
    Counter->addIncoming(Next, Builder.GetInsertBlock());
    // pow2k <<= 1;
    llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
    Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
    llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
    Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
    Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
    return Error::success();
  };

  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // The scan pass itself is sequential over the buffer; run it on the
  // primary thread only (filter 0) and synchronize with a barrier.
  llvm::Value *FilterVal = Builder.getInt32(0);
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  // Copy the final values back into the reduction variables and free the
  // scan buffers.
  Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
  if (Err)
    return Err;

  return AfterIP;
}
5141
5142Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5143 llvm::function_ref<Error()> InputLoopGen,
5144 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5145 ScanInfo *ScanRedInfo) {
5146
5147 {
5148 // Emit loop with input phase:
5149 // for (i: 0..<num_iters>) {
5150 // <input phase>;
5151 // buffer[i] = red;
5152 // }
5153 ScanRedInfo->OMPFirstScanLoop = true;
5154 Error Err = InputLoopGen();
5155 if (Err)
5156 return Err;
5157 }
5158 {
5159 // Emit loop with scan phase:
5160 // for (i: 0..<num_iters>) {
5161 // red = buffer[i];
5162 // <scan phase>;
5163 // }
5164 ScanRedInfo->OMPFirstScanLoop = false;
5165 Error Err = ScanLoopGen(Builder.saveIP());
5166 if (Err)
5167 return Err;
5168 }
5169 return Error::success();
5170}
5171
5172void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5173 Function *Fun = Builder.GetInsertBlock()->getParent();
5174 ScanRedInfo->OMPScanDispatch =
5175 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5176 ScanRedInfo->OMPAfterScanBlock =
5177 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5178 ScanRedInfo->OMPBeforeScanBlock =
5179 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5180 ScanRedInfo->OMPScanLoopExit =
5181 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5182}
    DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
    BasicBlock *PostInsertBefore, const Twine &Name) {
  Module *M = F->getParent();
  LLVMContext &Ctx = M->getContext();
  // The induction variable uses the same integer type as the trip count.
  Type *IndVarTy = TripCount->getType();

  // Create the basic block structure:
  //   preheader -> header -> cond -> body -> inc -> (back to header)
  //                                  cond -> exit -> after
  BasicBlock *Preheader =
      BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
  BasicBlock *Header =
      BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
  BasicBlock *Cond =
      BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
  BasicBlock *Body =
      BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
  BasicBlock *Latch =
      BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
  BasicBlock *Exit =
      BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
  BasicBlock *After =
      BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);

  // Use specified DebugLoc for new instructions.
  Builder.SetCurrentDebugLocation(DL);

  Builder.SetInsertPoint(Preheader);
  Builder.CreateBr(Header);

  Builder.SetInsertPoint(Header);
  // Canonical IV starts at 0 and is stepped by 1 in the latch.
  PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
  IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
  Builder.CreateBr(Cond);

  Builder.SetInsertPoint(Cond);
  // Unsigned comparison: canonical loops always interpret the IV as unsigned.
  Value *Cmp =
      Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
  Builder.CreateCondBr(Cmp, Body, Exit);

  Builder.SetInsertPoint(Body);
  Builder.CreateBr(Latch);

  Builder.SetInsertPoint(Latch);
  // NUW: the IV never wraps because it is bounded by the trip count.
  Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
                                  "omp_" + Name + ".next", /*HasNUW=*/true);
  Builder.CreateBr(Header);
  IndVarPHI->addIncoming(Next, Latch);

  Builder.SetInsertPoint(Exit);
  Builder.CreateBr(After);

  // Remember and return the canonical control flow. LoopInfos owns the
  // storage so the returned pointer stays valid for the builder's lifetime.
  LoopInfos.emplace_front();
  CanonicalLoopInfo *CL = &LoopInfos.front();

  CL->Header = Header;
  CL->Cond = Cond;
  CL->Latch = Latch;
  CL->Exit = Exit;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
5248
    LoopBodyGenCallbackTy BodyGenCB,
    Value *TripCount, const Twine &Name) {
  BasicBlock *BB = Loc.IP.getBlock();
  // New skeleton blocks are inserted right after the block containing the
  // insertion point.
  BasicBlock *NextBB = BB->getNextNode();

  CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
                                             NextBB, NextBB, Name);
  BasicBlock *After = CL->getAfter();

  // If location is not set, don't connect the loop.
  if (updateToLocation(Loc)) {
    // Split the loop at the insertion point: Branch to the preheader and move
    // every following instruction to after the loop (the After BB). Also, the
    // new successor is the loop's after block.
    spliceBB(Builder, After, /*CreateBranch=*/false);
    Builder.CreateBr(CL->getPreheader());
  }

  // Emit the body content. We do it after connecting the loop to the CFG to
  // avoid that the callback encounters degenerate BBs.
  if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
    return Err;

#ifndef NDEBUG
  CL->assertOK();
#endif
  return CL;
}
5279
  // Allocate a fresh ScanInfo in builder-owned storage (emplace_front keeps
  // previously handed-out pointers valid) and return a pointer to it.
  ScanInfos.emplace_front();
  ScanInfo *Result = &ScanInfos.front();
  return Result;
}
5285
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
  updateToLocation(ComputeLoc);

      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
  // Span (the number of iterations) sizes the scan buffers and bounds the
  // logarithmic scan pass.
  ScanRedInfo->Span = TripCount;
  // OMPScanInit is where the buffer-allocation code will be emitted.
  ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);

  // Shared body generator for both passes: reroute the loop body through the
  // scan dispatch/before/after blocks so the scan directive can split it.
  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    ScanRedInfo->IV = IV;
    createScanBBs(ScanRedInfo);
    BasicBlock *InputBlock = Builder.GetInsertBlock();
    Instruction *Terminator = InputBlock->getTerminator();
    assert(Terminator->getNumSuccessors() == 1);
    BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
    // Enter the body through the dispatch block instead of falling through.
    Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
    emitBlock(ScanRedInfo->OMPBeforeScanBlock,
              Builder.GetInsertBlock()->getParent());
    Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
    emitBlock(ScanRedInfo->OMPScanLoopExit,
              Builder.GetInsertBlock()->getParent());
    Builder.CreateBr(ContinueBlock);
    // User code is generated starting in the before-scan block.
    Builder.SetInsertPoint(
        ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
    return BodyGenCB(Builder.saveIP(), IV);
  };

  // First pass: the input-phase loop.
  const auto &&InputLoopGen = [&]() -> Error {
        Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
        ComputeIP, Name, true, ScanRedInfo);
    if (!LoopInfo)
      return LoopInfo.takeError();
    Result.push_back(*LoopInfo);
    Builder.restoreIP((*LoopInfo)->getAfterIP());
    return Error::success();
  };
  // Second pass: the scan-phase loop; also records where finalization code
  // (buffer copy-out and free) must be emitted.
  const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
        createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
                            InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
    if (!LoopInfo)
      return LoopInfo.takeError();
    Result.push_back(*LoopInfo);
    Builder.restoreIP((*LoopInfo)->getAfterIP());
    ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
    return Error::success();
  };
  Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
  if (Err)
    return Err;
  return Result;
}
5349
    const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
    bool IsSigned, bool InclusiveStop, const Twine &Name) {

  // Consider the following difficulties (assuming 8-bit signed integers):
  // * Adding \p Step to the loop counter which passes \p Stop may overflow:
  //     DO I = 1, 100, 50
  // * A \p Step of INT_MIN cannot be normalized to a positive direction:
  //     DO I = 100, 0, -128

  // Start, Stop and Step must be of the same integer type.
  auto *IndVarTy = cast<IntegerType>(Start->getType());
  assert(IndVarTy == Stop->getType() && "Stop type mismatch");
  assert(IndVarTy == Step->getType() && "Step type mismatch");

  ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
  ConstantInt *One = ConstantInt::get(IndVarTy, 1);

  // Like Step, but always positive.
  Value *Incr = Step;

  // Distance between Start and Stop; always positive.
  Value *Span;

  // Condition whether there are no iterations are executed at all, e.g. because
  // UB < LB.
  Value *ZeroCmp;

  if (IsSigned) {
    // Ensure that increment is positive. If not, negate and invert LB and UB.
    Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
    Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
    Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
    Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
    // NSW: UB >= LB is guaranteed by the select above when iterating at all.
    Span = Builder.CreateSub(UB, LB, "", false, true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
  } else {
    Span = Builder.CreateSub(Stop, Start, "", true);
    ZeroCmp = Builder.CreateICmp(
        InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
  }

  Value *CountIfLooping;
  if (InclusiveStop) {
    CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
  } else {
    // Avoid incrementing past stop since it could overflow.
    Value *CountIfTwo = Builder.CreateAdd(
        Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
    Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
    CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
  }

  // Zero iterations wins over any computed count.
  return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
                              "omp_" + Name + ".tripcount");
}
5409
    Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
    InsertPointTy ComputeIP, const Twine &Name, bool InScan,
    ScanInfo *ScanRedInfo) {
  LocationDescription ComputeLoc =
      ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;

      ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);

  // Wrap the user callback: map the canonical 0..tripcount IV back to the
  // user-visible induction variable Start + IV * Step.
  auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
    Builder.restoreIP(CodeGenIP);
    Value *Span = Builder.CreateMul(IV, Step);
    Value *IndVar = Builder.CreateAdd(Span, Start);
    // For scan loops, publish the user-level IV for the scan lowering.
    if (InScan)
      ScanRedInfo->IV = IndVar;
    return BodyGenCB(Builder.saveIP(), IndVar);
  };
  // If the trip count was computed at a separate ComputeIP, the loop itself
  // still goes at the original location; otherwise continue where the
  // trip-count computation left the builder.
  LocationDescription LoopLoc =
      ComputeIP.isSet()
          ? Loc
          : LocationDescription(Builder.saveIP(),
                                Builder.getCurrentDebugLocation());
  return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
}
5436
// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling for composite `distribute parallel for` depending on
// `type`. Only i32 and i64 are supported by the runtime. Always interpret
// integers as unsigned similarly to CanonicalLoopInfo.
static FunctionCallee
                                OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  // The _4u/_8u suffixes select the unsigned 32/64-bit runtime entry points.
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
5453
// Returns an LLVM function to call for initializing loop bounds using OpenMP
// static scheduling depending on `type`. Only i32 and i64 are supported by the
// runtime. Always interpret integers as unsigned similarly to
// CanonicalLoopInfo.
                                OpenMPIRBuilder &OMPBuilder) {
  unsigned Bitwidth = Ty->getIntegerBitWidth();
  // The _4u/_8u suffixes select the unsigned 32/64-bit runtime entry points.
  if (Bitwidth == 32)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
  if (Bitwidth == 64)
    return OMPBuilder.getOrCreateRuntimeFunction(
        M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
  llvm_unreachable("unknown OpenMP loop iterator bitwidth");
}
5469
// Lower a canonical loop to an OpenMP statically-scheduled workshare loop:
// allocate the bound variables, call the runtime "init" function in the
// preheader, rebase the IV on the thread-local lower bound, call "fini" in
// the exit block, and optionally emit a trailing barrier. Returns the
// insertion point after the (now invalidated) loop.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
    OMPScheduleType DistScheduleSchedType) {
  assert(CLI->isValid() && "Requires a valid canonical loop");
  assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
         "Require dedicated allocate IP");

  // Set up the source location value for OpenMP runtime.
  Builder.restoreIP(CLI->getPreheaderIP());
  Builder.SetCurrentDebugLocation(DL);

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  // Tag the ident with the kind of worksharing construct being lowered.
  switch (LoopType) {
  case WorksharingLoopType::ForStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_LOOP;
    break;
  case WorksharingLoopType::DistributeStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
    break;
  case WorksharingLoopType::DistributeForStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
    break;
  }
  Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  // Composite distribute-parallel-for uses the _dist_ init entry point,
  // which takes an extra dist-upper-bound out-parameter.
  FunctionCallee StaticInit =
      LoopType == WorksharingLoopType::DistributeForStaticLoop
          ? getKmpcDistForStaticInitForType(IVTy, M, *this)
          : getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());

  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
  CLI->setLastIter(PLastIter);

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  Value *ThreadNum =
      getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));

  OMPScheduleType SchedType =
      (LoopType == WorksharingLoopType::DistributeStaticLoop)
          ? OMPScheduleType::OrderedDistribute
  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(SchedType));

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
                        PUpperBound, IVTy, PStride, One, Zero, StaticInit,
                        this](Value *SchedulingType, auto &Builder) {
    SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
                                   PLowerBound, PUpperBound});
    // The _dist_ variant takes an additional dist-upper-bound pointer
    // between pupper and pstride.
    if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
      Value *PDistUpperBound =
          Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
      Args.push_back(PDistUpperBound);
    }
    Args.append({PStride, One, Zero});
    createRuntimeFunctionCall(StaticInit, Args);
  };
  BuildInitCall(SchedulingType, Builder);
  if (HasDistSchedule &&
      LoopType != WorksharingLoopType::DistributeStaticLoop) {
    Constant *DistScheduleSchedType = ConstantInt::get(
        I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
    // We want to emit a second init function call for the dist_schedule clause
    // to the Distribute construct. This should only be done however if a
    // Workshare Loop is nested within a Distribute Construct
    BuildInitCall(DistScheduleSchedType, Builder);
  }
  // "init" returned this thread's chunk as [LowerBound, InclusiveUpperBound];
  // shrink the canonical loop's trip count accordingly.
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  CLI->setTripCount(TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.

  CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
    Builder.SetInsertPoint(CLI->getBody(),
                           CLI->getBody()->getFirstInsertionPt());
    Builder.SetCurrentDebugLocation(DL);
    // Rebase the 0-based IV onto this thread's lower bound.
    return Builder.CreateAdd(OldIV, LowerBound);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(CLI->getExit(),
                         CLI->getExit()->getTerminator()->getIterator());
  createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested.
  if (NeedsBarrier) {
    InsertPointOrErrorTy BarrierIP =
            omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
            /* CheckCancelFlag */ false);
    if (!BarrierIP)
      return BarrierIP.takeError();
  }

  // The loop no longer satisfies the canonical invariants after the rewrite.
  InsertPointTy AfterIP = CLI->getAfterIP();
  CLI->invalidate();

  return AfterIP;
}
5602
5603static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5604 LoopInfo &LI);
5605static void addLoopMetadata(CanonicalLoopInfo *Loop,
5606 ArrayRef<Metadata *> Properties);
5607
    LLVMContext &Ctx, Loop *Loop,
    SmallVector<Metadata *> &LoopMDList) {
  SmallSet<BasicBlock *, 8> Reachable;

  // Get the basic blocks from the loop in which memref instructions
  // can be found.
  // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
  // preferably without running any passes.
  for (BasicBlock *Block : Loop->getBlocks()) {
    // Cond and Header hold the loop control, not user memory accesses;
    // they are excluded from the access group.
    if (Block == CLI->getCond() || Block == CLI->getHeader())
      continue;
    Reachable.insert(Block);
  }

  // Add access group metadata to memory-access instructions. A distinct
  // MDNode serves as the unique access-group identifier for this loop.
  MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
  for (BasicBlock *BB : Reachable)
    addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
  // TODO: If the loop has existing parallel access metadata, have
  // to combine two lists.
  LoopMDList.push_back(MDNode::get(
      Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
}
5633
5635OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5636 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5637 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5638 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5639 assert(CLI->isValid() && "Requires a valid canonical loop");
5640 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5641
5642 LLVMContext &Ctx = CLI->getFunction()->getContext();
5643 Value *IV = CLI->getIndVar();
5644 Value *OrigTripCount = CLI->getTripCount();
5645 Type *IVTy = IV->getType();
5646 assert(IVTy->getIntegerBitWidth() <= 64 &&
5647 "Max supported tripcount bitwidth is 64 bits");
5648 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5649 : Type::getInt64Ty(Ctx);
5650 Type *I32Type = Type::getInt32Ty(M.getContext());
5651 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5652 Constant *One = ConstantInt::get(InternalIVTy, 1);
5653
5654 Function *F = CLI->getFunction();
5655 // Blocks must have terminators.
5656 // FIXME: Don't run analyses on incomplete/invalid IR.
5658 for (BasicBlock &BB : *F)
5659 if (!BB.hasTerminator())
5660 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
5662 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5663 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5664 LoopAnalysis LIA;
5665 LoopInfo &&LI = LIA.run(*F, FAM);
5666 for (Instruction *I : UIs)
5667 I->eraseFromParent();
5668 Loop *L = LI.getLoopFor(CLI->getHeader());
5669 SmallVector<Metadata *> LoopMDList;
5670 if (ChunkSize || DistScheduleChunkSize)
5671 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5672 addLoopMetadata(CLI, LoopMDList);
5673
5674 // Declare useful OpenMP runtime functions.
5675 FunctionCallee StaticInit =
5676 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5677 FunctionCallee StaticFini =
5678 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5679
5680 // Allocate space for computed loop bounds as expected by the "init" function.
5681 Builder.restoreIP(AllocaIP);
5682 Builder.SetCurrentDebugLocation(DL);
5683 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5684 Value *PLowerBound =
5685 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5686 Value *PUpperBound =
5687 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5688 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5689 CLI->setLastIter(PLastIter);
5690
5691 // Set up the source location value for the OpenMP runtime.
5692 Builder.restoreIP(CLI->getPreheaderIP());
5693 Builder.SetCurrentDebugLocation(DL);
5694
5695 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5696 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5697 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5698 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5699 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5700 "distschedulechunksize");
5701 Value *CastedTripCount =
5702 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5703
5704 Constant *SchedulingType =
5705 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5706 Constant *DistSchedulingType =
5707 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
5708 Builder.CreateStore(Zero, PLowerBound);
5709 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5710 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5711 Value *UpperBound =
5712 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5713 Builder.CreateStore(UpperBound, PUpperBound);
5714 Builder.CreateStore(One, PStride);
5715
5716 // Call the "init" function and update the trip count of the loop with the
5717 // value it produced.
5718 uint32_t SrcLocStrSize;
5719 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5720 IdentFlag Flag = OMP_IDENT_FLAG_WORK_LOOP;
5721 if (DistScheduleSchedType != OMPScheduleType::None) {
5722 Flag |= OMP_IDENT_FLAG_WORK_DISTRIBUTE;
5723 }
5724 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);
5725 Value *ThreadNum =
5726 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
5727 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5728 PUpperBound, PStride, One,
5729 this](Value *SchedulingType, Value *ChunkSize,
5730 auto &Builder) {
5732 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5733 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5734 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5735 /*pstride=*/PStride, /*incr=*/One,
5736 /*chunk=*/ChunkSize});
5737 };
5738 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5739 if (DistScheduleSchedType != OMPScheduleType::None &&
5740 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5741 SchedType != OMPScheduleType::OrderedDistribute) {
5742 // We want to emit a second init function call for the dist_schedule clause
5743 // to the Distribute construct. This should only be done however if a
5744 // Workshare Loop is nested within a Distribute Construct
5745 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5746 }
5747
5748 // Load values written by the "init" function.
5749 Value *FirstChunkStart =
5750 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5751 Value *FirstChunkStop =
5752 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5753 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5754 Value *ChunkRange =
5755 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5756 Value *NextChunkStride =
5757 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5758
5759 // Create outer "dispatch" loop for enumerating the chunks.
5760 BasicBlock *DispatchEnter = splitBB(Builder, true);
5761 Value *DispatchCounter;
5762
5763 // It is safe to assume this didn't return an error because the callback
5764 // passed into createCanonicalLoop is the only possible error source, and it
5765 // always returns success.
5766 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5767 {Builder.saveIP(), DL},
5768 [&](InsertPointTy BodyIP, Value *Counter) {
5769 DispatchCounter = Counter;
5770 return Error::success();
5771 },
5772 FirstChunkStart, CastedTripCount, NextChunkStride,
5773 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5774 "dispatch"));
5775
5776 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5777 // not have to preserve the canonical invariant.
5778 BasicBlock *DispatchBody = DispatchCLI->getBody();
5779 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5780 BasicBlock *DispatchExit = DispatchCLI->getExit();
5781 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5782 DispatchCLI->invalidate();
5783
5784 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5785 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5786 redirectTo(CLI->getExit(), DispatchLatch, DL);
5787 redirectTo(DispatchBody, DispatchEnter, DL);
5788
5789 // Prepare the prolog of the chunk loop.
5790 Builder.restoreIP(CLI->getPreheaderIP());
5791 Builder.SetCurrentDebugLocation(DL);
5792
5793 // Compute the number of iterations of the chunk loop.
5794 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5795 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5796 Value *IsLastChunk =
5797 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5798 Value *CountUntilOrigTripCount =
5799 Builder.CreateSub(CastedTripCount, DispatchCounter);
5800 Value *ChunkTripCount = Builder.CreateSelect(
5801 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5802 Value *BackcastedChunkTC =
5803 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5804 CLI->setTripCount(BackcastedChunkTC);
5805
5806 // Update all uses of the induction variable except the one in the condition
5807 // block that compares it with the actual upper bound, and the increment in
5808 // the latch block.
5809 Value *BackcastedDispatchCounter =
5810 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5811 CLI->mapIndVar([&](Instruction *) -> Value * {
5812 Builder.restoreIP(CLI->getBodyIP());
5813 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5814 });
5815
5816 // In the "exit" block, call the "fini" function.
5817 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5818 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5819
5820 // Add the barrier if requested.
5821 if (NeedsBarrier) {
5822 InsertPointOrErrorTy AfterIP =
5823 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5824 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5825 if (!AfterIP)
5826 return AfterIP.takeError();
5827 }
5828
5829#ifndef NDEBUG
5830 // Even though we currently do not support applying additional methods to it,
5831 // the chunk loop should remain a canonical loop.
5832 CLI->assertOK();
5833#endif
5834
5835 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5836}
5837
5838// Returns an LLVM function to call for executing an OpenMP static worksharing
5839// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5840// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5841static FunctionCallee
5843 WorksharingLoopType LoopType) {
5844 unsigned Bitwidth = Ty->getIntegerBitWidth();
5845 Module &M = OMPBuilder->M;
5846 switch (LoopType) {
5847 case WorksharingLoopType::ForStaticLoop:
5848 if (Bitwidth == 32)
5849 return OMPBuilder->getOrCreateRuntimeFunction(
5850 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5851 if (Bitwidth == 64)
5852 return OMPBuilder->getOrCreateRuntimeFunction(
5853 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5854 break;
5855 case WorksharingLoopType::DistributeStaticLoop:
5856 if (Bitwidth == 32)
5857 return OMPBuilder->getOrCreateRuntimeFunction(
5858 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5859 if (Bitwidth == 64)
5860 return OMPBuilder->getOrCreateRuntimeFunction(
5861 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5862 break;
5863 case WorksharingLoopType::DistributeForStaticLoop:
5864 if (Bitwidth == 32)
5865 return OMPBuilder->getOrCreateRuntimeFunction(
5866 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5867 if (Bitwidth == 64)
5868 return OMPBuilder->getOrCreateRuntimeFunction(
5869 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5870 break;
5871 }
5872 if (Bitwidth != 32 && Bitwidth != 64) {
5873 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5874 }
5875 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5876}
5877
// Inserts a call to proper OpenMP Device RTL function which handles
// loop worksharing.
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder,
                                          WorksharingLoopType LoopType,
                                          BasicBlock *InsertBlock, Value *Ident,
                                          Value *LoopBodyArg, Value *TripCount,
                                          Function &LoopBodyFn, bool NoLoop) {
  Type *TripCountTy = TripCount->getType();
  Module &M = OMPBuilder->M;
  IRBuilder<> &Builder = OMPBuilder->Builder;
  FunctionCallee RTLFn =
      getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
  // Leading arguments common to all three entry points:
  // (ident, outlined loop-body function, aggregated body args, trip count).
  SmallVector<Value *, 8> RealArgs;
  RealArgs.push_back(Ident);
  RealArgs.push_back(&LoopBodyFn);
  RealArgs.push_back(LoopBodyArg);
  RealArgs.push_back(TripCount);
  if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
    // The distribute-only entry point does not take a thread count; append
    // the two trailing zero arguments it expects and emit the call right
    // before the block terminator.
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
    Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
    OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
    return;
  }
  // The "for" and "distribute for" variants additionally need the number of
  // threads, queried from the device runtime at the insertion point.
  FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
      M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
  Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
  Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});

  // omp_get_num_threads returns i32; adjust it to the trip count type.
  RealArgs.push_back(
      Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
  RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
  if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
    // Combined construct: one more zero argument, plus the NoLoop hint
    // forwarded to the runtime as an i8.
    RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
  } else {
    RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
  }

  OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
}
5919
// Post-outlining callback for target worksharing loops: replaces the (now
// trivial) loop with a single call to the device RTL worksharing entry point,
// passing it the outlined loop-body function and its argument structure.
static void workshareLoopTargetCallback(
    OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
    Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
    WorksharingLoopType LoopType, bool NoLoop) {
  IRBuilder<> &Builder = OMPIRBuilder->Builder;
  BasicBlock *Preheader = CLI->getPreheader();
  Value *TripCount = CLI->getTripCount();

  // After loop body outlining, the loop body contains only set up
  // of loop body argument structure and the call to the outlined
  // loop body function. Firstly, we need to move setup of loop body args
  // into loop preheader.
  Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
                    CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));

  // The next step is to remove the whole loop. We do not need it anymore.
  // That's why make an unconditional branch from loop preheader to loop
  // exit block
  Builder.restoreIP({Preheader, Preheader->end()});
  Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
  Preheader->getTerminator()->eraseFromParent();
  Builder.CreateBr(CLI->getExit());

  // Delete dead loop blocks
  OpenMPIRBuilder::OutlineInfo CleanUpInfo;
  SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
  SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
  CleanUpInfo.EntryBB = CLI->getHeader();
  CleanUpInfo.ExitBB = CLI->getExit();
  CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
  DeleteDeadBlocks(BlocksToBeRemoved);

  // Find the instruction which corresponds to loop body argument structure
  // and remove the call to loop body function instruction.
  Value *LoopBodyArg;
  User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
  assert(OutlinedFnUser &&
         "Expected unique undroppable user of outlined function");
  CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
  assert(OutlinedFnCallInstruction && "Expected outlined function call");
  assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
         "Expected outlined function call to be located in loop preheader");
  // Check in case no argument structure has been passed.
  if (OutlinedFnCallInstruction->arg_size() > 1)
    LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
  else
    LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
  OutlinedFnCallInstruction->eraseFromParent();

  // Emit the runtime call that drives the loop; the runtime invokes the
  // outlined body function once per assigned iteration.
  createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
                                LoopBodyArg, TripCount, OutlinedFn, NoLoop);

  // Drop the scaffolding instructions that were only needed for outlining.
  for (auto &ToBeDeletedItem : ToBeDeleted)
    ToBeDeletedItem->eraseFromParent();
  CLI->invalidate();
}
5976
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    WorksharingLoopType LoopType, bool NoLoop) {
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
  // Encode the worksharing kind in the ident flags so the device runtime can
  // tell "for", "distribute" and combined "distribute for" regions apart.
  omp::IdentFlag Flag;
  switch (LoopType) {
  case WorksharingLoopType::ForStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_LOOP;
    break;
  case WorksharingLoopType::DistributeStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE;
    break;
  case WorksharingLoopType::DistributeForStaticLoop:
    Flag = OMP_IDENT_FLAG_WORK_DISTRIBUTE | OMP_IDENT_FLAG_WORK_LOOP;
    break;
  }
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize, Flag);

  OutlineInfo OI;
  OI.OuterAllocaBB = CLI->getPreheader();
  Function *OuterFn = CLI->getPreheader()->getParent();

  // Instructions which need to be deleted at the end of code generation
  SmallVector<Instruction *, 4> ToBeDeleted;

  // NOTE(review): this overwrites the preheader assigned above with the
  // caller-provided alloca block.
  OI.OuterAllocaBB = AllocaIP.getBlock();

  // Mark the body loop as region which needs to be extracted
  OI.EntryBB = CLI->getBody();
  OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
                                                     "omp.prelatch");

  // Prepare loop body for extraction
  Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});

  // Insert new loop counter variable which will be used only in loop
  // body.
  AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
  Instruction *NewLoopCntLoad =
      Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
  // New loop counter instructions are redundant in the loop preheader when
  // code generation for workshare loop is finished. That's why mark them as
  // ready for deletion.
  ToBeDeleted.push_back(NewLoopCntLoad);
  ToBeDeleted.push_back(NewLoopCnt);

  // Analyse loop body region. Find all input variables which are used inside
  // loop body region.
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<BasicBlock *, 32> Blocks;
  OI.collectBlocks(ParallelRegionBlockSet, Blocks);

  CodeExtractorAnalysisCache CEAC(*OuterFn);
  CodeExtractor Extractor(Blocks,
                          /* DominatorTree */ nullptr,
                          /* AggregateArgs */ true,
                          /* BlockFrequencyInfo */ nullptr,
                          /* BranchProbabilityInfo */ nullptr,
                          /* AssumptionCache */ nullptr,
                          /* AllowVarArgs */ true,
                          /* AllowAlloca */ true,
                          /* AllocationBlock */ CLI->getPreheader(),
                          /* Suffix */ ".omp_wsloop",
                          /* AggrArgsIn0AddrSpace */ true);

  BasicBlock *CommonExit = nullptr;
  SetVector<Value *> SinkingCands, HoistingCands;

  // Find allocas outside the loop body region which are used inside loop
  // body
  Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);

  // We need to model loop body region as the function f(cnt, loop_arg).
  // That's why we replace loop induction variable by the new counter
  // which will be one of loop body function argument
  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
                            CLI->getIndVar()->user_end());
  for (auto Use : Users) {
    if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
      if (ParallelRegionBlockSet.count(Inst->getParent())) {
        Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
      }
    }
  }
  // Make sure that loop counter variable is not merged into loop body
  // function argument structure and it is passed as separate variable
  OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);

  // PostOutline CB is invoked when loop body function is outlined and
  // loop body is replaced by call to outlined function. We need to add
  // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
  // function will handle loop control logic.
  //
  OI.PostOutlineCB = [=, ToBeDeletedVec =
                             std::move(ToBeDeleted)](Function &OutlinedFn) {
    workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
                                LoopType, NoLoop);
  };
  addOutlineInfo(std::move(OI));
  return CLI->getAfterIP();
}
6079
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyWorkshareLoop(
    DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
    bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
    bool HasSimdModifier, bool HasMonotonicModifier,
    bool HasNonmonotonicModifier, bool HasOrderedClause,
    WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
    Value *DistScheduleChunkSize) {
  // On the device, worksharing loops are lowered via the specialized device
  // RTL entry points rather than the host static/dynamic init scheme.
  if (Config.isTargetDevice())
    return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
  // Fold the schedule clause, its modifiers and dist_schedule into a single
  // effective runtime schedule constant.
  OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
      SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);

  bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
                   OMPScheduleType::ModifierOrdered;
  OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
  if (HasDistSchedule) {
    DistScheduleSchedType = DistScheduleChunkSize
                                ? OMPScheduleType::OrderedDistributeChunked
                                : OMPScheduleType::OrderedDistribute;
  }
  // Dispatch on the base schedule (modifier bits masked off).
  switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
  case OMPScheduleType::BaseStatic:
  case OMPScheduleType::BaseDistribute:
    assert((!ChunkSize || !DistScheduleChunkSize) &&
           "No chunk size with static-chunked schedule");
    if (IsOrdered && !HasDistSchedule)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    if (DistScheduleChunkSize)
      return applyStaticChunkedWorkshareLoop(
          DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
          DistScheduleChunkSize, DistScheduleSchedType);
    return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
                                    HasDistSchedule);

  case OMPScheduleType::BaseStaticChunked:
  case OMPScheduleType::BaseDistributeChunked:
    if (IsOrdered && !HasDistSchedule)
      return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                       NeedsBarrier, ChunkSize);
    // FIXME: Monotonicity ignored?
    return applyStaticChunkedWorkshareLoop(
        DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
        DistScheduleChunkSize, DistScheduleSchedType);

  case OMPScheduleType::BaseRuntime:
  case OMPScheduleType::BaseAuto:
  case OMPScheduleType::BaseGreedy:
  case OMPScheduleType::BaseBalanced:
  case OMPScheduleType::BaseSteal:
  case OMPScheduleType::BaseRuntimeSimd:
    assert(!ChunkSize &&
           "schedule type does not support user-defined chunk sizes");
    [[fallthrough]];
  case OMPScheduleType::BaseGuidedSimd:
  case OMPScheduleType::BaseDynamicChunked:
  case OMPScheduleType::BaseGuidedChunked:
  case OMPScheduleType::BaseGuidedIterativeChunked:
  case OMPScheduleType::BaseGuidedAnalyticalChunked:
  case OMPScheduleType::BaseStaticBalancedChunked:
    // All remaining schedules are handled by the dynamic dispatch runtime.
    return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
                                     NeedsBarrier, ChunkSize);

  default:
    llvm_unreachable("Unknown/unimplemented schedule kind");
  }
}
6149
6150/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6151/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6152/// the runtime. Always interpret integers as unsigned similarly to
6153/// CanonicalLoopInfo.
6154static FunctionCallee
6156 unsigned Bitwidth = Ty->getIntegerBitWidth();
6157 if (Bitwidth == 32)
6158 return OMPBuilder.getOrCreateRuntimeFunction(
6159 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6160 if (Bitwidth == 64)
6161 return OMPBuilder.getOrCreateRuntimeFunction(
6162 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6163 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6164}
6165
6166/// Returns an LLVM function to call for updating the next loop using OpenMP
6167/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6168/// the runtime. Always interpret integers as unsigned similarly to
6169/// CanonicalLoopInfo.
6170static FunctionCallee
6172 unsigned Bitwidth = Ty->getIntegerBitWidth();
6173 if (Bitwidth == 32)
6174 return OMPBuilder.getOrCreateRuntimeFunction(
6175 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6176 if (Bitwidth == 64)
6177 return OMPBuilder.getOrCreateRuntimeFunction(
6178 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6179 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6180}
6181
6182/// Returns an LLVM function to call for finalizing the dynamic loop using
6183/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6184/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6185static FunctionCallee
6187 unsigned Bitwidth = Ty->getIntegerBitWidth();
6188 if (Bitwidth == 32)
6189 return OMPBuilder.getOrCreateRuntimeFunction(
6190 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6191 if (Bitwidth == 64)
6192 return OMPBuilder.getOrCreateRuntimeFunction(
6193 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6194 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6195}
6196
6198OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6199 InsertPointTy AllocaIP,
6200 OMPScheduleType SchedType,
6201 bool NeedsBarrier, Value *Chunk) {
6202 assert(CLI->isValid() && "Requires a valid canonical loop");
6203 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6204 "Require dedicated allocate IP");
6206 "Require valid schedule type");
6207
6208 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6209 OMPScheduleType::ModifierOrdered;
6210
6211 // Set up the source location value for OpenMP runtime.
6212 Builder.SetCurrentDebugLocation(DL);
6213
6214 uint32_t SrcLocStrSize;
6215 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6216 Value *SrcLoc =
6217 getOrCreateIdent(SrcLocStr, SrcLocStrSize, OMP_IDENT_FLAG_WORK_LOOP);
6218
6219 // Declare useful OpenMP runtime functions.
6220 Value *IV = CLI->getIndVar();
6221 Type *IVTy = IV->getType();
6222 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6223 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6224
6225 // Allocate space for computed loop bounds as expected by the "init" function.
6226 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6227 Type *I32Type = Type::getInt32Ty(M.getContext());
6228 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6229 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6230 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6231 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6232 CLI->setLastIter(PLastIter);
6233
6234 // At the end of the preheader, prepare for calling the "init" function by
6235 // storing the current loop bounds into the allocated space. A canonical loop
6236 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6237 // and produces an inclusive upper bound.
6238 BasicBlock *PreHeader = CLI->getPreheader();
6239 Builder.SetInsertPoint(PreHeader->getTerminator());
6240 Constant *One = ConstantInt::get(IVTy, 1);
6241 Builder.CreateStore(One, PLowerBound);
6242 Value *UpperBound = CLI->getTripCount();
6243 Builder.CreateStore(UpperBound, PUpperBound);
6244 Builder.CreateStore(One, PStride);
6245
6246 BasicBlock *Header = CLI->getHeader();
6247 BasicBlock *Exit = CLI->getExit();
6248 BasicBlock *Cond = CLI->getCond();
6249 BasicBlock *Latch = CLI->getLatch();
6250 InsertPointTy AfterIP = CLI->getAfterIP();
6251
6252 // The CLI will be "broken" in the code below, as the loop is no longer
6253 // a valid canonical loop.
6254
6255 if (!Chunk)
6256 Chunk = One;
6257
6258 Value *ThreadNum =
6259 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize));
6260
6261 Constant *SchedulingType =
6262 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6263
6264 // Call the "init" function.
6265 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6266 /* LowerBound */ One, UpperBound,
6267 /* step */ One, Chunk});
6268
6269 // An outer loop around the existing one.
6270 BasicBlock *OuterCond = BasicBlock::Create(
6271 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6272 PreHeader->getParent());
6273 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6274 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6276 DynamicNext,
6277 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6278 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6279 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6280 Value *LowerBound =
6281 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6282 Builder.CreateCondBr(MoreWork, Header, Exit);
6283
6284 // Change PHI-node in loop header to use outer cond rather than preheader,
6285 // and set IV to the LowerBound.
6286 Instruction *Phi = &Header->front();
6287 auto *PI = cast<PHINode>(Phi);
6288 PI->setIncomingBlock(0, OuterCond);
6289 PI->setIncomingValue(0, LowerBound);
6290
6291 // Then set the pre-header to jump to the OuterCond
6292 Instruction *Term = PreHeader->getTerminator();
6293 auto *Br = cast<UncondBrInst>(Term);
6294 Br->setSuccessor(OuterCond);
6295
6296 // Modify the inner condition:
6297 // * Use the UpperBound returned from the DynamicNext call.
6298 // * jump to the loop outer loop when done with one of the inner loops.
6299 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6300 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6301 Instruction *Comp = &*Builder.GetInsertPoint();
6302 auto *CI = cast<CmpInst>(Comp);
6303 CI->setOperand(1, UpperBound);
6304 // Redirect the inner exit to branch to outer condition.
6305 Instruction *Branch = &Cond->back();
6306 auto *BI = cast<CondBrInst>(Branch);
6307 assert(BI->getSuccessor(1) == Exit);
6308 BI->setSuccessor(1, OuterCond);
6309
6310 // Call the "fini" function if "ordered" is present in wsloop directive.
6311 if (Ordered) {
6312 Builder.SetInsertPoint(&Latch->back());
6313 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6314 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6315 }
6316
6317 // Add the barrier if requested.
6318 if (NeedsBarrier) {
6319 Builder.SetInsertPoint(&Exit->back());
6320 InsertPointOrErrorTy BarrierIP =
6322 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6323 /* CheckCancelFlag */ false);
6324 if (!BarrierIP)
6325 return BarrierIP.takeError();
6326 }
6327
6328 CLI->invalidate();
6329 return AfterIP;
6330}
6331
6332/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6333/// after this \p OldTarget will be orphaned.
6335 BasicBlock *NewTarget, DebugLoc DL) {
6336 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6337 redirectTo(Pred, NewTarget, DL);
6338}
6339
/// Determine which blocks in \p BBs are reachable from outside and remove the
/// ones that are not reachable from the function.
static void removeUnusedBlocksFromParent(ArrayRef<BasicBlock *> BBs) {
  SmallSetVector<BasicBlock *, 8> BBsToErase(BBs.begin(), BBs.end());
  // A block must be kept (i.e. dropped from the erase set) if any instruction
  // outside the erase set still references it, e.g. as a branch target.
  auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
    for (Use &U : BB->uses()) {
      auto *UseInst = dyn_cast<Instruction>(U.getUser());
      if (!UseInst)
        continue;
      if (BBsToErase.count(UseInst->getParent()))
        continue;
      return true;
    }
    return false;
  };

  // Dropping a block from the erase set can make blocks it references become
  // externally used, so iterate to a fixed point.
  while (BBsToErase.remove_if(HasRemainingUses)) {
    // Try again if anything was removed.
  }

  SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
  DeleteDeadBlocks(BBVec);
}
6363
/// Combine a rectangular loop nest into a single canonical loop whose trip
/// count is the product of the input trip counts (OpenMP 'collapse').
CanonicalLoopInfo *
OpenMPIRBuilder::collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
                               InsertPointTy ComputeIP) {
  assert(Loops.size() >= 1 && "At least one loop required");
  size_t NumLoops = Loops.size();

  // Nothing to do if there is already just one loop.
  if (NumLoops == 1)
    return Loops.front();

  CanonicalLoopInfo *Outermost = Loops.front();
  CanonicalLoopInfo *Innermost = Loops.back();
  BasicBlock *OrigPreheader = Outermost->getPreheader();
  BasicBlock *OrigAfter = Outermost->getAfter();
  Function *F = OrigPreheader->getParent();

  // Loop control blocks that may become orphaned later.
  SmallVector<BasicBlock *, 12> OldControlBBs;
  OldControlBBs.reserve(6 * Loops.size());
  for (CanonicalLoopInfo *Loop : Loops)
    Loop->collectControlBlocks(OldControlBBs);

  // Setup the IRBuilder for inserting the trip count computation.
  Builder.SetCurrentDebugLocation(DL);
  if (ComputeIP.isSet())
    Builder.restoreIP(ComputeIP);
  else
    Builder.restoreIP(Outermost->getPreheaderIP());

  // Derive the collapsed loop's trip count (product of all trip counts).
  // TODO: Find common/largest indvar type.
  Value *CollapsedTripCount = nullptr;
  for (CanonicalLoopInfo *L : Loops) {
    assert(L->isValid() &&
           "All loops to collapse must be valid canonical loops");
    Value *OrigTripCount = L->getTripCount();
    if (!CollapsedTripCount) {
      CollapsedTripCount = OrigTripCount;
      continue;
    }

    // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
    CollapsedTripCount =
        Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
  }

  // Create the collapsed loop control flow.
  CanonicalLoopInfo *Result =
      createLoopSkeleton(DL, CollapsedTripCount, F,
                         OrigPreheader->getNextNode(), OrigAfter, "collapsed");

  // Build the collapsed loop body code.
  // Start with deriving the input loop induction variables from the collapsed
  // one, using a divmod scheme. To preserve the original loops' order, the
  // innermost loop uses the least significant bits.
  Builder.restoreIP(Result->getBodyIP());

  Value *Leftover = Result->getIndVar();
  SmallVector<Value *> NewIndVars;
  NewIndVars.resize(NumLoops);
  for (int i = NumLoops - 1; i >= 1; --i) {
    Value *OrigTripCount = Loops[i]->getTripCount();

    Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
    NewIndVars[i] = NewIndVar;

    Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
  }
  // Outermost loop gets all the remaining bits.
  NewIndVars[0] = Leftover;

  // Construct the loop body control flow.
  // We progressively construct the branch structure following in direction of
  // the control flow, from the leading in-between code, the loop nest body, the
  // trailing in-between code, and rejoining the collapsed loop's latch.
  // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
  // the ContinueBlock is set, continue with that block. If ContinuePred, use
  // its predecessors as sources.
  BasicBlock *ContinueBlock = Result->getBody();
  BasicBlock *ContinuePred = nullptr;
  auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
                                                          BasicBlock *NextSrc) {
    if (ContinueBlock)
      redirectTo(ContinueBlock, Dest, DL);
    else
      redirectAllPredecessorsTo(ContinuePred, Dest, DL);

    ContinueBlock = nullptr;
    ContinuePred = NextSrc;
  };

  // The code before the nested loop of each level.
  // Because we are sinking it into the nest, it will be executed more often
  // that the original loop. More sophisticated schemes could keep track of what
  // the in-between code is and instantiate it only once per thread.
  for (size_t i = 0; i < NumLoops - 1; ++i)
    ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());

  // Connect the loop nest body.
  ContinueWith(Innermost->getBody(), Innermost->getLatch());

  // The code after the nested loop at each level.
  for (size_t i = NumLoops - 1; i > 0; --i)
    ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());

  // Connect the finished loop to the collapsed loop latch.
  ContinueWith(Result->getLatch(), nullptr);

  // Replace the input loops with the new collapsed loop.
  redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
  redirectTo(Result->getAfter(), Outermost->getAfter(), DL);

  // Replace the input loop indvars with the derived ones.
  for (size_t i = 0; i < NumLoops; ++i)
    Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);

  // Remove unused parts of the input loops.
  removeUnusedBlocksFromParent(OldControlBBs);

  // The original loop infos are no longer canonical; only the result is.
  for (CanonicalLoopInfo *L : Loops)
    L->invalidate();

#ifndef NDEBUG
  Result->assertOK();
#endif
  return Result;
}
6491
6492std::vector<CanonicalLoopInfo *>
6494 ArrayRef<Value *> TileSizes) {
6495 assert(TileSizes.size() == Loops.size() &&
6496 "Must pass as many tile sizes as there are loops");
6497 int NumLoops = Loops.size();
6498 assert(NumLoops >= 1 && "At least one loop to tile required");
6499
6500 CanonicalLoopInfo *OutermostLoop = Loops.front();
6501 CanonicalLoopInfo *InnermostLoop = Loops.back();
6502 Function *F = OutermostLoop->getBody()->getParent();
6503 BasicBlock *InnerEnter = InnermostLoop->getBody();
6504 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6505
6506 // Loop control blocks that may become orphaned later.
6507 SmallVector<BasicBlock *, 12> OldControlBBs;
6508 OldControlBBs.reserve(6 * Loops.size());
6510 Loop->collectControlBlocks(OldControlBBs);
6511
6512 // Collect original trip counts and induction variable to be accessible by
6513 // index. Also, the structure of the original loops is not preserved during
6514 // the construction of the tiled loops, so do it before we scavenge the BBs of
6515 // any original CanonicalLoopInfo.
6516 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6517 for (CanonicalLoopInfo *L : Loops) {
6518 assert(L->isValid() && "All input loops must be valid canonical loops");
6519 OrigTripCounts.push_back(L->getTripCount());
6520 OrigIndVars.push_back(L->getIndVar());
6521 }
6522
6523 // Collect the code between loop headers. These may contain SSA definitions
6524 // that are used in the loop nest body. To be usable with in the innermost
6525 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6526 // these instructions may be executed more often than before the tiling.
6527 // TODO: It would be sufficient to only sink them into body of the
6528 // corresponding tile loop.
6530 for (int i = 0; i < NumLoops - 1; ++i) {
6531 CanonicalLoopInfo *Surrounding = Loops[i];
6532 CanonicalLoopInfo *Nested = Loops[i + 1];
6533
6534 BasicBlock *EnterBB = Surrounding->getBody();
6535 BasicBlock *ExitBB = Nested->getHeader();
6536 InbetweenCode.emplace_back(EnterBB, ExitBB);
6537 }
6538
6539 // Compute the trip counts of the floor loops.
6540 Builder.SetCurrentDebugLocation(DL);
6541 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6542 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6543 for (int i = 0; i < NumLoops; ++i) {
6544 Value *TileSize = TileSizes[i];
6545 Value *OrigTripCount = OrigTripCounts[i];
6546 Type *IVType = OrigTripCount->getType();
6547
6548 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6549 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6550
6551 // 0 if tripcount divides the tilesize, 1 otherwise.
6552 // 1 means we need an additional iteration for a partial tile.
6553 //
6554 // Unfortunately we cannot just use the roundup-formula
6555 // (tripcount + tilesize - 1)/tilesize
6556 // because the summation might overflow. We do not want introduce undefined
6557 // behavior when the untiled loop nest did not.
6558 Value *FloorTripOverflow =
6559 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6560
6561 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6562 Value *FloorTripCount =
6563 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6564 "omp_floor" + Twine(i) + ".tripcount", true);
6565
6566 // Remember some values for later use.
6567 FloorCompleteCount.push_back(FloorCompleteTripCount);
6568 FloorCount.push_back(FloorTripCount);
6569 FloorRems.push_back(FloorTripRem);
6570 }
6571
6572 // Generate the new loop nest, from the outermost to the innermost.
6573 std::vector<CanonicalLoopInfo *> Result;
6574 Result.reserve(NumLoops * 2);
6575
6576 // The basic block of the surrounding loop that enters the nest generated
6577 // loop.
6578 BasicBlock *Enter = OutermostLoop->getPreheader();
6579
6580 // The basic block of the surrounding loop where the inner code should
6581 // continue.
6582 BasicBlock *Continue = OutermostLoop->getAfter();
6583
6584 // Where the next loop basic block should be inserted.
6585 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6586
6587 auto EmbeddNewLoop =
6588 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6589 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6590 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6591 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6592 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6593 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6594
6595 // Setup the position where the next embedded loop connects to this loop.
6596 Enter = EmbeddedLoop->getBody();
6597 Continue = EmbeddedLoop->getLatch();
6598 OutroInsertBefore = EmbeddedLoop->getLatch();
6599 return EmbeddedLoop;
6600 };
6601
6602 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6603 const Twine &NameBase) {
6604 for (auto P : enumerate(TripCounts)) {
6605 CanonicalLoopInfo *EmbeddedLoop =
6606 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6607 Result.push_back(EmbeddedLoop);
6608 }
6609 };
6610
6611 EmbeddNewLoops(FloorCount, "floor");
6612
6613 // Within the innermost floor loop, emit the code that computes the tile
6614 // sizes.
6615 Builder.SetInsertPoint(Enter->getTerminator());
6616 SmallVector<Value *, 4> TileCounts;
6617 for (int i = 0; i < NumLoops; ++i) {
6618 CanonicalLoopInfo *FloorLoop = Result[i];
6619 Value *TileSize = TileSizes[i];
6620
6621 Value *FloorIsEpilogue =
6622 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6623 Value *TileTripCount =
6624 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6625
6626 TileCounts.push_back(TileTripCount);
6627 }
6628
6629 // Create the tile loops.
6630 EmbeddNewLoops(TileCounts, "tile");
6631
6632 // Insert the inbetween code into the body.
6633 BasicBlock *BodyEnter = Enter;
6634 BasicBlock *BodyEntered = nullptr;
6635 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6636 BasicBlock *EnterBB = P.first;
6637 BasicBlock *ExitBB = P.second;
6638
6639 if (BodyEnter)
6640 redirectTo(BodyEnter, EnterBB, DL);
6641 else
6642 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6643
6644 BodyEnter = nullptr;
6645 BodyEntered = ExitBB;
6646 }
6647
6648 // Append the original loop nest body into the generated loop nest body.
6649 if (BodyEnter)
6650 redirectTo(BodyEnter, InnerEnter, DL);
6651 else
6652 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6654
6655 // Replace the original induction variable with an induction variable computed
6656 // from the tile and floor induction variables.
6657 Builder.restoreIP(Result.back()->getBodyIP());
6658 for (int i = 0; i < NumLoops; ++i) {
6659 CanonicalLoopInfo *FloorLoop = Result[i];
6660 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6661 Value *OrigIndVar = OrigIndVars[i];
6662 Value *Size = TileSizes[i];
6663
6664 Value *Scale =
6665 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6666 Value *Shift =
6667 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6668 OrigIndVar->replaceAllUsesWith(Shift);
6669 }
6670
6671 // Remove unused parts of the original loops.
6672 removeUnusedBlocksFromParent(OldControlBBs);
6673
6674 for (CanonicalLoopInfo *L : Loops)
6675 L->invalidate();
6676
6677#ifndef NDEBUG
6678 for (CanonicalLoopInfo *GenL : Result)
6679 GenL->assertOK();
6680#endif
6681 return Result;
6682}
6683
6684/// Attach metadata \p Properties to the basic block described by \p BB. If the
6685/// basic block already has metadata, the basic block properties are appended.
/// The metadata is attached to \p BB's terminator under the !llvm.loop kind;
/// operand 0 of the emitted node is the conventional self-reference that makes
/// the loop-ID node unique.
// NOTE(review): the declaration line (original 6686) was dropped by the HTML
// extraction; per the body this takes (BasicBlock *BB,
// ArrayRef<Metadata *> Properties) — confirm against upstream.
6687 ArrayRef<Metadata *> Properties) {
6688 // Nothing to do if no property to attach.
6689 if (Properties.empty())
6690 return;
6691
6692 LLVMContext &Ctx = BB->getContext();
6693 SmallVector<Metadata *> NewProperties;
 // Reserve operand 0 for the self-referencing loop ID; it is patched below
 // once the distinct node exists.
6694 NewProperties.push_back(nullptr);
6695
6696 // If the basic block already has metadata, prepend it to the new metadata.
 // drop_begin skips the old node's operand 0 (its own self-reference).
6697 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6698 if (Existing)
6699 append_range(NewProperties, drop_begin(Existing->operands(), 1));
6700
6701 append_range(NewProperties, Properties);
 // A distinct node is required so two loops with identical properties still
 // get different loop IDs; make it refer to itself per LLVM convention.
6702 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6703 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6704
6705 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6706}
6707
6708/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6709/// loop already has metadata, the loop properties are appended.
/// Delegates to addBasicBlockMetadata on the latch, since the latch terminator
/// (the back-branch) is where !llvm.loop metadata conventionally lives.
// NOTE(review): the declaration line (original 6710) was dropped by the HTML
// extraction; per the body this takes (CanonicalLoopInfo *Loop,
// ArrayRef<Metadata *> Properties) — confirm against upstream.
6711 ArrayRef<Metadata *> Properties) {
6712 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6713
6714 // Attach metadata to the loop's latch
6715 BasicBlock *Latch = Loop->getLatch();
6716 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
6717 addBasicBlockMetadata(Latch, Properties);
6718}
6719
6720/// Attach llvm.access.group metadata to the memref instructions of \p Block
/// so they can later be referenced by llvm.loop.parallel_accesses loop
/// metadata, marking them as free of loop-carried dependences.
// NOTE(review): the declaration line (original 6721) was dropped by the HTML
// extraction; the visible body uses parameters Block, AccessGroup and LI —
// LI is not referenced in the visible lines; confirm against upstream.
6722 LoopInfo &LI) {
6723 for (Instruction &I : *Block) {
 // Only instructions that may touch memory need the access-group tag.
6724 if (I.mayReadOrWriteMemory()) {
6725 // TODO: This instruction may already have access group from
6726 // other pragmas e.g. #pragma clang loop vectorize. Append
6727 // so that the existing metadata is not overwritten.
 // As written, setMetadata REPLACES any existing access group (see TODO).
6728 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6729 }
6730 }
6731}
6732
/// Fuse a sequence of sibling canonical loops into a single canonical loop.
/// The fused loop runs max(trip counts) iterations; each original body is
/// guarded so it only executes for iterations below its own trip count.
/// Returns the new CanonicalLoopInfo; all input CLIs are invalidated.
// NOTE(review): the declaration line (original 6734) was dropped by the HTML
// extraction; per the body this takes a DebugLoc DL and a list `Loops` of
// CanonicalLoopInfo* — confirm against upstream.
6733CanonicalLoopInfo *
6735 CanonicalLoopInfo *firstLoop = Loops.front();
6736 CanonicalLoopInfo *lastLoop = Loops.back();
6737 Function *F = firstLoop->getPreheader()->getParent();
6738
6739 // Loop control blocks that will become orphaned later
6740 SmallVector<BasicBlock *> oldControlBBs;
 // NOTE(review): a line is missing here (original 6741) — presumably the
 // `for (CanonicalLoopInfo *Loop : Loops)` header for the statement below.
6742 Loop->collectControlBlocks(oldControlBBs);
6743
6744 // Collect original trip counts
6745 SmallVector<Value *> origTripCounts;
6746 for (CanonicalLoopInfo *L : Loops) {
6747 assert(L->isValid() && "All input loops must be valid canonical loops");
6748 origTripCounts.push_back(L->getTripCount());
6749 }
6750
6751 Builder.SetCurrentDebugLocation(DL);
6752
6753 // Compute max trip count.
6754 // The fused loop will be from 0 to max(origTripCounts)
6755 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6756 F, firstLoop->getHeader());
6757 Builder.SetInsertPoint(TCBlock);
6758 Value *fusedTripCount = nullptr;
 // Fold the per-loop trip counts into a running maximum via icmp+select.
6759 for (CanonicalLoopInfo *L : Loops) {
6760 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6761 Value *origTripCount = L->getTripCount();
6762 if (!fusedTripCount) {
6763 fusedTripCount = origTripCount;
6764 continue;
6765 }
 // NOTE(review): signed compare (SGT) on trip counts — verify trip counts
 // are known non-negative here, otherwise an unsigned compare may be meant.
6766 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6767 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6768 ".omp.fuse.tc");
6769 }
6770
6771 // Generate new loop
6772 CanonicalLoopInfo *fused =
6773 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6774 lastLoop->getLatch(), "fused");
6775
6776 // Replace original loops with the fused loop
6777 // Preheader and After are not considered inside the CLI.
6778 // These are used to compute the individual TCs of the loops
6779 // so they have to be put before the resulting fused loop.
6780 // Moving them up for readability.
6781 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6782 Loops[i]->getPreheader()->moveBefore(TCBlock);
6783 Loops[i]->getAfter()->moveBefore(TCBlock);
6784 }
6785 lastLoop->getPreheader()->moveBefore(TCBlock);
6786
 // Chain preheader -> after -> next preheader so the per-loop trip-count
 // computations still execute, then fall into TCBlock and the fused loop.
6787 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6788 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6789 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6790 }
6791 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6792 redirectTo(TCBlock, fused->getPreheader(), DL);
6793 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6794
6795 // Build the fused body
6796 // Create new Blocks with conditions that jump to the original loop bodies
 // NOTE(review): a line is missing here (original 6797) — presumably
 // `SmallVector<BasicBlock *> condBBs;`, used below.
6798 SmallVector<Value *> condValues;
6799 for (size_t i = 0; i < Loops.size(); ++i) {
6800 BasicBlock *condBlock = BasicBlock::Create(
6801 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
6802 Builder.SetInsertPoint(condBlock);
 // Guard: run loop i's body only while the fused IV is below its own TC.
 // NOTE(review): signed compare (SLT) — same signedness question as above.
6803 Value *condValue =
6804 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
6805 condBBs.push_back(condBlock);
6806 condValues.push_back(condValue);
6807 }
6808 // Join the condition blocks with the bodies of the original loops
6809 redirectTo(fused->getBody(), condBBs[0], DL);
6810 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6811 Builder.SetInsertPoint(condBBs[i]);
6812 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
6813 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
6814 // Replace the IV with the fused IV
6815 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6816 }
6817 // Last body jumps to the created end body block
6818 Builder.SetInsertPoint(condBBs.back());
6819 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
6820 fused->getLatch());
6821 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
6822 // Replace the IV with the fused IV
6823 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6824
6825 // The loop latch must have only one predecessor. Currently it is branched to
6826 // from both the last condition block and the last loop body
6827 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
6828 "omp.fused.pre_latch");
6829
6830 // Remove unused parts
6831 removeUnusedBlocksFromParent(oldControlBBs);
6832
6833 // Invalidate old CLIs
6834 for (CanonicalLoopInfo *L : Loops)
6835 L->invalidate();
6836
6837#ifndef NDEBUG
6838 fused->assertOK();
6839#endif
6840 return fused;
6841}
6842
// Request full unrolling of \p Loop by attaching llvm.loop.unroll.enable and
// llvm.loop.unroll.full metadata; the actual transformation is performed later
// by LoopUnrollPass.
// NOTE(review): two lines were dropped by the HTML extraction — the function
// signature (original 6843, unrollLoopFull) and the `addLoopMetadata(` call
// line (original 6845) that these arguments belong to; confirm upstream.
6844 LLVMContext &Ctx = Builder.getContext();
6846 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6847 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6848}
6849
// Request unrolling of \p Loop with a pass-chosen factor by attaching only
// llvm.loop.unroll.enable metadata (no explicit count); LoopUnrollPass picks
// the factor heuristically.
// NOTE(review): two lines were dropped by the HTML extraction — the function
// signature (original 6850) and the `addLoopMetadata(` call line (original
// 6852) that these arguments belong to; confirm upstream.
6851 LLVMContext &Ctx = Builder.getContext();
6853 Loop, {
6854 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6855 });
6856}
6857
/// Version the body of \p CanonicalLoop on \p IfCond: the loop keeps a single
/// backedge, but its body branches to the original (to-be-vectorized) blocks
/// when \p IfCond holds and to a clone of them otherwise. \p VMap receives the
/// original->clone block/value mapping; \p L is the llvm::Loop for the same
/// loop, used to enumerate and clone its blocks.
6858void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6859 Value *IfCond, ValueToValueMapTy &VMap,
6860 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6861 const Twine &NamePrefix) {
6862 Function *F = CanonicalLoop->getFunction();
6863
6864 // We can't do
6865 // if (cond) {
6866 // simd_loop;
6867 // } else {
6868 // non_simd_loop;
6869 // }
6870 // because then the CanonicalLoopInfo would only point to one of the loops:
6871 // leading to other constructs operating on the same loop to malfunction.
6872 // Instead generate
6873 // while (...) {
6874 // if (cond) {
6875 // simd_body;
6876 // } else {
6877 // not_simd_body;
6878 // }
6879 // }
6880 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6881 // body at -O3
6882
6883 // Define where if branch should be inserted
6884 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6885
6886 // Create additional blocks for the if statement
6887 BasicBlock *Cond = SplitBeforeIt->getParent();
6888 llvm::LLVMContext &C = Cond->getContext();
 // NOTE(review): the lines declaring ThenBlock (original 6889) and ElseBlock
 // (original 6891) — `BasicBlock *ThenBlock = BasicBlock::Create(` etc. —
 // were dropped by the HTML extraction; these are their argument lines.
6890 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6892 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6893
6894 // Create if condition branch.
6895 Builder.SetInsertPoint(SplitBeforeIt);
6896 Instruction *BrInstr =
6897 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6898 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6899 // Then block contains branch to omp loop body which needs to be vectorized
6900 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6901 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6902
6903 Builder.SetInsertPoint(ElseBlock);
6904
6905 // Clone loop for the else branch
 // NOTE(review): a line is missing here (original 6906) — presumably
 // `SmallVector<BasicBlock *, 8> NewBlocks;`, used below.
6907
6908 SmallVector<BasicBlock *, 8> ExistingBlocks;
6909 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6910 ExistingBlocks.push_back(ThenBlock);
6911 ExistingBlocks.append(L->block_begin(), L->block_end());
6912 // Cond is the block that has the if clause condition
6913 // LoopCond is omp_loop.cond
6914 // LoopHeader is omp_loop.header
6915 BasicBlock *LoopCond = Cond->getUniquePredecessor();
 // FIXME(review): LoopCond is dereferenced on the next line BEFORE the assert
 // checks it for null — if getUniquePredecessor() returns nullptr this is UB
 // and the assert never fires. The null checks should precede the deref.
6916 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6917 assert(LoopCond && LoopHeader && "Invalid loop structure");
 // Clone every loop block except the shared control skeleton (preheader,
 // latch, header, cond, and the condition block itself), which both versions
 // reuse so the loop keeps one backedge / one CanonicalLoopInfo.
6918 for (BasicBlock *Block : ExistingBlocks) {
6919 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6920 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6921 continue;
6922 }
6923 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6924
6925 // fix name not to be omp.if.then
6926 if (Block == ThenBlock)
6927 NewBB->setName(NamePrefix + ".if.else");
6928
6929 NewBB->moveBefore(CanonicalLoop->getExit());
6930 VMap[Block] = NewBB;
6931 NewBlocks.push_back(NewBB);
6932 }
 // Rewrite cloned instructions to reference cloned values/blocks.
6933 remapInstructionsInBlocks(NewBlocks, VMap);
6934 Builder.CreateBr(NewBlocks.front());
6935
6936 // The loop latch must have only one predecessor. Currently it is branched to
6937 // from both the 'then' and 'else' branches.
6938 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
6939 NamePrefix + ".pre_latch");
6940
6941 // Ensure that the then block is added to the loop so we add the attributes in
6942 // the next step
6943 L->addBasicBlockToLoop(ThenBlock, LI);
6944}
6945
6946unsigned
6948 const StringMap<bool> &Features) {
6949 if (TargetTriple.isX86()) {
6950 if (Features.lookup("avx512f"))
6951 return 512;
6952 else if (Features.lookup("avx"))
6953 return 256;
6954 return 128;
6955 }
6956 if (TargetTriple.isPPC())
6957 return 128;
6958 if (TargetTriple.isWasm())
6959 return 128;
6960 return 0;
6961}
6962
// Apply `omp simd` semantics to \p CanonicalLoop: emit alignment assumptions
// for \p AlignedVars, version the body on \p IfCond if present, and attach
// parallel-access / vectorize-enable / vectorize-width loop metadata derived
// from \p Order, \p Simdlen and \p Safelen.
// NOTE(review): the signature line (original 6963, applySimd) was dropped by
// the HTML extraction; these are its remaining parameter lines.
6964 MapVector<Value *, Value *> AlignedVars,
6965 Value *IfCond, OrderKind Order,
6966 ConstantInt *Simdlen, ConstantInt *Safelen) {
6967 LLVMContext &Ctx = Builder.getContext();
6968
6969 Function *F = CanonicalLoop->getFunction();
6970
6971 // Blocks must have terminators.
6972 // FIXME: Don't run analyses on incomplete/invalid IR.
 // NOTE(review): a line is missing here (original 6973) — presumably the
 // declaration of the `UIs` container of placeholder UnreachableInsts.
6974 for (BasicBlock &BB : *F)
6975 if (!BB.hasTerminator())
6976 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
6977
6978 // TODO: We should not rely on pass manager. Currently we use pass manager
6979 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6980 // object. We should have a method which returns all blocks between
6981 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
 // NOTE(review): a line is missing here (original 6982) — presumably
 // `FunctionAnalysisManager FAM;`.
6983 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6984 FAM.registerPass([]() { return LoopAnalysis(); });
6985 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6986
6987 LoopAnalysis LIA;
6988 LoopInfo &&LI = LIA.run(*F, FAM);
6989
 // Remove the placeholder terminators inserted above.
6990 for (Instruction *I : UIs)
6991 I->eraseFromParent();
6992
6993 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6994 if (AlignedVars.size()) {
6995 InsertPointTy IP = Builder.saveIP();
6996 for (auto &AlignedItem : AlignedVars) {
6997 Value *AlignedPtr = AlignedItem.first;
6998 Value *Alignment = AlignedItem.second;
 // FIXME(review): dyn_cast may return nullptr (e.g. if AlignedPtr is an
 // Argument or Constant), and loadInst is dereferenced unconditionally on
 // the next line. Either assert or guard the cast.
6999 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
7000 Builder.SetInsertPoint(loadInst->getNextNode());
7001 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
7002 Alignment);
7003 }
7004 Builder.restoreIP(IP);
7005 }
7006
7007 if (IfCond) {
7008 ValueToValueMapTy VMap;
7009 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
7010 }
7011
 // NOTE(review): a line is missing here (original 7012) — presumably the
 // declaration of the `Reachable` block set populated below.
7013
7014 // Get the basic blocks from the loop in which memref instructions
7015 // can be found.
7016 // TODO: Generalize getting all blocks inside a CanonicalLoopInfo,
7017 // preferably without running any passes.
7018 for (BasicBlock *Block : L->getBlocks()) {
7019 if (Block == CanonicalLoop->getCond() ||
7020 Block == CanonicalLoop->getHeader())
7021 continue;
7022 Reachable.insert(Block);
7023 }
7024
7025 SmallVector<Metadata *> LoopMDList;
7026
7027 // In presence of finite 'safelen', it may be unsafe to mark all
7028 // the memory instructions parallel, because loop-carried
7029 // dependences of 'safelen' iterations are possible.
7030 // If clause order(concurrent) is specified then the memory instructions
7031 // are marked parallel even if 'safelen' is finite.
7032 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
7033 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
7034
7035 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
7036 // versions so we can't add the loop attributes in that case.
7037 if (IfCond) {
7038 // we can still add llvm.loop.parallel_access
7039 addLoopMetadata(CanonicalLoop, LoopMDList);
7040 return;
7041 }
7042
7043 // Use the above access group metadata to create loop level
7044 // metadata, which should be distinct for each loop.
 // NOTE(review): the initializer line is missing (original 7046) —
 // presumably ConstantAsMetadata::get of a true i1 constant.
7045 ConstantAsMetadata *BoolConst =
7047 LoopMDList.push_back(MDNode::get(
7048 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
7049
7050 if (Simdlen || Safelen) {
7051 // If both simdlen and safelen clauses are specified, the value of the
7052 // simdlen parameter must be less than or equal to the value of the safelen
7053 // parameter. Therefore, use safelen only in the absence of simdlen.
7054 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
7055 LoopMDList.push_back(
7056 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
7057 ConstantAsMetadata::get(VectorizeWidth)}));
7058 }
7059
7060 addLoopMetadata(CanonicalLoop, LoopMDList);
7061}
7062
7063/// Create the TargetMachine object to query the backend for optimization
7064/// preferences.
7065///
7066/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
7067/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
7068/// needed for the LLVM pass pipeline. We use some default options to avoid
7069/// having to pass too many settings from the frontend that probably do not
7070/// matter.
7071///
7072/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7073/// method. If we are going to use TargetMachine for more purposes, especially
7074/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7075/// might become worth requiring front-ends to pass on their TargetMachine,
7076/// or at least cache it between methods. Note that while frontends such as Clang
7077/// have just a single main TargetMachine per translation unit, "target-cpu" and
7078/// "target-features" that determine the TargetMachine are per-function and can
7079/// be overridden using __attribute__((target("OPTIONS"))).
7080static std::unique_ptr<TargetMachine>
 // NOTE(review): the parameter line is missing (original 7081) — presumably
 // (Function *F, CodeGenOptLevel OptLevel); confirm against upstream.
7082 Module *M = F->getParent();
7083
 // CPU/feature strings come from per-function attributes (see doc above).
7084 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7085 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7086 const llvm::Triple &Triple = M->getTargetTriple();
7087
7088 std::string Error;
 // NOTE(review): a line is missing here (original 7089) — presumably the
 // TargetRegistry::lookupTarget call initializing TheTarget / Error.
 // Returns an empty unique_ptr if the target backend is not registered.
7090 if (!TheTarget)
7091 return {};
7092
 // NOTE(review): a line is missing here (original 7093) — presumably
 // `TargetOptions Options;` with default settings.
7094 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7095 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7096 /*CodeModel=*/std::nullopt, OptLevel));
7097}
7098
7099/// Heuristically determine the best-performant unroll factor for \p CLI. This
7100/// depends on the target processor. We are re-using the same heuristics as the
7101/// LoopUnrollPass.
/// Returns 1 to signal the loop should not be unrolled.
// NOTE(review): the signature line (original 7102) was dropped by the HTML
// extraction; per the body this takes a CanonicalLoopInfo *CLI.
7103 Function *F = CLI->getFunction();
7104
7105 // Assume the user requests the most aggressive unrolling, even if the rest of
7106 // the code is optimized using a lower setting.
 // NOTE(review): a line is missing here (original 7107) — presumably the
 // definition of OptLevel (aggressive CodeGen opt level).
7108 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7109
7110 // Blocks must have terminators.
7111 // FIXME: Don't run analyses on incomplete/invalid IR.
 // NOTE(review): a line is missing here (original 7112) — presumably the
 // declaration of the `UIs` container of placeholder UnreachableInsts.
7113 for (BasicBlock &BB : *F)
7114 if (!BB.hasTerminator())
7115 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7116
 // NOTE(review): a line is missing here (original 7117) — presumably
 // `FunctionAnalysisManager FAM;`.
7118 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7119 FAM.registerPass([]() { return AssumptionAnalysis(); });
7120 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7121 FAM.registerPass([]() { return LoopAnalysis(); });
7122 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7123 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7124 TargetIRAnalysis TIRA;
 // Use target-specific TTI only when a TargetMachine could be created.
7125 if (TM)
7126 TIRA = TargetIRAnalysis(
7127 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7128 FAM.registerPass([&]() { return TIRA; });
7129
 // Run the analyses directly; several analysis-object declaration lines
 // (originals 7131, 7133, 7137, 7139) were dropped by the HTML extraction.
7130 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7132 ScalarEvolution &&SE = SEA.run(*F, FAM);
7134 DominatorTree &&DT = DTA.run(*F, FAM);
7135 LoopAnalysis LIA;
7136 LoopInfo &&LI = LIA.run(*F, FAM);
7138 AssumptionCache &&AC = ACT.run(*F, FAM);
7140
 // Remove the placeholder terminators inserted above.
7141 for (Instruction *I : UIs)
7142 I->eraseFromParent();
7143
7144 Loop *L = LI.getLoopFor(CLI->getHeader());
7145 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7146
 // NOTE(review): a line is missing here (original 7147) — presumably the
 // declaration of `UP` via gatherUnrollingPreferences with these arguments.
7148 L, SE, TTI,
7149 /*BlockFrequencyInfo=*/nullptr,
7150 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7151 /*UserThreshold=*/std::nullopt,
7152 /*UserCount=*/std::nullopt,
7153 /*UserAllowPartial=*/true,
7154 /*UserAllowRuntime=*/true,
7155 /*UserUpperBound=*/std::nullopt,
7156 /*UserFullUnrollMaxCount=*/std::nullopt);
7157
7158 UP.Force = true;
7159
7160 // Account for additional optimizations taking place before the LoopUnrollPass
7161 // would unroll the loop.
 // NOTE(review): lines missing here (originals 7162-7163) — presumably the
 // threshold scaling by UnrollThresholdFactor (see cl::opt at file top).
7164
7165 // Use normal unroll factors even if the rest of the code is optimized for
7166 // size.
 // NOTE(review): lines missing here (originals 7167-7168) — presumably
 // copying Threshold/PartialThreshold into the OptSize thresholds.
7169
7170 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7171 << " Threshold=" << UP.Threshold << "\n"
7172 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7173 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7174 << " PartialOptSizeThreshold="
7175 << UP.PartialOptSizeThreshold << "\n");
7176
7177 // Disable peeling.
 // NOTE(review): lines missing here (originals 7178-7179) — presumably the
 // declaration of `PP` via gatherPeelingPreferences with these arguments.
7180 /*UserAllowPeeling=*/false,
7181 /*UserAllowProfileBasedPeeling=*/false,
7182 /*UnrollingSpecficValues=*/false);
7183
 // NOTE(review): a line is missing here (original 7184) — presumably the
 // declaration of the EphValues set.
7185 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7186
7187 // Assume that reads and writes to stack variables can be eliminated by
7188 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7189 // size.
7190 for (BasicBlock *BB : L->blocks()) {
7191 for (Instruction &I : *BB) {
7192 Value *Ptr;
7193 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7194 Ptr = Load->getPointerOperand();
7195 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7196 Ptr = Store->getPointerOperand();
7197 } else
7198 continue;
7199
7200 Ptr = Ptr->stripPointerCasts();
7201
 // Only entry-block allocas (classic stack slots) are treated as
 // eliminable; dynamic allocas elsewhere are still counted.
7202 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7203 if (Alloca->getParent() == &F->getEntryBlock())
7204 EphValues.insert(&I);
7205 }
7206 }
7207 }
7208
7209 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7210
7211 // Loop is not unrollable if the loop contains certain instructions.
7212 if (!UCE.canUnroll()) {
7213 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7214 return 1;
7215 }
7216
7217 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7218 << "\n");
7219
7220 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7221 // be able to use it.
7222 int TripCount = 0;
7223 int MaxTripCount = 0;
7224 bool MaxOrZero = false;
7225 unsigned TripMultiple = 0;
7226
7227 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7228 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7229 unsigned Factor = UP.Count;
7230 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7231
7232 // This function returns 1 to signal to not unroll a loop.
7233 if (Factor == 0)
7234 return 1;
7235 return Factor;
7236}
7237
// Partially unroll \p Loop by \p Factor. Factor == 0 means "choose
// heuristically"; Factor == 1 means no change. When \p UnrolledCLI is null,
// only unroll metadata is attached; otherwise the loop is tiled so the
// resulting outer loop remains a valid CanonicalLoopInfo (stored in
// *UnrolledCLI) and the inner tile loop is marked for full unrolling.
// NOTE(review): the first signature line (original 7238, unrollLoopPartial
// taking DebugLoc DL and CanonicalLoopInfo *Loop) was dropped by the HTML
// extraction.
7239 int32_t Factor,
7240 CanonicalLoopInfo **UnrolledCLI) {
7241 assert(Factor >= 0 && "Unroll factor must not be negative");
7242
7243 Function *F = Loop->getFunction();
7244 LLVMContext &Ctx = F->getContext();
7245
7246 // If the unrolled loop is not used for another loop-associated directive, it
7247 // is sufficient to add metadata for the LoopUnrollPass.
7248 if (!UnrolledCLI) {
7249 SmallVector<Metadata *, 2> LoopMetadata;
7250 LoopMetadata.push_back(
7251 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7252
7253 if (Factor >= 1) {
 // NOTE(review): a line is missing here (original 7254) — presumably the
 // declaration of FactorConst via ConstantAsMetadata::get(...).
7255 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7256 LoopMetadata.push_back(MDNode::get(
7257 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7258 }
7259
7260 addLoopMetadata(Loop, LoopMetadata);
7261 return;
7262 }
7263
7264 // Heuristically determine the unroll factor.
7265 if (Factor == 0)
 // NOTE(review): a line is missing here (original 7266) — presumably
 // `Factor = computeHeuristicUnrollFactor(Loop);`.
7267
7268 // No change required with unroll factor 1.
7269 if (Factor == 1) {
7270 *UnrolledCLI = Loop;
7271 return;
7272 }
7273
7274 assert(Factor >= 2 &&
7275 "unrolling only makes sense with a factor of 2 or larger");
7276
7277 Type *IndVarTy = Loop->getIndVarType();
7278
7279 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7280 // unroll the inner loop.
7281 Value *FactorVal =
7282 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7283 /*isSigned=*/false));
7284 std::vector<CanonicalLoopInfo *> LoopNest =
7285 tileLoops(DL, {Loop}, {FactorVal});
7286 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
 // The outer (floor) loop is the surviving canonical loop handed back to
 // the caller; the inner (tile) loop gets the unroll metadata.
7287 *UnrolledCLI = LoopNest[0];
7288 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7289
7290 // LoopUnrollPass can only fully unroll loops with constant trip count.
7291 // Unroll by the unroll factor with a fallback epilog for the remainder
7292 // iterations if necessary.
 // NOTE(review): lines missing here (originals 7293, 7295, 7298) —
 // presumably the FactorConst declaration and the addLoopMetadata(/MDNode::get(
 // call lines these arguments belong to.
7294 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7296 InnerLoop,
7297 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7299 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7300
7301#ifndef NDEBUG
7302 (*UnrolledCLI)->assertOK();
7303#endif
7304}
7305
// Emit a call to __kmpc_copyprivate(ident, tid, BufSize, CpyBuf, CpyFn,
// didit) implementing the `copyprivate` clause: the thread that executed the
// single region (per the loaded \p DidIt flag) broadcasts its data via
// \p CpyFn. Returns the updated insert point.
// NOTE(review): the first signature lines (originals 7306-7307,
// createCopyPrivate taking const LocationDescription &Loc) were dropped by
// the HTML extraction.
7308 llvm::Value *BufSize, llvm::Value *CpyBuf,
7309 llvm::Value *CpyFn, llvm::Value *DidIt) {
7310 if (!updateToLocation(Loc))
7311 return Loc.IP;
7312
7313 uint32_t SrcLocStrSize;
7314 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7315 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7316 Value *ThreadId = getOrCreateThreadID(Ident);
7317
 // The runtime takes the flag by value, so load the i32 stored at DidIt.
7318 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7319
7320 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7321
7322 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7323 createRuntimeFunctionCall(Fn, Args);
7324
7325 return Builder.saveIP();
7326}
7327
// Emit an OpenMP `single` region guarded by __kmpc_single/__kmpc_end_single.
// If \p CPVars is non-empty, a DidIt flag records which thread executed the
// region and each copyprivate variable is broadcast via createCopyPrivate
// (which also implies a barrier); otherwise a trailing barrier is emitted
// unless \p IsNowait is set.
// NOTE(review): the first signature line (original 7328) and the CPFuncs
// parameter line (original 7331) were dropped by the HTML extraction —
// CPFuncs is used below and is presumably ArrayRef<llvm::Function *> (or
// Value *); confirm against upstream.
7329 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7330 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7332
7333 if (!updateToLocation(Loc))
7334 return Loc.IP;
7335
7336 // If needed allocate and initialize `DidIt` with 0.
7337 // DidIt: flag variable: 1=single thread; 0=not single thread.
7338 llvm::Value *DidIt = nullptr;
7339 if (!CPVars.empty()) {
7340 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7341 Builder.CreateStore(Builder.getInt32(0), DidIt);
7342 }
7343
7344 Directive OMPD = Directive::OMPD_single;
7345 uint32_t SrcLocStrSize;
7346 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7347 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7348 Value *ThreadId = getOrCreateThreadID(Ident);
7349 Value *Args[] = {Ident, ThreadId};
7350
7351 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7352 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7353
7354 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7355 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7356
 // Wrap the user's finalization so the winning thread also flags DidIt.
7357 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7358 if (Error Err = FiniCB(IP))
7359 return Err;
7360
7361 // The thread that executes the single region must set `DidIt` to 1.
7362 // This is used by __kmpc_copyprivate, to know if the caller is the
7363 // single thread or not.
7364 if (DidIt)
7365 Builder.CreateStore(Builder.getInt32(1), DidIt);
7366
7367 return Error::success();
7368 };
7369
7370 // generates the following:
7371 // if (__kmpc_single()) {
7372 // .... single region ...
7373 // __kmpc_end_single
7374 // }
7375 // __kmpc_copyprivate
7376 // __kmpc_barrier
7377
7378 InsertPointOrErrorTy AfterIP =
7379 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7380 /*Conditional*/ true,
7381 /*hasFinalize*/ true);
7382 if (!AfterIP)
7383 return AfterIP.takeError();
7384
7385 if (DidIt) {
7386 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7387 // NOTE BufSize is currently unused, so just pass 0.
 // NOTE(review): the call line is missing (original 7388) — presumably
 // `createCopyPrivate(LocationDescription(...),` for these arguments.
7389 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7390 CPFuncs[I], DidIt);
7391 // NOTE __kmpc_copyprivate already inserts a barrier
7392 } else if (!IsNowait) {
7393 InsertPointOrErrorTy AfterIP =
 // NOTE(review): the call line is missing (original 7394) — presumably
 // `createBarrier(LocationDescription(...),` for these arguments.
7395 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7396 /* CheckCancelFlag */ false);
7397 if (!AfterIP)
7398 return AfterIP.takeError();
7399 }
7400 return Builder.saveIP();
7401}
7402
// Emit an OpenMP `critical` region named \p CriticalName, bracketed by
// __kmpc_critical[_with_hint] / __kmpc_end_critical on a per-name lock.
// \p HintInst, when non-null, selects the hinted runtime entry point and is
// appended to the entry call's arguments.
// NOTE(review): the first signature line (original 7403, createCritical) was
// dropped by the HTML extraction.
7404 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7405 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7406
7407 if (!updateToLocation(Loc))
7408 return Loc.IP;
7409
7410 Directive OMPD = Directive::OMPD_critical;
7411 uint32_t SrcLocStrSize;
7412 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7413 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7414 Value *ThreadId = getOrCreateThreadID(Ident);
 // One global lock variable per critical-region name.
7415 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7416 Value *Args[] = {Ident, ThreadId, LockVar};
7417
7418 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7419 Function *RTFn = nullptr;
7420 if (HintInst) {
7421 // Add Hint to entry Args and create call
7422 EnterArgs.push_back(HintInst);
7423 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7424 } else {
7425 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7426 }
7427 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7428
 // The exit call never takes the hint, only the base three arguments.
7429 Function *ExitRTLFn =
7430 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7431 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7432
7433 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7434 /*Conditional*/ false, /*hasFinalize*/ true);
7435}
7436
// Emit an `ordered depend(source)` / `ordered depend(sink)` construct:
// materialize the \p NumLoops iteration-vector values \p StoreValues into an
// i64 array allocated at \p AllocaIP, then call __kmpc_doacross_post (source)
// or __kmpc_doacross_wait (sink) with its base address.
// NOTE(review): the first signature lines (originals 7437-7438,
// createOrderedDepend taking const LocationDescription &Loc) were dropped by
// the HTML extraction.
7439 InsertPointTy AllocaIP, unsigned NumLoops,
7440 ArrayRef<llvm::Value *> StoreValues,
7441 const Twine &Name, bool IsDependSource) {
7442 assert(
7443 llvm::all_of(StoreValues,
7444 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7445 "OpenMP runtime requires depend vec with i64 type");
7446
7447 if (!updateToLocation(Loc))
7448 return Loc.IP;
7449
7450 // Allocate space for vector and generate alloc instruction.
7451 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7452 Builder.restoreIP(AllocaIP);
7453 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7454 ArgsBase->setAlignment(Align(8));
 // NOTE(review): a line is missing here (original 7455) — presumably
 // restoring the builder to the saved (pre-alloca) insert point.
7456
7457 // Store the index value with offset in depend vector.
7458 for (unsigned I = 0; I < NumLoops; ++I) {
7459 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7460 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7461 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7462 STInst->setAlignment(Align(8));
7463 }
7464
 // The runtime receives a pointer to element 0 of the depend vector.
7465 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7466 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7467
7468 uint32_t SrcLocStrSize;
7469 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7470 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7471 Value *ThreadId = getOrCreateThreadID(Ident);
7472 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7473
7474 Function *RTLFn = nullptr;
7475 if (IsDependSource)
7476 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7477 else
7478 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7479 createRuntimeFunctionCall(RTLFn, Args);
7480
7481 return Builder.saveIP();
7482}
7483
// Emit an `ordered` region. For `ordered threads` the body is bracketed by
// __kmpc_ordered / __kmpc_end_ordered; for the simd variant no runtime calls
// are emitted (EntryCall/ExitCall stay null) and only the region structure is
// built. NOTE(review): the opening signature line (original line 7484) was
// dropped by this extraction — verify against upstream.
7485 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7486 FinalizeCallbackTy FiniCB, bool IsThreads) {
7487 if (!updateToLocation(Loc))
7488 return Loc.IP;
7489
7490 Directive OMPD = Directive::OMPD_ordered;
7491 Instruction *EntryCall = nullptr;
7492 Instruction *ExitCall = nullptr;
7493
7494 if (IsThreads) {
7495 uint32_t SrcLocStrSize;
7496 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7497 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7498 Value *ThreadId = getOrCreateThreadID(Ident);
7499 Value *Args[] = {Ident, ThreadId};
7500
7501 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7502 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7503
7504 Function *ExitRTLFn =
7505 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7506 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7507 }
7508
7509 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7510 /*Conditional*/ false, /*hasFinalize*/ true);
7511}
7512
// Common machinery for inlined (non-outlined) OpenMP regions: splits the
// current block into entry / finalize / exit blocks, emits the (possibly
// conditional) entry call, runs the body callback, then emits finalization
// and the exit call, finally re-merging blocks where possible.
7513OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7514 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7515 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7516 bool HasFinalize, bool IsCancellable) {
7517
// Finalization info is pushed here and popped in emitCommonDirectiveExit.
7518 if (HasFinalize)
7519 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7520
7521 // Create inlined region's entry and body blocks, in preparation
7522 // for conditional creation
7523 BasicBlock *EntryBB = Builder.GetInsertBlock();
7524 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
// NOTE(review): original line 7525 is missing here (dropped link line) —
// presumably the guard that only creates the placeholder terminator below
// when the block has no suitable terminator; confirm against upstream.
7526 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7527 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7528 BasicBlock *FiniBB =
7529 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7530
7531 Builder.SetInsertPoint(EntryBB->getTerminator());
7532 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7533
7534 // generate body
7535 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7536 /* CodeGenIP */ Builder.saveIP()))
7537 return Err;
7538
7539 // emit exit call and do any needed finalization.
7540 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7541 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7542 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7543 "Unexpected control flow graph state!!");
7544 InsertPointOrErrorTy AfterIP =
7545 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7546 if (!AfterIP)
7547 return AfterIP.takeError();
7548
7549 // If we are skipping the region of a non conditional, remove the exit
7550 // block, and clear the builder's insertion point.
7551 assert(SplitPos->getParent() == ExitBB &&
7552 "Unexpected Insertion point location!");
7553 auto merged = MergeBlockIntoPredecessor(ExitBB);
7554 BasicBlock *ExitPredBB = SplitPos->getParent();
7555 auto InsertBB = merged ? ExitPredBB : ExitBB;
// NOTE(review): original line 7556 is missing here (dropped link line) —
// presumably a condition guarding the erase of the placeholder terminator;
// confirm against upstream before relying on this control flow.
7557 SplitPos->eraseFromParent();
7558 Builder.SetInsertPoint(InsertBB);
7559
7560 return Builder.saveIP();
7561}
7562
7563OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7564 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7565 // if nothing to do, Return current insertion point.
7566 if (!Conditional || !EntryCall)
7567 return Builder.saveIP();
7568
7569 BasicBlock *EntryBB = Builder.GetInsertBlock();
7570 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
7571 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7572 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7573
7574 // Emit thenBB and set the Builder's insertion point there for
7575 // body generation next. Place the block after the current block.
7576 Function *CurFn = EntryBB->getParent();
7577 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7578
7579 // Move Entry branch to end of ThenBB, and replace with conditional
7580 // branch (If-stmt)
7581 Instruction *EntryBBTI = EntryBB->getTerminator();
7582 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7583 EntryBBTI->removeFromParent();
7584 Builder.SetInsertPoint(UI);
7585 Builder.Insert(EntryBBTI);
7586 UI->eraseFromParent();
7587 Builder.SetInsertPoint(ThenBB->getTerminator());
7588
7589 // return an insertion point to ExitBB.
7590 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7591}
7592
// Emit the exit of an inlined directive region: run the pending finalization
// callback (popped from FinalizationStack) and then move the pre-created
// exit runtime call to the end of the finalization block.
7593OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7594 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7595 bool HasFinalize) {
7596
7597 Builder.restoreIP(FinIP);
7598
7599 // If there is finalization to do, emit it before the exit call
7600 if (HasFinalize) {
7601 assert(!FinalizationStack.empty() &&
7602 "Unexpected finalization stack state!");
7603
// Pop the entry pushed by EmitOMPInlinedRegion; it must match this directive.
7604 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7605 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7606
7607 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7608 return std::move(Err);
7609
7610 // Exit condition: insertion point is before the terminator of the new Fini
7611 // block
7612 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7613 }
7614
// Regions without an exit runtime call (e.g. ordered simd) are done here.
7615 if (!ExitCall)
7616 return Builder.saveIP();
7617
7618 // place the Exitcall as last instruction before Finalization block terminator
7619 ExitCall->removeFromParent();
7620 Builder.Insert(ExitCall);
7621
7622 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7623 ExitCall->getIterator());
7624}
7625
// Build the CFG skeleton for a `copyin` clause: compare the master and
// private addresses and branch to a "not master" copy block only when they
// differ (the master thread skips the copy). Returns an insertion point
// inside the copy block.
// NOTE(review): the opening signature line (original line 7626) was dropped
// by this extraction — verify against upstream.
7627 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7628 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7629 if (!IP.isSet())
7630 return IP;
7631
// NOTE(review): original line 7632 is missing here (dropped link line) —
// possibly an insertion-point restore; confirm against upstream.
7633
7634 // creates the following CFG structure
7635 // OMP_Entry : (MasterAddr != PrivateAddr)?
7636 // F T
7637 // | \
7638 // | copin.not.master
7639 // | /
7640 // v /
7641 // copyin.not.master.end
7642 // |
7643 // v
7644 // OMP.Entry.Next
7645
7646 BasicBlock *OMP_Entry = IP.getBlock();
7647 Function *CurFn = OMP_Entry->getParent();
7648 BasicBlock *CopyBegin =
7649 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7650 BasicBlock *CopyEnd = nullptr;
7651
7652 // If entry block is terminated, split to preserve the branch to following
7653 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
// NOTE(review): original line 7654 is missing here (dropped link line) — the
// `} else {` below implies it was the `if (...) {` testing whether OMP_Entry
// already has a terminator; confirm against upstream.
7655 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7656 "copyin.not.master.end");
7657 OMP_Entry->getTerminator()->eraseFromParent();
7658 } else {
7659 CopyEnd =
7660 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7661 }
7662
7663 Builder.SetInsertPoint(OMP_Entry);
7664 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7665 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7666 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7667 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7668
7669 Builder.SetInsertPoint(CopyBegin);
7670 if (BranchtoEnd)
7671 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7672
7673 return Builder.saveIP();
7674}
7675
// Emit a call to __kmpc_alloc(thread_id, size, allocator) and return the
// resulting pointer value (or null when the location is invalid).
// NOTE(review): the opening signature line (original line 7676) and line 7679
// were dropped by this extraction — verify against upstream.
7677 Value *Size, Value *Allocator,
7678 std::string Name) {
7680 if (!updateToLocation(Loc))
7681 return nullptr;
7682
// The Ident is only needed to derive the thread id; it is not passed to the
// runtime call itself.
7683 uint32_t SrcLocStrSize;
7684 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7685 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7686 Value *ThreadId = getOrCreateThreadID(Ident);
7687 Value *Args[] = {ThreadId, Size, Allocator};
7688
7689 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7690
7691 return createRuntimeFunctionCall(Fn, Args, Name);
7692}
7693
// Emit a call to __kmpc_aligned_alloc(thread_id, align, size, allocator).
// NOTE(review): the opening signature line (original line 7694) and line 7698
// were dropped by this extraction — verify against upstream.
7695 Value *Align, Value *Size,
7696 Value *Allocator,
7697 std::string Name) {
7699 if (!updateToLocation(Loc))
7700 return nullptr;
7701
7702 uint32_t SrcLocStrSize;
7703 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7704 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7705 Value *ThreadId = getOrCreateThreadID(Ident);
7706 Value *Args[] = {ThreadId, Align, Size, Allocator};
7707
7708 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_aligned_alloc);
7709
// NOTE(review): the sibling alloc/free wrappers go through
// createRuntimeFunctionCall(...) here while this one calls
// Builder.CreateCall directly — confirm whether runtime-call bookkeeping is
// intentionally skipped for the aligned variant.
7710 return Builder.CreateCall(Fn, Args, Name);
7711}
7712
// Emit a call to __kmpc_free(thread_id, addr, allocator).
// NOTE(review): the opening signature line (original line 7713) and line 7716
// were dropped by this extraction — verify against upstream.
7714 Value *Addr, Value *Allocator,
7715 std::string Name) {
7717 if (!updateToLocation(Loc))
7718 return nullptr;
7719
7720 uint32_t SrcLocStrSize;
7721 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7722 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7723 Value *ThreadId = getOrCreateThreadID(Ident);
7724 Value *Args[] = {ThreadId, Addr, Allocator};
7725 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7726 return createRuntimeFunctionCall(Fn, Args, Name);
7727}
7728
// Emit a call to __tgt_interop_init. Null Device defaults to -1 (current
// device); a null NumDependences defaults to 0 with a null dependence array.
// NOTE(review): the opening signature line (original line 7729) and lines
// 7733-7734 (likely the updateToLocation guard) were dropped by this
// extraction — verify against upstream.
7730 const LocationDescription &Loc, Value *InteropVar,
7731 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7732 Value *DependenceAddress, bool HaveNowaitClause) {
7735
7736 uint32_t SrcLocStrSize;
7737 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7738 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7739 Value *ThreadId = getOrCreateThreadID(Ident);
7740 if (Device == nullptr)
7741 Device = Constant::getAllOnesValue(Int32);
7742 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7743 if (NumDependences == nullptr) {
7744 NumDependences = ConstantInt::get(Int32, 0);
7745 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7746 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7747 }
7748 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7749 Value *Args[] = {
7750 Ident, ThreadId, InteropVar, InteropTypeVal,
7751 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7752
7753 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7754
7755 return createRuntimeFunctionCall(Fn, Args);
7756}
7757
// Emit a call to __tgt_interop_destroy, with the same null-argument
// defaulting as createOMPInteropInit (Device -> -1, deps -> 0/null).
// NOTE(review): the opening signature line (original line 7758) and lines
// 7761-7762 (likely the updateToLocation guard) were dropped by this
// extraction — verify against upstream.
7759 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7760 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7763
7764 uint32_t SrcLocStrSize;
7765 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7766 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7767 Value *ThreadId = getOrCreateThreadID(Ident);
7768 if (Device == nullptr)
7769 Device = Constant::getAllOnesValue(Int32);
7770 if (NumDependences == nullptr) {
7771 NumDependences = ConstantInt::get(Int32, 0);
7772 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7773 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7774 }
7775 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7776 Value *Args[] = {
7777 Ident, ThreadId, InteropVar, Device,
7778 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7779
7780 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7781
7782 return createRuntimeFunctionCall(Fn, Args);
7783}
7784
// Emit a call to __tgt_interop_use, with the same null-argument defaulting
// as the init/destroy variants above.
// NOTE(review): the opening signature line (original line 7785) and lines
// 7790-7791 (likely the updateToLocation guard) were dropped by this
// extraction — verify against upstream.
7786 Value *InteropVar, Value *Device,
7787 Value *NumDependences,
7788 Value *DependenceAddress,
7789 bool HaveNowaitClause) {
7792 uint32_t SrcLocStrSize;
7793 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7794 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7795 Value *ThreadId = getOrCreateThreadID(Ident);
7796 if (Device == nullptr)
7797 Device = Constant::getAllOnesValue(Int32);
7798 if (NumDependences == nullptr) {
7799 NumDependences = ConstantInt::get(Int32, 0);
7800 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7801 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7802 }
7803 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7804 Value *Args[] = {
7805 Ident, ThreadId, InteropVar, Device,
7806 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7807
7808 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7809
7810 return createRuntimeFunctionCall(Fn, Args);
7811}
7812
// Emit a call to __kmpc_threadprivate_cached, passing a named internal cache
// variable so the runtime can memoize per-thread copies.
// NOTE(review): the opening signature lines (original lines 7813-7814) and
// lines 7816-7817 (likely the updateToLocation guard) were dropped by this
// extraction — verify against upstream.
7815 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7818
7819 uint32_t SrcLocStrSize;
7820 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7821 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7822 Value *ThreadId = getOrCreateThreadID(Ident);
// Cache lives in an internal global of pointer-to-pointer type.
7823 Constant *ThreadPrivateCache =
7824 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7825 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7826
7827 Function *Fn =
7828 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7829
7830 return createRuntimeFunctionCall(Fn, Args);
7831}
7832
// Emit the device-side kernel prologue: build the per-kernel dynamic/kernel
// environment globals, call __kmpc_target_init, and branch to the user code
// only for the thread(s) the runtime designates (return value -1).
// NOTE(review): the opening signature line (original line 7833) and the
// attributes parameter line (7835) were dropped by this extraction — verify
// against upstream.
7834 const LocationDescription &Loc,
7836 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7837 "expected num_threads and num_teams to be specified");
7838
7839 if (!updateToLocation(Loc))
7840 return Loc.IP;
7841
7842 uint32_t SrcLocStrSize;
7843 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7844 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7845 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
// Generic-mode kernels need the runtime's state machine; SPMD ones do not.
7846 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7847 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7848 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7849 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7850
7851 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7852 Function *Kernel = DebugKernelWrapper;
7853
7854 // We need to strip the debug prefix to get the correct kernel name.
7855 StringRef KernelName = Kernel->getName();
7856 const std::string DebugPrefix = "_debug__";
7857 if (KernelName.ends_with(DebugPrefix)) {
7858 KernelName = KernelName.drop_back(DebugPrefix.length());
7859 Kernel = M.getFunction(KernelName);
7860 assert(Kernel && "Expected the real kernel to exist");
7861 }
7862
7863 // Manifest the launch configuration in the metadata matching the kernel
7864 // environment.
7865 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7866 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7867
7868 // If MaxThreads not set, select the maximum between the default workgroup
7869 // size and the MinThreads value.
7870 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7871 if (MaxThreadsVal < 0) {
7872 if (hasGridValue(T)) {
7873 MaxThreadsVal =
7874 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
7875 Attrs.MinThreads);
7876 } else {
7877 MaxThreadsVal = Attrs.MinThreads;
7878 }
7879 }
7880
7881 if (MaxThreadsVal > 0)
7882 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7883
7884 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7885 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7886 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7887 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7888 Constant *ReductionDataSize =
7889 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7890 Constant *ReductionBufferLength =
7891 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7892
// NOTE(review): original line 7893 is missing here (dropped link line) —
// presumably `Function *Fn = getOrCreateRuntimeFunctionPtr(`; confirm.
7894 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7895 const DataLayout &DL = Fn->getDataLayout();
7896
// The dynamic environment is mutable per-kernel runtime state.
7897 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7898 Constant *DynamicEnvironmentInitializer =
7899 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7900 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7901 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7902 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7903 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7904 DL.getDefaultGlobalsAddressSpace());
7905 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7906
// Address-space-cast the global if its type differs from the expected
// pointer type (globals may live in a non-default address space).
7907 Constant *DynamicEnvironment =
7908 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7909 ? DynamicEnvironmentGV
7910 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7911 DynamicEnvironmentPtr);
7912
7913 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7914 ConfigurationEnvironment, {
7915 UseGenericStateMachineVal,
7916 MayUseNestedParallelismVal,
7917 IsSPMDVal,
7918 MinThreads,
7919 MaxThreads,
7920 MinTeams,
7921 MaxTeams,
7922 ReductionDataSize,
7923 ReductionBufferLength,
7924 });
7925 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7926 KernelEnvironment, {
7927 ConfigurationEnvironmentInitializer,
7928 Ident,
7929 DynamicEnvironment,
7930 });
7931 std::string KernelEnvironmentName =
7932 (KernelName + "_kernel_environment").str();
7933 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7934 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7935 KernelEnvironmentInitializer, KernelEnvironmentName,
7936 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7937 DL.getDefaultGlobalsAddressSpace());
7938 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7939
7940 Constant *KernelEnvironment =
7941 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7942 ? KernelEnvironmentGV
7943 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7944 KernelEnvironmentPtr);
// The launch environment is passed as the wrapper's last argument.
7945 Value *KernelLaunchEnvironment =
7946 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
7947 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7948 KernelLaunchEnvironment =
7949 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7950 ? KernelLaunchEnvironment
7951 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7952 KernelLaunchEnvParamTy);
7953 CallInst *ThreadKind = createRuntimeFunctionCall(
7954 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7955
7956 Value *ExecUserCode = Builder.CreateICmpEQ(
7957 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7958 "exec_user_code");
7959
7960 // ThreadKind = __kmpc_target_init(...)
7961 // if (ThreadKind == -1)
7962 // user_code
7963 // else
7964 // return;
7965
7966 auto *UI = Builder.CreateUnreachable();
7967 BasicBlock *CheckBB = UI->getParent();
7968 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7969
7970 BasicBlock *WorkerExitBB = BasicBlock::Create(
7971 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7972 Builder.SetInsertPoint(WorkerExitBB);
7973 Builder.CreateRetVoid();
7974
7975 auto *CheckBBTI = CheckBB->getTerminator();
7976 Builder.SetInsertPoint(CheckBBTI);
7977 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7978
7979 CheckBBTI->eraseFromParent();
7980 UI->eraseFromParent();
7981
7982 // Continue in the "user_code" block, see diagram above and in
7983 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7984 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7985}
7986
// Emit the device-side kernel epilogue: call __kmpc_target_deinit, then (if
// teams reductions are used) patch the kernel-environment global's
// configuration fields 7/8 with the reduction data size and buffer length.
// NOTE(review): the opening signature line (original line 7987) and lines
// 7993/7996 (likely the deinit function lookup and its call) were dropped by
// this extraction — verify against upstream.
7988 int32_t TeamsReductionDataSize,
7989 int32_t TeamsReductionBufferLength) {
7990 if (!updateToLocation(Loc))
7991 return;
7992
7994 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7995
7997
7998 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7999 return;
8000
8001 Function *Kernel = Builder.GetInsertBlock()->getParent();
8002 // We need to strip the debug prefix to get the correct kernel name.
8003 StringRef KernelName = Kernel->getName();
8004 const std::string DebugPrefix = "_debug__";
8005 if (KernelName.ends_with(DebugPrefix))
8006 KernelName = KernelName.drop_back(DebugPrefix.length());
// Locate the global created by createTargetInit and fold the reduction
// parameters into its constant initializer.
8007 auto *KernelEnvironmentGV =
8008 M.getNamedGlobal((KernelName + "_kernel_environment").str());
8009 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
8010 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
8011 auto *NewInitializer = ConstantFoldInsertValueInstruction(
8012 KernelEnvironmentInitializer,
8013 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
8014 NewInitializer = ConstantFoldInsertValueInstruction(
8015 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
8016 {0, 8});
8017 KernelEnvironmentGV->setInitializer(NewInitializer);
8018}
8019
8020static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
8021 bool Min) {
8022 if (Kernel.hasFnAttribute(Name)) {
8023 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
8024 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
8025 }
8026 Kernel.addFnAttr(Name, llvm::utostr(Value));
8027}
8028
// Read the {lower, upper} thread bounds recorded on a kernel: combines the
// generic "omp_target_thread_limit" attribute with the target-specific one
// (amdgpu-flat-work-group-size or the NVPTX max-ntid attribute), taking the
// tighter upper bound where both exist.
8029std::pair<int32_t, int32_t>
// NOTE(review): original line 8030 (the function's qualified signature line)
// was dropped by this extraction — verify against upstream.
8031 int32_t ThreadLimit =
8032 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
8033
8034 if (T.isAMDGPU()) {
8035 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
8036 if (!Attr.isValid() || !Attr.isStringAttribute())
8037 return {0, ThreadLimit};
// The attribute encodes "LB,UB"; an unparsable UB falls back to the generic
// limit, an unparsable LB falls back to 0.
8038 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
8039 int32_t LB, UB;
8040 if (!llvm::to_integer(UBStr, UB, 10))
8041 return {0, ThreadLimit};
8042 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
8043 if (!llvm::to_integer(LBStr, LB, 10))
8044 return {0, UB};
8045 return {LB, UB};
8046 }
8047
8048 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
8049 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
8050 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
8051 }
8052 return {0, ThreadLimit};
8053}
8054
// Record the {LB, UB} thread bounds on a kernel: always sets the generic
// "omp_target_thread_limit" attribute, plus the AMDGPU flat-work-group-size
// attribute on AMD targets.
// NOTE(review): the opening signature line (original line 8055) and line 8066
// (presumably the NVPTX branch calling updateNVPTXAttr) were dropped by this
// extraction — verify against upstream.
8056 Function &Kernel, int32_t LB,
8057 int32_t UB) {
8058 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
8059
8060 if (T.isAMDGPU()) {
8061 Kernel.addFnAttr("amdgpu-flat-work-group-size",
8062 llvm::utostr(LB) + "," + llvm::utostr(UB));
8063 return;
8064 }
8065
8067}
8068
// Read the {lower, upper} team bounds for a kernel; currently only the
// generic "omp_target_num_teams" attribute is consulted.
8069std::pair<int32_t, int32_t>
// NOTE(review): original line 8070 (the function's qualified signature line)
// was dropped by this extraction — verify against upstream.
8071 // TODO: Read from backend annotations if available.
8072 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
8073}
8074
// Record team bounds on a kernel: target-specific attributes (NVPTX / AMDGPU
// max workgroup counts) plus the generic "omp_target_num_teams" attribute.
// NOTE(review): the opening signature line (original line 8075) and line 8079
// (presumably the NVPTX updateNVPTXAttr call under `if (UB > 0)`) were
// dropped by this extraction — verify against upstream.
8076 int32_t LB, int32_t UB) {
8077 if (T.isNVPTX())
8078 if (UB > 0)
8080 if (T.isAMDGPU())
8081 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
8082
8083 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
8084}
8085
// Apply device-side attributes to an outlined target-region function.
// NOTE(review): several interior lines (original lines 8089, 8092, 8094,
// 8096, 8098) were dropped by this extraction — based on the surviving
// `if (T.is...())` scaffolding these are likely a linkage adjustment plus
// per-target calling-convention assignments; verify against upstream before
// relying on this block.
8086void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
8087 Function *OutlinedFn) {
8088 if (Config.isTargetDevice()) {
8090 // TODO: Determine if DSO local can be set to true.
8091 OutlinedFn->setDSOLocal(false);
8093 if (T.isAMDGCN())
8095 else if (T.isNVPTX())
8097 else if (T.isSPIRV())
8099 }
8100}
8101
8102Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8103 StringRef EntryFnIDName) {
8104 if (Config.isTargetDevice()) {
8105 assert(OutlinedFn && "The outlined function must exist if embedded");
8106 return OutlinedFn;
8107 }
8108
8109 return new GlobalVariable(
8110 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8111 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8112}
8113
8114Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8115 StringRef EntryFnName) {
8116 if (OutlinedFn)
8117 return OutlinedFn;
8118
8119 assert(!M.getGlobalVariable(EntryFnName, true) &&
8120 "Named kernel already exists?");
8121 return new GlobalVariable(
8122 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8123 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8124}
8125
// Generate (via the callback) and optionally register an outlined target
// region function, producing both the function and its region ID constant.
// NOTE(review): the opening signature line (original line 8126) was dropped
// by this extraction — verify against upstream.
8127 TargetRegionEntryInfo &EntryInfo,
8128 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8129 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8130
8131 SmallString<64> EntryFnName;
8132 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8133
// Host with mandatory offload skips generating a host fallback body.
8134 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8135 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8136 if (!CBResult)
8137 return CBResult.takeError();
8138 OutlinedFn = *CBResult;
8139 } else {
8140 OutlinedFn = nullptr;
8141 }
8142
8143 // If this target outline function is not an offload entry, we don't need to
8144 // register it. This may be in the case of a false if clause, or if there are
8145 // no OpenMP targets.
8146 if (!IsOffloadEntry)
8147 return Error::success();
8148
8149 std::string EntryFnIDName =
8150 Config.isTargetDevice()
8151 ? std::string(EntryFnName)
8152 : createPlatformSpecificName({EntryFnName, "region_id"});
8153
8154 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8155 EntryFnName, EntryFnIDName);
8156 return Error::success();
8157}
8158
// Register a target region with the offload-entries manager and return its
// region ID constant.
// NOTE(review): the opening signature line (original line 8159) and line 8168
// (presumably the entry-kind argument of registerTargetRegionEntryInfo) were
// dropped by this extraction — verify against upstream.
8160 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8161 StringRef EntryFnName, StringRef EntryFnIDName) {
8162 if (OutlinedFn)
8163 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8164 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8165 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8166 OffloadInfoManager.registerTargetRegionEntryInfo(
8167 EntryInfo, EntryAddr, OutlinedFnID,
8169 return OutlinedFnID;
8170}
8171
// Emit a `target data` / standalone `target enter|exit data` / `target
// update` region: builds the offloading argument arrays, emits the
// begin/end mapper runtime calls (guarded by the `if` clause when present),
// and runs the body callback in the privatized / non-privatized variants.
// NOTE(review): the opening signature line (original line 8172) was dropped
// by this extraction — verify against upstream.
8173 const LocationDescription &Loc, InsertPointTy AllocaIP,
8174 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
8175 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
8176 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
// NOTE(review): original line 8177 (first line of the BodyGenCB function_ref
// type) is missing here — confirm against upstream.
8178 BodyGenTy BodyGenType)>
8179 BodyGenCB,
8180 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8181 if (!updateToLocation(Loc))
8182 return InsertPointTy();
8183
8184 Builder.restoreIP(CodeGenIP);
8185
// No body callback means a standalone enter/exit/update construct.
8186 bool IsStandAlone = !BodyGenCB;
8187 MapInfosTy *MapInfo;
8188 // Generate the code for the opening of the data environment. Capture all the
8189 // arguments of the runtime call by reference because they are used in the
8190 // closing of the region.
8191 auto BeginThenGen = [&](InsertPointTy AllocaIP,
8192 InsertPointTy CodeGenIP) -> Error {
8193 MapInfo = &GenMapInfoCB(Builder.saveIP());
8194 if (Error Err = emitOffloadingArrays(
8195 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8196 /*IsNonContiguous=*/true, DeviceAddrCB))
8197 return Err;
8198
8199 TargetDataRTArgs RTArgs;
// NOTE(review): original line 8200 is missing here — presumably the
// emitOffloadingArraysArgument(...) call that fills RTArgs; confirm.
8201
8202 // Emit the number of elements in the offloading arrays.
8203 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8204
8205 // Source location for the ident struct
8206 if (!SrcLocInfo) {
8207 uint32_t SrcLocStrSize;
8208 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8209 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8210 }
8211
8212 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8213 SrcLocInfo, DeviceID,
8214 PointerNum, RTArgs.BasePointersArray,
8215 RTArgs.PointersArray, RTArgs.SizesArray,
8216 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8217 RTArgs.MappersArray};
8218
8219 if (IsStandAlone) {
8220 assert(MapperFunc && "MapperFunc missing for standalone target data");
8221
// The standalone call may have to run inside a generated target task when
// `nowait` is requested.
8222 auto TaskBodyCB = [&](Value *, Value *,
// NOTE(review): original line 8223 (remainder of the TaskBodyCB parameter
// list) is missing here — confirm against upstream.
8224 if (Info.HasNoWait) {
8225 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
// NOTE(review): original lines 8226-8228 (remaining appended nowait
// arguments) are missing here — confirm against upstream.
8229 }
8230
// NOTE(review): original line 8231 (the call emitting the mapper runtime
// function with OffloadingArgs) is missing here — confirm against upstream.
8232 OffloadingArgs);
8233
8234 if (Info.HasNoWait) {
8235 BasicBlock *OffloadContBlock =
8236 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8237 Function *CurFn = Builder.GetInsertBlock()->getParent();
8238 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8239 Builder.restoreIP(Builder.saveIP());
8240 }
8241 return Error::success();
8242 };
8243
8244 bool RequiresOuterTargetTask = Info.HasNoWait;
8245 if (!RequiresOuterTargetTask)
8246 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8247 /*TargetTaskAllocaIP=*/{}));
8248 else
8249 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8250 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8251 } else {
8252 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8253 omp::OMPRTL___tgt_target_data_begin_mapper);
8254
8255 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8256
// Materialize use_device_ptr/addr values by loading the updated pointers.
8257 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8258 if (isa<AllocaInst>(DeviceMap.second.second)) {
8259 auto *LI =
8260 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8261 Builder.CreateStore(LI, DeviceMap.second.second);
8262 }
8263 }
8264
8265 // If device pointer privatization is required, emit the body of the
8266 // region here. It will have to be duplicated: with and without
8267 // privatization.
8268 InsertPointOrErrorTy AfterIP =
8269 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8270 if (!AfterIP)
8271 return AfterIP.takeError();
8272 Builder.restoreIP(*AfterIP);
8273 }
8274 return Error::success();
8275 };
8276
8277 // If we need device pointer privatization, we need to emit the body of the
8278 // region with no privatization in the 'else' branch of the conditional.
8279 // Otherwise, we don't have to do anything.
8280 auto BeginElseGen = [&](InsertPointTy AllocaIP,
8281 InsertPointTy CodeGenIP) -> Error {
8282 InsertPointOrErrorTy AfterIP =
8283 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8284 if (!AfterIP)
8285 return AfterIP.takeError();
8286 Builder.restoreIP(*AfterIP);
8287 return Error::success();
8288 };
8289
8290 // Generate code for the closing of the data region.
8291 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8292 TargetDataRTArgs RTArgs;
8293 Info.EmitDebug = !MapInfo->Names.empty();
8294 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8295
8296 // Emit the number of elements in the offloading arrays.
8297 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8298
8299 // Source location for the ident struct
8300 if (!SrcLocInfo) {
8301 uint32_t SrcLocStrSize;
8302 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8303 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8304 }
8305
8306 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8307 PointerNum, RTArgs.BasePointersArray,
8308 RTArgs.PointersArray, RTArgs.SizesArray,
8309 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8310 RTArgs.MappersArray};
8311 Function *EndMapperFunc =
8312 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8313
8314 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8315 return Error::success();
8316 };
8317
8318 // We don't have to do anything to close the region if the if clause evaluates
8319 // to false.
8320 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8321 return Error::success();
8322 };
8323
// Wire the begin/body/end pieces together, honoring the `if` clause.
8324 Error Err = [&]() -> Error {
8325 if (BodyGenCB) {
8326 Error Err = [&]() {
8327 if (IfCond)
8328 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8329 return BeginThenGen(AllocaIP, Builder.saveIP());
8330 }();
8331
8332 if (Err)
8333 return Err;
8334
8335 // If we don't require privatization of device pointers, we emit the body
8336 // in between the runtime calls. This avoids duplicating the body code.
8337 InsertPointOrErrorTy AfterIP =
8338 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8339 if (!AfterIP)
8340 return AfterIP.takeError();
8341 restoreIPandDebugLoc(Builder, *AfterIP);
8342
8343 if (IfCond)
8344 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8345 return EndThenGen(AllocaIP, Builder.saveIP());
8346 }
8347 if (IfCond)
8348 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8349 return BeginThenGen(AllocaIP, Builder.saveIP());
8350 }();
8351
8352 if (Err)
8353 return Err;
8354
8355 return Builder.saveIP();
8356}
8357
// NOTE(review): this listing is an HTML/doxygen extraction; the function
// header (source lines 8358-8359) was dropped, leaving only the trailing
// parameter below. The body selects the __kmpc static-init runtime entry
// point by IV width (32/64), signedness, and whether this is a GPU
// distribute construct — presumably OpenMPIRBuilder's static-init selector;
// confirm the exact signature against upstream OMPIRBuilder.cpp.
8360 bool IsGPUDistribute) {
8361 assert((IVSize == 32 || IVSize == 64) &&
8362 "IV size is not compatible with the omp runtime");
8363 RuntimeFunction Name;
// GPU distribute loops use the dedicated distribute_static_init entry
// points; all other static worksharing loops use for_static_init.
8364 if (IsGPUDistribute)
8365 Name = IVSize == 32
8366 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8367 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8368 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8369 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8370 else
8371 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8372 : omp::OMPRTL___kmpc_for_static_init_4u)
8373 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8374 : omp::OMPRTL___kmpc_for_static_init_8u);
8375
// Lazily declares (or fetches) the runtime function in module M.
8376 return getOrCreateRuntimeFunction(M, Name);
8377}
8378
// NOTE(review): header line (8379) dropped by the extraction. Selects the
// __kmpc_dispatch_init_{4,4u,8,8u} runtime entry point from the induction
// variable's width and signedness.
8380 bool IVSigned) {
8381 assert((IVSize == 32 || IVSize == 64) &&
8382 "IV size is not compatible with the omp runtime");
8383 RuntimeFunction Name = IVSize == 32
8384 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8385 : omp::OMPRTL___kmpc_dispatch_init_4u)
8386 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8387 : omp::OMPRTL___kmpc_dispatch_init_8u);
8388
8389 return getOrCreateRuntimeFunction(M, Name);
8390}
8391
// NOTE(review): header line (8392) dropped by the extraction. Selects the
// __kmpc_dispatch_next_{4,4u,8,8u} runtime entry point from the induction
// variable's width and signedness.
8393 bool IVSigned) {
8394 assert((IVSize == 32 || IVSize == 64) &&
8395 "IV size is not compatible with the omp runtime");
8396 RuntimeFunction Name = IVSize == 32
8397 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8398 : omp::OMPRTL___kmpc_dispatch_next_4u)
8399 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8400 : omp::OMPRTL___kmpc_dispatch_next_8u);
8401
8402 return getOrCreateRuntimeFunction(M, Name);
8403}
8404
// NOTE(review): header line (8405) dropped by the extraction. Selects the
// __kmpc_dispatch_fini_{4,4u,8,8u} runtime entry point from the induction
// variable's width and signedness.
8406 bool IVSigned) {
8407 assert((IVSize == 32 || IVSize == 64) &&
8408 "IV size is not compatible with the omp runtime");
8409 RuntimeFunction Name = IVSize == 32
8410 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8411 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8412 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8413 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8414
8415 return getOrCreateRuntimeFunction(M, Name);
8416}
8417
// NOTE(review): header line (8418) dropped by the extraction. Returns the
// __kmpc_dispatch_deinit runtime function (no IV-width variants exist for
// deinit, so there is no selection logic here).
8419 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8420}
8421
// NOTE(review): the extraction dropped the header line (8422,
// FixupDebugInfoForOutlinedFunction — name visible at its call site below)
// and a few interior lines (8430: presumably the RemappedVariables map
// declaration; 8439: the DILocalVariable construction call; 8460: presumably
// the DVRsToDelete vector declaration; 8486: presumably an assert on debug
// intrinsics). Confirm against upstream before relying on those spots.
//
// Purpose (from the visible code): after outlining a target region into
// Func, retarget every debug variable record so its locations point at the
// replacement values recorded in ValueReplacementMap, rebuild the
// DILocalVariables with the corresponding argument numbers, move records to
// the block that defines their location value, and (on device) synthesize
// debug info for the implicit trailing "dyn_ptr" kernel argument.
8423 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8424 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8425
// Nothing to fix up if the outlined function carries no debug info.
8426 DISubprogram *NewSP = Func->getSubprogram();
8427 if (!NewSP)
8428 return;
8429
8431
// Returns a (cached) DILocalVariable equivalent to OldVar but carrying the
// new argument number 'arg'.
8432 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8433 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8434 // Only use cached variable if the arg number matches. This is important
8435 // so that DIVariable created for privatized variables are not discarded.
8436 if (NewVar && (arg == NewVar->getArg()))
8437 return NewVar;
8438
8440 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8441 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8442 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8443 return NewVar;
8444 };
8445
// Rewrites one debug record: swaps location operands per the replacement
// map and, if anything was replaced, rebinds it to an updated variable
// whose arg number is the (last) replaced operand's argument index + 1.
8446 auto UpdateDebugRecord = [&](auto *DR) {
8447 DILocalVariable *OldVar = DR->getVariable();
8448 unsigned ArgNo = 0;
8449 for (auto Loc : DR->location_ops()) {
8450 auto Iter = ValueReplacementMap.find(Loc);
8451 if (Iter != ValueReplacementMap.end()) {
8452 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8453 ArgNo = std::get<1>(Iter->second) + 1;
8454 }
8455 }
8456 if (ArgNo != 0)
8457 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8458 };
8459
// Clones a record into the block that defines its single location value
// (or the entry block for Arguments); the stale original is queued for
// deletion. Multi-operand records are conservatively killed.
8461 auto MoveDebugRecordToCorrectBlock = [&](DbgVariableRecord *DVR) {
8462 if (DVR->getNumVariableLocationOps() != 1u) {
8463 DVR->setKillLocation();
8464 return;
8465 }
8466 Value *Loc = DVR->getVariableLocationOp(0u);
8467 BasicBlock *CurBB = DVR->getParent();
8468 BasicBlock *RequiredBB = nullptr;
8469
8470 if (Instruction *LocInst = dyn_cast<Instruction>(Loc))
8471 RequiredBB = LocInst->getParent();
8472 else if (isa<llvm::Argument>(Loc))
8473 RequiredBB = &DVR->getFunction()->getEntryBlock();
8474
8475 if (RequiredBB && RequiredBB != CurBB) {
8476 assert(!RequiredBB->empty());
8477 RequiredBB->insertDbgRecordBefore(DVR->clone(),
8478 RequiredBB->back().getIterator());
8479 DVRsToDelete.push_back(DVR);
8480 }
8481 };
8482
8483 // The location and scope of variable intrinsics and records still point to
8484 // the parent function of the target region. Update them.
8485 for (Instruction &I : instructions(Func)) {
8487 "Unexpected debug intrinsic");
8488 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange())) {
8489 UpdateDebugRecord(&DVR);
8490 MoveDebugRecordToCorrectBlock(&DVR);
8491 }
8492 }
// Deletion is deferred so the instruction walk above is not invalidated.
8493 for (auto *DVR : DVRsToDelete)
8494 DVR->getMarker()->MarkedInstr->dropOneDbgRecord(DVR);
8495 // An extra argument is passed to the device. Create the debug data for it.
8496 if (OMPBuilder.Config.isTargetDevice()) {
8497 DICompileUnit *CU = NewSP->getUnit();
8498 Module *M = Func->getParent();
8499 DIBuilder DB(*M, true, CU);
8500 DIType *VoidPtrTy =
8501 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
// The implicit dyn_ptr is the last formal parameter of the kernel.
8502 unsigned ArgNo = Func->arg_size();
8503 DILocalVariable *Var = DB.createParameterVariable(
8504 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8505 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8506 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8507 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
8508 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8509 &(*Func->begin()));
8510 }
8511}
8512
// NOTE(review): header line (8513) dropped by the extraction. Peels a single
// top-level addrspacecast operator off V, returning its source operand;
// any other value is returned unchanged.
8514 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8515 return cast<Operator>(V)->getOperand(0);
8516 return V;
8517}
8518
// NOTE(review): this is createOutlinedFunction (name visible at its call
// site further down). The extraction dropped parts of the signature (lines
// 8519, 8521, 8523-8524) and several interior lines (8588: presumably the
// alloca-hoisting call; 8593: the CBFunc body-generation call whose result
// is AfterIP; 8614: presumably the ValueReplacementMap declaration; 8642:
// the loop header over Users; 8656: the ArgAccessorFuncCB result
// declaration; 8679-8680 and 8685: the Global/Instruction checks guarding
// deferral). Confirm those spots against upstream OMPIRBuilder.cpp.
//
// Purpose (from the visible code): builds the outlined target-region kernel
// function — computes its parameter list (device: ptr-or-i64 per input,
// plus a trailing implicit dyn_ptr), creates the function, forwards
// target-cpu/target-features attributes, emits device init/deinit and the
// user body, then rewrites uses of the captured inputs to the new
// arguments and fixes up debug info.
8520 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8522 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8525 SmallVector<Type *> ParameterTypes;
8526 if (OMPBuilder.Config.isTargetDevice()) {
8527 // All parameters to target devices are passed as pointers
8528 // or i64. This assumes 64-bit address spaces/pointers.
8529 for (auto &Arg : Inputs)
8530 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8531 ? Arg->getType()
8532 : Type::getInt64Ty(Builder.getContext()));
8533 } else {
8534 for (auto &Arg : Inputs)
8535 ParameterTypes.push_back(Arg->getType());
8536 }
8537
8538 // The implicit dyn_ptr argument is always the last parameter on both host
8539 // and device so the argument counts match without runtime manipulation.
8540 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8541 ParameterTypes.push_back(PtrTy);
8542
8543 auto BB = Builder.GetInsertBlock();
8544 auto M = BB->getModule();
8545 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8546 /*isVarArg*/ false);
8547 auto Func =
8548 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8549
8550 // Forward target-cpu and target-features function attributes from the
8551 // original function to the new outlined function.
8552 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8553
8554 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8555 if (TargetCpuAttr.isStringAttribute())
8556 Func->addFnAttr(TargetCpuAttr);
8557
8558 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8559 if (TargetFeaturesAttr.isStringAttribute())
8560 Func->addFnAttr(TargetFeaturesAttr);
8561
// On device, record the kernel's execution mode and keep it alive via
// llvm.compiler.used so the runtime can query it.
8562 if (OMPBuilder.Config.isTargetDevice()) {
8563 Value *ExecMode =
8564 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8565 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8566 }
8567
8568 // Save insert point.
8569 IRBuilder<>::InsertPointGuard IPG(Builder);
8570 // We will generate the entries in the outlined function but the debug
8571 // location may still be pointing to the parent function. Reset it now.
8572 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8573
8574 // Generate the region into the function.
8575 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8576 Builder.SetInsertPoint(EntryBB);
8577
8578 // Insert target init call in the device compilation pass.
8579 if (OMPBuilder.Config.isTargetDevice())
8580 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8581
8582 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8583
8584 // As we embed the user code in the middle of our target region after we
8585 // generate entry code, we must move what allocas we can into the entry
8586 // block to avoid possible breaking optimisations for device
8587 if (OMPBuilder.Config.isTargetDevice())
8589
8590 // Insert target deinit call in the device compilation pass.
8591 BasicBlock *OutlinedBodyBB =
8592 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8594 Builder.saveIP(),
8595 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8596 if (!AfterIP)
8597 return AfterIP.takeError();
8598 Builder.restoreIP(*AfterIP);
8599 if (OMPBuilder.Config.isTargetDevice())
8600 OMPBuilder.createTargetDeinit(Builder);
8601
8602 // Insert return instruction.
8603 Builder.CreateRetVoid();
8604
8605 // New Alloca IP at entry point of created device function.
8606 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8607 auto AllocaIP = Builder.saveIP();
8608
8609 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8610
8611 // Do not include the artificial dyn_ptr argument.
8612 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8613
8615
// Replaces, inside Func only, all uses of Input with InputCopy.
8616 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8617 // Things like GEP's can come in the form of Constants. Constants and
8618 // ConstantExpr's do not have access to the knowledge of what they're
8619 // contained in, so we must dig a little to find an instruction so we
8620 // can tell if they're used inside of the function we're outlining. We
8621 // also replace the original constant expression with a new instruction
8622 // equivalent; an instruction as it allows easy modification in the
8623 // following loop, as we can now know the constant (instruction) is
8624 // owned by our target function and replaceUsesOfWith can now be invoked
8625 // on it (cannot do this with constants it seems). A brand new one also
8626 // allows us to be cautious as it is perhaps possible the old expression
8627 // was used inside of the function but exists and is used externally
8628 // (unlikely by the nature of a Constant, but still).
8629 // NOTE: We cannot remove dead constants that have been rewritten to
8630 // instructions at this stage, we run the risk of breaking later lowering
8631 // by doing so as we could still be in the process of lowering the module
8632 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8633 // constants we have created rewritten versions of.
8634 if (auto *Const = dyn_cast<Constant>(Input))
8635 convertUsersOfConstantsToInstructions(Const, Func, false);
8636
8637 // Collect users before iterating over them to avoid invalidating the
8638 // iteration in case a user uses Input more than once (e.g. a call
8639 // instruction).
8640 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8641 // Collect all the instructions
8643 if (auto *Instr = dyn_cast<Instruction>(User))
8644 if (Instr->getFunction() == Func)
8645 Instr->replaceUsesOfWith(Input, InputCopy);
8646 };
8647
8648 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8649
8650 // Rewrite uses of input valus to parameters.
8651 for (auto InArg : zip(Inputs, ArgRange)) {
8652 Value *Input = std::get<0>(InArg);
8653 Argument &Arg = std::get<1>(InArg);
8654 Value *InputCopy = nullptr;
8655
8657 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8658 if (!AfterIP)
8659 return AfterIP.takeError();
8660 Builder.restoreIP(*AfterIP);
// Remember the replacement so debug info can be retargeted later.
8661 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8662
8663 // In certain cases a Global may be set up for replacement, however, this
8664 // Global may be used in multiple arguments to the kernel, just segmented
8665 // apart, for example, if we have a global array, that is sectioned into
8666 // multiple mappings (technically not legal in OpenMP, but there is a case
8667 // in Fortran for Common Blocks where this is neccesary), we will end up
8668 // with GEP's into this array inside the kernel, that refer to the Global
8669 // but are technically separate arguments to the kernel for all intents and
8670 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8671 // index, it will fold into an referal to the Global, if we then encounter
8672 // this folded GEP during replacement all of the references to the
8673 // Global in the kernel will be replaced with the argument we have generated
8674 // that corresponds to it, including any other GEP's that refer to the
8675 // Global that may be other arguments. This will invalidate all of the other
8676 // preceding mapped arguments that refer to the same global that may be
8677 // separate segments. To prevent this, we defer global processing until all
8678 // other processing has been performed.
8681 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8682 continue;
8683 }
8684
8686 continue;
8687
8688 ReplaceValue(Input, InputCopy, Func);
8689 }
8690
8691 // Replace all of our deferred Input values, currently just Globals.
8692 for (auto Deferred : DeferredReplacement)
8693 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8694
8695 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8696 ValueReplacementMap);
8697 return Func;
8698}
8699 /// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8700 /// of pointers containing shared data between the parent task and the created
8701 /// task.
// NOTE(review): the function header (line 8702) was dropped by the
// extraction; only its trailing parameters remain below. Confirm the name
// and return type against upstream OMPIRBuilder.cpp.
8703 IRBuilderBase &Builder,
8704 Value *TaskWithPrivates,
8705 Type *TaskWithPrivatesTy) {
8706
8707 Type *TaskTy = OMPIRBuilder.Task;
8708 LLVMContext &Ctx = Builder.getContext();
// GEP to the first member of the task-with-privates wrapper, which is the
// task descriptor itself (or the whole value when there is no wrapper).
8709 Value *TaskT =
8710 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8711 Value *Shareds = TaskT;
8712 // TaskWithPrivatesTy can be one of the following
8713 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8714 // %struct.privates }
8715 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8716 //
8717 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8718 // its first member has to be the task descriptor. TaskTy is the type of the
8719 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8720 // first member of TaskT, gives us the pointer to shared data.
8721 if (TaskWithPrivatesTy != TaskTy)
8722 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8723 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8724}
8725 /// Create an entry point for a target task with the following.
8726 /// It'll have the following signature
8727 /// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8728 /// This function is called from emitTargetTask once the
8729 /// code to launch the target kernel has been outlined already.
8730 /// NumOffloadingArrays is the number of offloading arrays that we need to copy
8731 /// into the task structure so that the deferred target task can access this
8732 /// data even after the stack frame of the generating task has been rolled
8733 /// back. Offloading arrays contain base pointers, pointers, sizes etc
8734 /// of the data that the target kernel will access. These in effect are the
8735 /// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
// NOTE(review): the extraction dropped the function header (line 8736) and
// two interior lines (8772: presumably an IRBuilder insert-point guard /
// SetInsertPoint at StaleCI; 8832: the call producing LoadShared, likely
// the shared-data loader helper defined above). Confirm against upstream.
8737 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8738 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8739 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8740
8741 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8742 // This is because PrivatesTy is the type of the structure in which
8743 // we pass the offloading arrays to the deferred target task.
8744 assert((!NumOffloadingArrays || PrivatesTy) &&
8745 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8746 "to privatize");
8747
8748 Module &M = OMPBuilder.M;
8749 // KernelLaunchFunction is the target launch function, i.e.
8750 // the function that sets up kernel arguments and calls
8751 // __tgt_target_kernel to launch the kernel on the device.
8752 //
8753 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8754
8755 // StaleCI is the CallInst which is the call to the outlined
8756 // target kernel launch function. If there are local live-in values
8757 // that the outlined function uses then these are aggregated into a structure
8758 // which is passed as the second argument. If there are no local live-in
8759 // values or if all values used by the outlined kernel are global variables,
8760 // then there's only one argument, the threadID. So, StaleCI can be
8761 //
8762 // %structArg = alloca { ptr, ptr }, align 8
8763 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8764 // store ptr %20, ptr %gep_, align 8
8765 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8766 // store ptr %21, ptr %gep_8, align 8
8767 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8768 //
8769 // OR
8770 //
8771 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8773 StaleCI->getIterator());
8774
8775 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8776
8777 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8778 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8779 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8780
// The proxy always has the fixed kmpc task-entry signature
// (i32 thread-id, ptr task), regardless of the kernel launch signature.
8781 auto ProxyFnTy =
8782 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8783 /* isVarArg */ false);
8784 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8785 ".omp_target_task_proxy_func",
8786 Builder.GetInsertBlock()->getModule());
8787 Value *ThreadId = ProxyFn->getArg(0);
8788 Value *TaskWithPrivates = ProxyFn->getArg(1);
8789 ThreadId->setName("thread.id");
8790 TaskWithPrivates->setName("task");
8791
8792 bool HasShareds = SharedArgsOperandNo > 0;
8793 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8794 BasicBlock *EntryBB =
8795 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8796 Builder.SetInsertPoint(EntryBB);
8797
8798 SmallVector<Value *> KernelLaunchArgs;
8799 KernelLaunchArgs.reserve(StaleCI->arg_size());
8800 KernelLaunchArgs.push_back(ThreadId);
8801
// Forward pointers to the privatized offloading arrays (member 1 of the
// task-with-privates struct) as individual kernel-launch arguments.
8802 if (HasOffloadingArrays) {
8803 assert(TaskTy != TaskWithPrivatesTy &&
8804 "If there are offloading arrays to pass to the target"
8805 "TaskTy cannot be the same as TaskWithPrivatesTy");
8806 (void)TaskTy;
8807 Value *Privates =
8808 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8809 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8810 KernelLaunchArgs.push_back(
8811 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8812 }
8813
// Reconstitute the shared-argument struct locally: copy the shared block
// out of the task descriptor into a fresh alloca and pass that.
8814 if (HasShareds) {
8815 auto *ArgStructAlloca =
8816 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8817 assert(ArgStructAlloca &&
8818 "Unable to find the alloca instruction corresponding to arguments "
8819 "for extracted function");
8820 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8821 std::optional<TypeSize> ArgAllocSize =
8822 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8823 assert(ArgStructType && ArgAllocSize &&
8824 "Unable to determine size of arguments for extracted function");
8825 uint64_t StructSize = ArgAllocSize->getFixedValue();
8826
8827 AllocaInst *NewArgStructAlloca =
8828 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8829
8830 Value *SharedsSize = Builder.getInt64(StructSize);
8831
8833 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8834
8835 Builder.CreateMemCpy(
8836 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8837 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8838 KernelLaunchArgs.push_back(NewArgStructAlloca);
8839 }
8840 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8841 Builder.CreateRetVoid();
8842 return ProxyFn;
8843}
// NOTE(review): header line (8844, getOffloadingArrayType — name visible at
// its call site below) dropped by the extraction. Recovers the array type
// behind a pointer to an offloading array: the source element type of a GEP
// or the allocated type of an alloca; anything else is a hard error.
8845
8846 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8847 return GEP->getSourceElementType();
8848 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8849 return Alloca->getAllocatedType();
8850
8851 llvm_unreachable("Unhandled Instruction type");
// Unreachable; kept to satisfy the compiler's return-path analysis.
8852 return nullptr;
8853}
8854 // This function returns a struct that has at most two members.
8855 // The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8856 // descriptor. The second member, if needed, is a struct containing arrays
8857 // that need to be passed to the offloaded target kernel. For example,
8858 // if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8859 // the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8860 // respectively, then the types created by this function are
8861 //
8862 // %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8863 // %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8864 // %struct.privates }
8865 // %struct.task_with_privates is returned by this function.
8866 // If there aren't any offloading arrays to pass to the target kernel,
8867 // %struct.kmp_task_ompbuilder_t is returned.
8868 static StructType *
// NOTE(review): the continuation line of this signature (8869, presumably
// "createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder," — the name is
// visible at its call site below) was dropped by the extraction.
8870 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8871
// Fast path: no privates needed, the plain task descriptor suffices.
8872 if (OffloadingArraysToPrivatize.empty())
8873 return OMPIRBuilder.Task;
8874
8875 SmallVector<Type *, 4> StructFieldTypes;
8876 for (Value *V : OffloadingArraysToPrivatize) {
8877 assert(V->getType()->isPointerTy() &&
8878 "Expected pointer to array to privatize. Got a non-pointer value "
8879 "instead");
8880 Type *ArrayTy = getOffloadingArrayType(V);
8881 assert(ArrayTy && "ArrayType cannot be nullptr");
8882 StructFieldTypes.push_back(ArrayTy);
8883 }
8884 StructType *PrivatesStructTy =
8885 StructType::create(StructFieldTypes, "struct.privates");
8886 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8887 "struct.task_with_privates");
8888}
// NOTE(review): the extraction dropped the function header (line 8889) and
// some parameter lines (8892, 8894-8896 — presumably DefaultAttrs, Inputs,
// CBFunc and ArgAccessorFuncCB, all referenced below). Thin wrapper: wraps
// createOutlinedFunction in a FunctionGenCallback and hands it to
// OpenMPIRBuilder::emitTargetRegionFunction, which fills in OutlinedFn and
// OutlinedFnID.
8890 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8891 TargetRegionEntryInfo &EntryInfo,
8893 Function *&OutlinedFn, Constant *&OutlinedFnID,
8897
// Deferred generator: only invoked if the entry actually needs emitting.
8898 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8899 [&](StringRef EntryFnName) {
8900 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8901 EntryFnName, Inputs, CBFunc,
8902 ArgAccessorFuncCB);
8903 };
8904
8905 return OMPBuilder.emitTargetRegionFunction(
8906 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8907 OutlinedFnID);
8908}
8909
8911 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8913 const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs,
8914 bool HasNoWait) {
8915
8916 // The following explains the code-gen scenario for the `target` directive. A
8917 // similar scneario is followed for other device-related directives (e.g.
8918 // `target enter data`) but in similar fashion since we only need to emit task
8919 // that encapsulates the proper runtime call.
8920 //
8921 // When we arrive at this function, the target region itself has been
8922 // outlined into the function OutlinedFn.
8923 // So at ths point, for
8924 // --------------------------------------------------------------
8925 // void user_code_that_offloads(...) {
8926 // omp target depend(..) map(from:a) map(to:b) private(i)
8927 // do i = 1, 10
8928 // a(i) = b(i) + n
8929 // }
8930 //
8931 // --------------------------------------------------------------
8932 //
8933 // we have
8934 //
8935 // --------------------------------------------------------------
8936 //
8937 // void user_code_that_offloads(...) {
8938 // %.offload_baseptrs = alloca [2 x ptr], align 8
8939 // %.offload_ptrs = alloca [2 x ptr], align 8
8940 // %.offload_mappers = alloca [2 x ptr], align 8
8941 // ;; target region has been outlined and now we need to
8942 // ;; offload to it via a target task.
8943 // }
8944 // void outlined_device_function(ptr a, ptr b, ptr n) {
8945 // n = *n_ptr;
8946 // do i = 1, 10
8947 // a(i) = b(i) + n
8948 // }
8949 //
8950 // We have to now do the following
8951 // (i) Make an offloading call to outlined_device_function using the OpenMP
8952 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8953 // emitted by emitKernelLaunch
8954 // (ii) Create a task entry point function that calls kernel_launch_function
8955 // and is the entry point for the target task. See
8956 // '@.omp_target_task_proxy_func in the pseudocode below.
8957 // (iii) Create a task with the task entry point created in (ii)
8958 //
8959 // That is we create the following
8960 // struct task_with_privates {
8961 // struct kmp_task_ompbuilder_t task_struct;
8962 // struct privates {
8963 // [2 x ptr] ; baseptrs
8964 // [2 x ptr] ; ptrs
8965 // [2 x i64] ; sizes
8966 // }
8967 // }
8968 // void user_code_that_offloads(...) {
8969 // %.offload_baseptrs = alloca [2 x ptr], align 8
8970 // %.offload_ptrs = alloca [2 x ptr], align 8
8971 // %.offload_sizes = alloca [2 x i64], align 8
8972 //
8973 // %structArg = alloca { ptr, ptr, ptr }, align 8
8974 // %strucArg[0] = a
8975 // %strucArg[1] = b
8976 // %strucArg[2] = &n
8977 //
8978 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8979 // sizeof(kmp_task_ompbuilder_t),
8980 // sizeof(structArg),
8981 // @.omp_target_task_proxy_func,
8982 // ...)
8983 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8984 // sizeof(structArg))
8985 // memcpy(target_task_with_privates->privates->baseptrs,
8986 // offload_baseptrs, sizeof(offload_baseptrs)
8987 // memcpy(target_task_with_privates->privates->ptrs,
8988 // offload_ptrs, sizeof(offload_ptrs)
8989 // memcpy(target_task_with_privates->privates->sizes,
8990 // offload_sizes, sizeof(offload_sizes)
8991 // dependencies_array = ...
8992 // ;; if nowait not present
8993 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8994 // call @__kmpc_omp_task_begin_if0(...)
8995 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8996 // %target_task_with_privates)
8997 // call @__kmpc_omp_task_complete_if0(...)
8998 // }
8999 //
9000 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
9001 // ptr %task) {
9002 // %structArg = alloca {ptr, ptr, ptr}
9003 // %task_ptr = getelementptr(%task, 0, 0)
9004 // %shared_data = load (getelementptr %task_ptr, 0, 0)
9005 // mempcy(%structArg, %shared_data, sizeof(%structArg))
9006 //
9007 // %offloading_arrays = getelementptr(%task, 0, 1)
9008 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
9009 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
9010 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
9011 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
9012 // %offload_sizes, %structArg)
9013 // }
9014 //
9015 // We need the proxy function because the signature of the task entry point
9016 // expected by kmpc_omp_task is always the same and will be different from
9017 // that of the kernel_launch function.
9018 //
9019 // kernel_launch_function is generated by emitKernelLaunch and has the
9020 // always_inline attribute. For this example, it'll look like so:
9021 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
9022 // %offload_sizes, %structArg) alwaysinline {
9023 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
9024 // ; load aggregated data from %structArg
9025 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
9026 // ; offload_sizes
9027 // call i32 @__tgt_target_kernel(...,
9028 // outlined_device_function,
9029 // ptr %kernel_args)
9030 // }
9031 // void outlined_device_function(ptr a, ptr b, ptr n) {
9032 // n = *n_ptr;
9033 // do i = 1, 10
9034 // a(i) = b(i) + n
9035 // }
9036 //
9037 BasicBlock *TargetTaskBodyBB =
9038 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
9039 BasicBlock *TargetTaskAllocaBB =
9040 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
9041
9042 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
9043 TargetTaskAllocaBB->begin());
9044 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
9045
9046 OutlineInfo OI;
9047 OI.EntryBB = TargetTaskAllocaBB;
9048 OI.OuterAllocaBB = AllocaIP.getBlock();
9049
9050 // Add the thread ID argument.
9053 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
9054
9055 // Generate the task body which will subsequently be outlined.
9056 Builder.restoreIP(TargetTaskBodyIP);
9057 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
9058 return Err;
9059
9060 // The outliner (CodeExtractor) extract a sequence or vector of blocks that
9061 // it is given. These blocks are enumerated by
9062 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
9063 // to be outside the region. In other words, OI.ExitBlock is expected to be
9064 // the start of the region after the outlining. We used to set OI.ExitBlock
9065 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
9066 // except when the task body is a single basic block. In that case,
9067 // OI.ExitBlock is set to the single task body block and will get left out of
9068 // the outlining process. So, simply create a new empty block to which we
9069 // uncoditionally branch from where TaskBodyCB left off
9070 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
9071 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
9072 /*IsFinished=*/true);
9073
9074 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
9075 bool NeedsTargetTask = HasNoWait && DeviceID;
9076 if (NeedsTargetTask) {
9077 for (auto *V :
9078 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
9079 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
9080 RTArgs.SizesArray}) {
9082 OffloadingArraysToPrivatize.push_back(V);
9084 }
9085 }
9086 }
9087 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
9088 DeviceID, OffloadingArraysToPrivatize](
9089 Function &OutlinedFn) mutable {
9090 assert(OutlinedFn.hasOneUse() &&
9091 "there must be a single user for the outlined function");
9092
9093 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
9094
9095 // The first argument of StaleCI is always the thread id.
9096 // The next few arguments are the pointers to offloading arrays
9097 // if any. (see OffloadingArraysToPrivatize)
9098 // Finally, all other local values that are live-in into the outlined region
9099 // end up in a structure whose pointer is passed as the last argument. This
9100 // piece of data is passed in the "shared" field of the task structure. So,
9101 // we know we have to pass shareds to the task if the number of arguments is
9102 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
9103 // thread id. Further, for safety, we assert that the number of arguments of
9104 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
9105 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
9106 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
9107 assert((!HasShareds ||
9108 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
9109 "Wrong number of arguments for StaleCI when shareds are present");
9110 int SharedArgOperandNo =
9111 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
9112
9113 StructType *TaskWithPrivatesTy =
9114 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
9115 StructType *PrivatesTy = nullptr;
9116
9117 if (!OffloadingArraysToPrivatize.empty())
9118 PrivatesTy =
9119 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9120
9122 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9123 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9124
9125 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9126 << "\n");
9127
9128 Builder.SetInsertPoint(StaleCI);
9129
9130 // Gather the arguments for emitting the runtime call.
9131 uint32_t SrcLocStrSize;
9132 Constant *SrcLocStr =
9134 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9135
9136 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9137 //
9138 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9139 // the DeviceID to the deferred task and also since
9140 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9141 Function *TaskAllocFn =
9142 !NeedsTargetTask
9143 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9145 OMPRTL___kmpc_omp_target_task_alloc);
9146
9147 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9148 // call.
9149 Value *ThreadID = getOrCreateThreadID(Ident);
9150
9151 // Argument - `sizeof_kmp_task_t` (TaskSize)
9152 // Tasksize refers to the size in bytes of kmp_task_t data structure
9153 // plus any other data to be passed to the target task, if any, which
9154 // is packed into a struct. kmp_task_t and the struct so created are
9155 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9156 Value *TaskSize = Builder.getInt64(
9157 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9158
9159 // Argument - `sizeof_shareds` (SharedsSize)
9160 // SharedsSize refers to the shareds array size in the kmp_task_t data
9161 // structure.
9162 Value *SharedsSize = Builder.getInt64(0);
9163 if (HasShareds) {
9164 auto *ArgStructAlloca =
9165 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9166 assert(ArgStructAlloca &&
9167 "Unable to find the alloca instruction corresponding to arguments "
9168 "for extracted function");
9169 std::optional<TypeSize> ArgAllocSize =
9170 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9171 assert(ArgAllocSize &&
9172 "Unable to determine size of arguments for extracted function");
9173 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9174 }
9175
9176 // Argument - `flags`
9177 // Task is tied iff (Flags & 1) == 1.
9178 // Task is untied iff (Flags & 1) == 0.
9179 // Task is final iff (Flags & 2) == 2.
9180 // Task is not final iff (Flags & 2) == 0.
9181 // A target task is not final and is untied.
9182 Value *Flags = Builder.getInt32(0);
9183
9184 // Emit the @__kmpc_omp_task_alloc runtime call
9185 // The runtime call returns a pointer to an area where the task captured
9186 // variables must be copied before the task is run (TaskData)
9187 CallInst *TaskData = nullptr;
9188
9189 SmallVector<llvm::Value *> TaskAllocArgs = {
9190 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9191 /*flags=*/Flags,
9192 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9193 /*task_func=*/ProxyFn};
9194
9195 if (NeedsTargetTask) {
9196 assert(DeviceID && "Expected non-empty device ID.");
9197 TaskAllocArgs.push_back(DeviceID);
9198 }
9199
9200 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9201
9202 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9203 if (HasShareds) {
9204 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9206 *this, Builder, TaskData, TaskWithPrivatesTy);
9207 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9208 SharedsSize);
9209 }
9210 if (!OffloadingArraysToPrivatize.empty()) {
9211 Value *Privates =
9212 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9213 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9214 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9215 [[maybe_unused]] Type *ArrayType =
9216 getOffloadingArrayType(PtrToPrivatize);
9217 assert(ArrayType && "ArrayType cannot be nullptr");
9218
9219 Type *ElementType = PrivatesTy->getElementType(i);
9220 assert(ElementType == ArrayType &&
9221 "ElementType should match ArrayType");
9222 (void)ArrayType;
9223
9224 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9225 Builder.CreateMemCpy(
9226 Dst, Alignment, PtrToPrivatize, Alignment,
9227 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9228 }
9229 }
9230
9231 Value *DepArray = nullptr;
9232 Value *NumDeps = nullptr;
9233 if (Dependencies.DepArray) {
9234 DepArray = Dependencies.DepArray;
9235 NumDeps = Dependencies.NumDeps;
9236 } else if (!Dependencies.Deps.empty()) {
9237 DepArray = emitTaskDependencies(*this, Dependencies.Deps);
9238 NumDeps = Builder.getInt32(Dependencies.Deps.size());
9239 }
9240
9241 // ---------------------------------------------------------------
9242 // V5.2 13.8 target construct
9243 // If the nowait clause is present, execution of the target task
9244 // may be deferred. If the nowait clause is not present, the target task is
9245 // an included task.
9246 // ---------------------------------------------------------------
9247 // The above means that the lack of a nowait on the target construct
9248 // translates to '#pragma omp task if(0)'
9249 if (!NeedsTargetTask) {
9250 if (DepArray) {
9251 Function *TaskWaitFn =
9252 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9254 TaskWaitFn,
9255 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9256 /*ndeps=*/NumDeps,
9257 /*dep_list=*/DepArray,
9258 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9259 /*noalias_dep_list=*/
9261 }
9262 // Included task.
9263 Function *TaskBeginFn =
9264 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9265 Function *TaskCompleteFn =
9266 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9267 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9268 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9269 CI->setDebugLoc(StaleCI->getDebugLoc());
9270 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9271 } else if (DepArray) {
9272 // HasNoWait - meaning the task may be deferred. Call
9273 // __kmpc_omp_task_with_deps if there are dependencies,
9274 // else call __kmpc_omp_task
9275 Function *TaskFn =
9276 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9278 TaskFn,
9279 {Ident, ThreadID, TaskData, NumDeps, DepArray,
9280 ConstantInt::get(Builder.getInt32Ty(), 0),
9282 } else {
9283 // Emit the @__kmpc_omp_task runtime call to spawn the task
9284 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9285 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9286 }
9287
9288 StaleCI->eraseFromParent();
9289 for (Instruction *I : llvm::reverse(ToBeDeleted))
9290 I->eraseFromParent();
9291 };
9292 addOutlineInfo(std::move(OI));
9293
9294 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9295 << *(Builder.GetInsertBlock()) << "\n");
9296 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9297 << *(Builder.GetInsertBlock()->getParent()->getParent())
9298 << "\n");
9299 return Builder.saveIP();
9300}
9301
9303 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9304 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9305 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9306 bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9307 if (Error Err =
9308 emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9309 CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9310 return Err;
9311 emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9312 return Error::success();
9313}
9314
9315static void
9321 Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9325 const OpenMPIRBuilder::DependenciesInfo &Dependencies,
9326 bool HasNoWait, Value *DynCGroupMem,
9327 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9328 // Generate a function call to the host fallback implementation of the target
9329 // region. This is called by the host when no offload entry was generated for
9330 // the target region and when the offloading call fails at runtime.
9331 auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9333 Builder.restoreIP(IP);
9334 // Ensure the host fallback has the same dyn_ptr ABI as the device.
9335 SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9336 FallbackArgs.push_back(
9337 Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9338 OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9339 return Builder.saveIP();
9340 };
9341
9342 bool HasDependencies = !Dependencies.empty();
9343 bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9344
9346
9347 auto TaskBodyCB =
9348 [&](Value *DeviceID, Value *RTLoc,
9349 IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9350 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9351 // produce any.
9353 // emitKernelLaunch makes the necessary runtime call to offload the
9354 // kernel. We then outline all that code into a separate function
9355 // ('kernel_launch_function' in the pseudo code above). This function is
9356 // then called by the target task proxy function (see
9357 // '@.omp_target_task_proxy_func' in the pseudo code above)
9358 // "@.omp_target_task_proxy_func' is generated by
9359 // emitTargetTaskProxyFunction.
9360 if (OutlinedFnID && DeviceID)
9361 return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9362 EmitTargetCallFallbackCB, KArgs,
9363 DeviceID, RTLoc, TargetTaskAllocaIP);
9364
9365 // We only need to do the outlining if `DeviceID` is set to avoid calling
9366 // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9367 // generating the `else` branch of an `if` clause.
9368 //
9369 // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9370 // In this case, we execute the host implementation directly.
9371 return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9372 }());
9373
9374 OMPBuilder.Builder.restoreIP(AfterIP);
9375 return Error::success();
9376 };
9377
9378 auto &&EmitTargetCallElse =
9379 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9381 // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9382 // produce any.
9384 if (RequiresOuterTargetTask) {
9385 // Arguments that are intended to be directly forwarded to an
9386 // emitKernelLaunch call are pased as nullptr, since
9387 // OutlinedFnID=nullptr results in that call not being done.
9389 return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9390 /*RTLoc=*/nullptr, AllocaIP,
9391 Dependencies, EmptyRTArgs, HasNoWait);
9392 }
9393 return EmitTargetCallFallbackCB(Builder.saveIP());
9394 }());
9395
9396 Builder.restoreIP(AfterIP);
9397 return Error::success();
9398 };
9399
9400 auto &&EmitTargetCallThen =
9401 [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9403 Info.HasNoWait = HasNoWait;
9404 OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9405
9407 if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9408 AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9409 /*IsNonContiguous=*/true,
9410 /*ForEndCall=*/false))
9411 return Err;
9412
9413 SmallVector<Value *, 3> NumTeamsC;
9414 for (auto [DefaultVal, RuntimeVal] :
9415 zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9416 NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9417 : Builder.getInt32(DefaultVal));
9418
9419 // Calculate number of threads: 0 if no clauses specified, otherwise it is
9420 // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9421 auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9422 if (Clause)
9423 Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9424 /*isSigned=*/false);
9425 return Clause;
9426 };
9427 auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9428 if (Clause)
9429 Result =
9430 Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9431 Result, Clause)
9432 : Clause;
9433 };
9434
9435 // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9436 // the NUM_THREADS clause is overriden by THREAD_LIMIT.
9437 SmallVector<Value *, 3> NumThreadsC;
9438 Value *MaxThreadsClause =
9439 RuntimeAttrs.TeamsThreadLimit.size() == 1
9440 ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9441 : nullptr;
9442
9443 for (auto [TeamsVal, TargetVal] : zip_equal(
9444 RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9445 Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9446 Value *NumThreads = InitMaxThreadsClause(TargetVal);
9447
9448 CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9449 CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9450
9451 NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9452 }
9453
9454 unsigned NumTargetItems = Info.NumberOfPtrs;
9455 uint32_t SrcLocStrSize;
9456 Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9457 Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9458 llvm::omp::IdentFlag(0), 0);
9459
9460 Value *TripCount = RuntimeAttrs.LoopTripCount
9461 ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9462 Builder.getInt64Ty(),
9463 /*isSigned=*/false)
9464 : Builder.getInt64(0);
9465
9466 // Request zero groupprivate bytes by default.
9467 if (!DynCGroupMem)
9468 DynCGroupMem = Builder.getInt32(0);
9469
9471 NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9472 HasNoWait, DynCGroupMemFallback);
9473
9474 // Assume no error was returned because TaskBodyCB and
9475 // EmitTargetCallFallbackCB don't produce any.
9477 // The presence of certain clauses on the target directive require the
9478 // explicit generation of the target task.
9479 if (RequiresOuterTargetTask)
9480 return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9481 RTLoc, AllocaIP, Dependencies,
9482 KArgs.RTArgs, Info.HasNoWait);
9483
9484 return OMPBuilder.emitKernelLaunch(
9485 Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9486 RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9487 }());
9488
9489 Builder.restoreIP(AfterIP);
9490 return Error::success();
9491 };
9492
9493 // If we don't have an ID for the target region, it means an offload entry
9494 // wasn't created. In this case we just run the host fallback directly and
9495 // ignore any potential 'if' clauses.
9496 if (!OutlinedFnID) {
9497 cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9498 return;
9499 }
9500
9501 // If there's no 'if' clause, only generate the kernel launch code path.
9502 if (!IfCond) {
9503 cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9504 return;
9505 }
9506
9507 cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9508 EmitTargetCallElse, AllocaIP));
9509}
9510
9512 const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9513 InsertPointTy CodeGenIP, TargetDataInfo &Info,
9514 TargetRegionEntryInfo &EntryInfo,
9515 const TargetKernelDefaultAttrs &DefaultAttrs,
9516 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9517 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9520 CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies,
9521 bool HasNowait, Value *DynCGroupMem,
9522 OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9523
9524 if (!updateToLocation(Loc))
9525 return InsertPointTy();
9526
9527 Builder.restoreIP(CodeGenIP);
9528
9529 Function *OutlinedFn;
9530 Constant *OutlinedFnID = nullptr;
9531 // The target region is outlined into its own function. The LLVM IR for
9532 // the target region itself is generated using the callbacks CBFunc
9533 // and ArgAccessorFuncCB
9535 *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9536 OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9537 return Err;
9538
9539 // If we are not on the target device, then we need to generate code
9540 // to make a remote call (offload) to the previously outlined function
9541 // that represents the target region. Do that now.
9542 if (!Config.isTargetDevice())
9543 emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9544 IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9545 CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9546 DynCGroupMemFallback);
9547 return Builder.saveIP();
9548}
9549
9550std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9551 StringRef FirstSeparator,
9552 StringRef Separator) {
9553 SmallString<128> Buffer;
9554 llvm::raw_svector_ostream OS(Buffer);
9555 StringRef Sep = FirstSeparator;
9556 for (StringRef Part : Parts) {
9557 OS << Sep << Part;
9558 Sep = Separator;
9559 }
9560 return OS.str().str();
9561}
9562
9563std::string
9565 return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9566 Config.separator());
9567}
9568
9570 Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9571 auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9572 if (Elem.second) {
9573 assert(Elem.second->getValueType() == Ty &&
9574 "OMP internal variable has different type than requested");
9575 } else {
9576 // TODO: investigate the appropriate linkage type used for the global
9577 // variable for possibly changing that to internal or private, or maybe
9578 // create different versions of the function for different OMP internal
9579 // variables.
9580 const DataLayout &DL = M.getDataLayout();
9581 // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9582 // default global AS is 1.
9583 // See double-target-call-with-declare-target.f90 and
9584 // declare-target-vars-in-target-region.f90 libomptarget
9585 // tests.
9586 unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9587 : M.getTargetTriple().isAMDGPU()
9588 ? 0
9589 : DL.getDefaultGlobalsAddressSpace();
9590 auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9593 auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9594 Constant::getNullValue(Ty), Elem.first(),
9595 /*InsertBefore=*/nullptr,
9596 GlobalValue::NotThreadLocal, AddressSpaceVal);
9597 const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9598 const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9599 GV->setAlignment(std::max(TypeAlign, PtrAlign));
9600 Elem.second = GV;
9601 }
9602
9603 return Elem.second;
9604}
9605
9606Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9607 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9608 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9609 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9610}
9611
9613 LLVMContext &Ctx = Builder.getContext();
9614 Value *Null =
9615 Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
9616 Value *SizeGep =
9617 Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9618 Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9619 return SizePtrToInt;
9620}
9621
9624 std::string VarName) {
9625 llvm::Constant *MaptypesArrayInit =
9626 llvm::ConstantDataArray::get(M.getContext(), Mappings);
9627 auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9628 M, MaptypesArrayInit->getType(),
9629 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9630 VarName);
9631 MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9632 return MaptypesArrayGlobal;
9633}
9634
9636 InsertPointTy AllocaIP,
9637 unsigned NumOperands,
9638 struct MapperAllocas &MapperAllocas) {
9639 if (!updateToLocation(Loc))
9640 return;
9641
9642 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9643 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9644 Builder.restoreIP(AllocaIP);
9645 AllocaInst *ArgsBase = Builder.CreateAlloca(
9646 ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9647 AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9648 ".offload_ptrs");
9649 AllocaInst *ArgSizes = Builder.CreateAlloca(
9650 ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9652 MapperAllocas.ArgsBase = ArgsBase;
9653 MapperAllocas.Args = Args;
9654 MapperAllocas.ArgSizes = ArgSizes;
9655}
9656
9658 Function *MapperFunc, Value *SrcLocInfo,
9659 Value *MaptypesArg, Value *MapnamesArg,
9661 int64_t DeviceID, unsigned NumOperands) {
9662 if (!updateToLocation(Loc))
9663 return;
9664
9665 auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9666 auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9667 Value *ArgsBaseGEP =
9668 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9669 {Builder.getInt32(0), Builder.getInt32(0)});
9670 Value *ArgsGEP =
9671 Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9672 {Builder.getInt32(0), Builder.getInt32(0)});
9673 Value *ArgSizesGEP =
9674 Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9675 {Builder.getInt32(0), Builder.getInt32(0)});
9676 Value *NullPtr =
9677 Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9678 createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9679 Builder.getInt32(NumOperands),
9680 ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9681 MaptypesArg, MapnamesArg, NullPtr});
9682}
9683
9685 TargetDataRTArgs &RTArgs,
9686 TargetDataInfo &Info,
9687 bool ForEndCall) {
9688 assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9689 "expected region end call to runtime only when end call is separate");
9690 auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9691 auto VoidPtrTy = UnqualPtrTy;
9692 auto VoidPtrPtrTy = UnqualPtrTy;
9693 auto Int64Ty = Type::getInt64Ty(M.getContext());
9694 auto Int64PtrTy = UnqualPtrTy;
9695
9696 if (!Info.NumberOfPtrs) {
9697 RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9698 RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9699 RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9700 RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9701 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9702 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9703 return;
9704 }
9705
9706 RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9707 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9708 Info.RTArgs.BasePointersArray,
9709 /*Idx0=*/0, /*Idx1=*/0);
9710 RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9711 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9712 /*Idx0=*/0,
9713 /*Idx1=*/0);
9714 RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9715 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9716 /*Idx0=*/0, /*Idx1=*/0);
9717 RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9718 ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9719 ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9720 : Info.RTArgs.MapTypesArray,
9721 /*Idx0=*/0,
9722 /*Idx1=*/0);
9723
9724 // Only emit the mapper information arrays if debug information is
9725 // requested.
9726 if (!Info.EmitDebug)
9727 RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9728 else
9729 RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9730 ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9731 /*Idx0=*/0,
9732 /*Idx1=*/0);
9733 // If there is no user-defined mapper, set the mapper array to nullptr to
9734 // avoid an unnecessary data privatization
9735 if (!Info.HasMapper)
9736 RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9737 else
9738 RTArgs.MappersArray =
9739 Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9740}
9741
9743 InsertPointTy CodeGenIP,
9744 MapInfosTy &CombinedInfo,
9745 TargetDataInfo &Info) {
9747 CombinedInfo.NonContigInfo;
9748
9749 // Build an array of struct descriptor_dim and then assign it to
9750 // offload_args.
9751 //
9752 // struct descriptor_dim {
9753 // uint64_t offset;
9754 // uint64_t count;
9755 // uint64_t stride
9756 // };
9757 Type *Int64Ty = Builder.getInt64Ty();
9759 M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9760 "struct.descriptor_dim");
9761
9762 enum { OffsetFD = 0, CountFD, StrideFD };
9763 // We need two index variable here since the size of "Dims" is the same as
9764 // the size of Components, however, the size of offset, count, and stride is
9765 // equal to the size of base declaration that is non-contiguous.
9766 for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9767 // Skip emitting ir if dimension size is 1 since it cannot be
9768 // non-contiguous.
9769 if (NonContigInfo.Dims[I] == 1)
9770 continue;
9771 Builder.restoreIP(AllocaIP);
9772 ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9773 AllocaInst *DimsAddr =
9774 Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9775 Builder.restoreIP(CodeGenIP);
9776 for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9777 unsigned RevIdx = EE - II - 1;
9778 Value *DimsLVal = Builder.CreateInBoundsGEP(
9779 ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
9780 // Offset
9781 Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9782 Builder.CreateAlignedStore(
9783 NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9784 M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9785 // Count
9786 Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9787 Builder.CreateAlignedStore(
9788 NonContigInfo.Counts[L][RevIdx], CountLVal,
9789 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9790 // Stride
9791 Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9792 Builder.CreateAlignedStore(
9793 NonContigInfo.Strides[L][RevIdx], StrideLVal,
9794 M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9795 }
9796 // args[I] = &dims
9797 Builder.restoreIP(CodeGenIP);
9798 Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9799 DimsAddr, Builder.getPtrTy());
9800 Value *P = Builder.CreateConstInBoundsGEP2_32(
9801 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9802 Info.RTArgs.PointersArray, 0, I);
9803 Builder.CreateAlignedStore(
9804 DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9805 ++L;
9806 }
9807}
9808
9809void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9810 Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9811 Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9812 BasicBlock *ExitBB, bool IsInit) {
9813 StringRef Prefix = IsInit ? ".init" : ".del";
9814
9815 // Evaluate if this is an array section.
9817 M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9818 Value *IsArray =
9819 Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9820 Value *DeleteBit = Builder.CreateAnd(
9821 MapType,
9822 Builder.getInt64(
9823 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9824 OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9825 Value *DeleteCond;
9826 Value *Cond;
9827 if (IsInit) {
9828 // base != begin?
9829 Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9830 Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9831 DeleteCond = Builder.CreateIsNull(
9832 DeleteBit,
9833 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9834 } else {
9835 Cond = IsArray;
9836 DeleteCond = Builder.CreateIsNotNull(
9837 DeleteBit,
9838 createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9839 }
9840 Cond = Builder.CreateAnd(Cond, DeleteCond);
9841 Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9842
9843 emitBlock(BodyBB, MapperFn);
9844 // Get the array size by multiplying element size and element number (i.e., \p
9845 // Size).
9846 Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9847 // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9848 // memory allocation/deletion purpose only.
9849 Value *MapTypeArg = Builder.CreateAnd(
9850 MapType,
9851 Builder.getInt64(
9852 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9853 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9854 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9855 MapTypeArg = Builder.CreateOr(
9856 MapTypeArg,
9857 Builder.getInt64(
9858 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9859 OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9860
9861 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9862 // data structure.
9863 Value *OffloadingArgs[] = {MapperHandle, Base, Begin,
9864 ArraySize, MapTypeArg, MapName};
9866 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9867 OffloadingArgs);
9868}
9869
9872 llvm::Value *BeginArg)>
9873 GenMapInfoCB,
9874 Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9875 SmallVector<Type *> Params;
9876 Params.emplace_back(Builder.getPtrTy());
9877 Params.emplace_back(Builder.getPtrTy());
9878 Params.emplace_back(Builder.getPtrTy());
9879 Params.emplace_back(Builder.getInt64Ty());
9880 Params.emplace_back(Builder.getInt64Ty());
9881 Params.emplace_back(Builder.getPtrTy());
9882
9883 auto *FnTy =
9884 FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9885
9886 SmallString<64> TyStr;
9887 raw_svector_ostream Out(TyStr);
9888 Function *MapperFn =
9890 MapperFn->addFnAttr(Attribute::NoInline);
9891 MapperFn->addFnAttr(Attribute::NoUnwind);
9892 MapperFn->addParamAttr(0, Attribute::NoUndef);
9893 MapperFn->addParamAttr(1, Attribute::NoUndef);
9894 MapperFn->addParamAttr(2, Attribute::NoUndef);
9895 MapperFn->addParamAttr(3, Attribute::NoUndef);
9896 MapperFn->addParamAttr(4, Attribute::NoUndef);
9897 MapperFn->addParamAttr(5, Attribute::NoUndef);
9898
9899 // Start the mapper function code generation.
9900 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9901 auto SavedIP = Builder.saveIP();
9902 Builder.SetInsertPoint(EntryBB);
9903
9904 Value *MapperHandle = MapperFn->getArg(0);
9905 Value *BaseIn = MapperFn->getArg(1);
9906 Value *BeginIn = MapperFn->getArg(2);
9907 Value *Size = MapperFn->getArg(3);
9908 Value *MapType = MapperFn->getArg(4);
9909 Value *MapName = MapperFn->getArg(5);
9910
9911 // Compute the starting and end addresses of array elements.
9912 // Prepare common arguments for array initiation and deletion.
9913 // Convert the size in bytes into the number of array elements.
9914 TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9915 Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9916 Value *PtrBegin = BeginIn;
9917 Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9918
9919 // Emit array initiation if this is an array section and \p MapType indicates
9920 // that memory allocation is required.
9921 BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9922 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9923 MapType, MapName, ElementSize, HeadBB,
9924 /*IsInit=*/true);
9925
9926 // Emit a for loop to iterate through SizeArg of elements and map all of them.
9927
9928 // Emit the loop header block.
9929 emitBlock(HeadBB, MapperFn);
9930 BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9931 BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9932 // Evaluate whether the initial condition is satisfied.
9933 Value *IsEmpty =
9934 Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9935 Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9936
9937 // Emit the loop body block.
9938 emitBlock(BodyBB, MapperFn);
9939 BasicBlock *LastBB = BodyBB;
9940 PHINode *PtrPHI =
9941 Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9942 PtrPHI->addIncoming(PtrBegin, HeadBB);
9943
9944 // Get map clause information. Fill up the arrays with all mapped variables.
9945 MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9946 if (!Info)
9947 return Info.takeError();
9948
9949 // Call the runtime API __tgt_mapper_num_components to get the number of
9950 // pre-existing components.
9951 Value *OffloadingArgs[] = {MapperHandle};
9952 Value *PreviousSize = createRuntimeFunctionCall(
9953 getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9954 OffloadingArgs);
9955 Value *ShiftedPreviousSize =
9956 Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9957
9958 // Fill up the runtime mapper handle for all components.
9959 for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9960 Value *CurBaseArg = Info->BasePointers[I];
9961 Value *CurBeginArg = Info->Pointers[I];
9962 Value *CurSizeArg = Info->Sizes[I];
9963 Value *CurNameArg = Info->Names.size()
9964 ? Info->Names[I]
9965 : Constant::getNullValue(Builder.getPtrTy());
9966
9967 // Extract the MEMBER_OF field from the map type.
9968 Value *OriMapType = Builder.getInt64(
9969 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9970 Info->Types[I]));
9971 Value *MemberMapType =
9972 Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9973
9974 // Combine the map type inherited from user-defined mapper with that
9975 // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9976 // bits of the \a MapType, which is the input argument of the mapper
9977 // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9978 // bits of MemberMapType.
9979 // [OpenMP 5.0], 1.2.6. map-type decay.
9980 // | alloc | to | from | tofrom | release | delete
9981 // ----------------------------------------------------------
9982 // alloc | alloc | alloc | alloc | alloc | release | delete
9983 // to | alloc | to | alloc | to | release | delete
9984 // from | alloc | alloc | from | from | release | delete
9985 // tofrom | alloc | to | from | tofrom | release | delete
9986 Value *LeftToFrom = Builder.CreateAnd(
9987 MapType,
9988 Builder.getInt64(
9989 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9990 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9991 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9992 BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9993 BasicBlock *AllocElseBB =
9994 BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9995 BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9996 BasicBlock *ToElseBB =
9997 BasicBlock::Create(M.getContext(), "omp.type.to.else");
9998 BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9999 BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
10000 Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
10001 Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
10002 // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
10003 emitBlock(AllocBB, MapperFn);
10004 Value *AllocMapType = Builder.CreateAnd(
10005 MemberMapType,
10006 Builder.getInt64(
10007 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10008 OpenMPOffloadMappingFlags::OMP_MAP_TO |
10009 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10010 Builder.CreateBr(EndBB);
10011 emitBlock(AllocElseBB, MapperFn);
10012 Value *IsTo = Builder.CreateICmpEQ(
10013 LeftToFrom,
10014 Builder.getInt64(
10015 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10016 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10017 Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
10018 // In case of to, clear OMP_MAP_FROM.
10019 emitBlock(ToBB, MapperFn);
10020 Value *ToMapType = Builder.CreateAnd(
10021 MemberMapType,
10022 Builder.getInt64(
10023 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10024 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10025 Builder.CreateBr(EndBB);
10026 emitBlock(ToElseBB, MapperFn);
10027 Value *IsFrom = Builder.CreateICmpEQ(
10028 LeftToFrom,
10029 Builder.getInt64(
10030 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10031 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
10032 Builder.CreateCondBr(IsFrom, FromBB, EndBB);
10033 // In case of from, clear OMP_MAP_TO.
10034 emitBlock(FromBB, MapperFn);
10035 Value *FromMapType = Builder.CreateAnd(
10036 MemberMapType,
10037 Builder.getInt64(
10038 ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10039 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
10040 // In case of tofrom, do nothing.
10041 emitBlock(EndBB, MapperFn);
10042 LastBB = EndBB;
10043 PHINode *CurMapType =
10044 Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
10045 CurMapType->addIncoming(AllocMapType, AllocBB);
10046 CurMapType->addIncoming(ToMapType, ToBB);
10047 CurMapType->addIncoming(FromMapType, FromBB);
10048 CurMapType->addIncoming(MemberMapType, ToElseBB);
10049
10050 Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
10051 CurSizeArg, CurMapType, CurNameArg};
10052
10053 auto ChildMapperFn = CustomMapperCB(I);
10054 if (!ChildMapperFn)
10055 return ChildMapperFn.takeError();
10056 if (*ChildMapperFn) {
10057 // Call the corresponding mapper function.
10058 createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
10059 ->setDoesNotThrow();
10060 } else {
10061 // Call the runtime API __tgt_push_mapper_component to fill up the runtime
10062 // data structure.
10064 getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
10065 OffloadingArgs);
10066 }
10067 }
10068
10069 // Update the pointer to point to the next element that needs to be mapped,
10070 // and check whether we have mapped all elements.
10071 Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
10072 "omp.arraymap.next");
10073 PtrPHI->addIncoming(PtrNext, LastBB);
10074 Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
10075 BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
10076 Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
10077
10078 emitBlock(ExitBB, MapperFn);
10079 // Emit array deletion if this is an array section and \p MapType indicates
10080 // that deletion is required.
10081 emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
10082 MapType, MapName, ElementSize, DoneBB,
10083 /*IsInit=*/false);
10084
10085 // Emit the function exit block.
10086 emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
10087
10088 Builder.CreateRetVoid();
10089 Builder.restoreIP(SavedIP);
10090 return MapperFn;
10091}
10092
// Emit the offloading argument arrays consumed by the target runtime calls:
// .offload_baseptrs/.offload_ptrs/.offload_mappers allocas (placed at
// AllocaIP) plus constant globals for the map types, compile-time-known
// sizes and (optional) map names, then fill the per-entry slots at
// CodeGenIP. Returns an Error propagated from the custom-mapper callback,
// Error::success() otherwise.
// NOTE(review): the function's return-type/name line (doxygen line 10093)
// is missing from this excerpt; confirm the signature against OMPIRBuilder.h.
10094 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
10095 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
10096 bool IsNonContiguous,
10097 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
10098
10099 // Reset the array information.
10100 Info.clearArrayInfo();
10101 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
10102
// Nothing to emit for an empty map clause list.
10103 if (Info.NumberOfPtrs == 0)
10104 return Error::success();
10105
10106 Builder.restoreIP(AllocaIP);
10107 // Detect if we have any capture size requiring runtime evaluation of the
10108 // size so that a constant array could be eventually used.
10109 ArrayType *PointerArrayType =
10110 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
10111
10112 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
10113 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
10114
10115 Info.RTArgs.PointersArray = Builder.CreateAlloca(
10116 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
10117 AllocaInst *MappersArray = Builder.CreateAlloca(
10118 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
10119 Info.RTArgs.MappersArray = MappersArray;
10120
10121 // If we don't have any VLA types or other types that require runtime
10122 // evaluation, we can use a constant array for the map sizes, otherwise we
10123 // need to fill up the arrays as we do for the pointers.
10124 Type *Int64Ty = Builder.getInt64Ty();
10125 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
10126 ConstantInt::get(Int64Ty, 0));
// RuntimeSizes marks the entries whose byte size is only known at runtime.
10127 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10128 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10129 bool IsNonContigEntry =
10130 IsNonContiguous &&
10131 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10132 CombinedInfo.Types[I] &
10133 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10134 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10135 // descriptor_dim records), not the byte size.
10136 if (IsNonContigEntry) {
10137 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10138 "Index must be in-bounds for NON_CONTIG Dims array");
10139 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10140 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10141 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10142 continue;
10143 }
// Plain constants (but not relocatable ConstantExprs/globals) can live in
// the constant sizes array; everything else must be stored at runtime.
10144 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10145 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10146 ConstSizes[I] = CI;
10147 continue;
10148 }
10149 }
10150 RuntimeSizes.set(I);
10151 }
10152
// Choose the sizes-array storage strategy: all-runtime -> plain alloca
// filled later; all-constant -> a private constant global used directly;
// mixed -> a constant global memcpy'd into a stack buffer whose runtime
// entries are overwritten below.
10153 if (RuntimeSizes.all()) {
10154 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10155 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10156 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10157 restoreIPandDebugLoc(Builder, CodeGenIP);
10158 } else {
10159 auto *SizesArrayInit = ConstantArray::get(
10160 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10161 std::string Name = createPlatformSpecificName({"offload_sizes"});
10162 auto *SizesArrayGbl =
10163 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10164 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10165 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10166
10167 if (!RuntimeSizes.any()) {
10168 Info.RTArgs.SizesArray = SizesArrayGbl;
10169 } else {
10170 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10171 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10172 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10173 AllocaInst *Buffer = Builder.CreateAlloca(
10174 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10175 Buffer->setAlignment(OffloadSizeAlign);
10176 restoreIPandDebugLoc(Builder, CodeGenIP);
// Seed the stack buffer with the constant sizes; runtime-sized entries are
// overwritten in the per-entry loop below.
10177 Builder.CreateMemCpy(
10178 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10179 SizesArrayGbl, OffloadSizeAlign,
10180 Builder.getIntN(
10181 IndexSize,
10182 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10183
10184 Info.RTArgs.SizesArray = Buffer;
10185 }
10186 restoreIPandDebugLoc(Builder, CodeGenIP);
10187 }
10188
10189 // The map types are always constant so we don't need to generate code to
10190 // fill arrays. Instead, we create an array constant.
// NOTE(review): the declaration of `Mapping` (doxygen line 10191) is on a
// dropped line in this excerpt; it is the container populated below and
// reused for the separate end-of-region map-type array.
10192 for (auto mapFlag : CombinedInfo.Types)
10193 Mapping.push_back(
10194 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10195 mapFlag));
10196 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10197 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10198 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10199
10200 // The information types are only built if provided.
10201 if (!CombinedInfo.Names.empty()) {
10202 auto *MapNamesArrayGbl = createOffloadMapnames(
10203 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10204 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10205 Info.EmitDebug = true;
10206 } else {
10207 Info.RTArgs.MapNamesArray =
// NOTE(review): the null-initializer expression for MapNamesArray (doxygen
// line 10208) is on a dropped line in this excerpt.
10209 Info.EmitDebug = false;
10210 }
10211
10212 // If there's a present map type modifier, it must not be applied to the end
10213 // of a region, so generate a separate map type array in that case.
10214 if (Info.separateBeginEndCalls()) {
10215 bool EndMapTypesDiffer = false;
10216 for (uint64_t &Type : Mapping) {
10217 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10218 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10219 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10220 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10221 EndMapTypesDiffer = true;
10222 }
10223 }
10224 if (EndMapTypesDiffer) {
10225 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10226 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10227 }
10228 }
10229
// Populate the per-entry slots: base pointer, pointer, runtime-evaluated
// size (when flagged above), and the custom mapper function (null when no
// mapper is attached to the entry).
10230 PointerType *PtrTy = Builder.getPtrTy();
10231 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10232 Value *BPVal = CombinedInfo.BasePointers[I];
10233 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10234 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10235 0, I);
10236 Builder.CreateAlignedStore(BPVal, BP,
10237 M.getDataLayout().getPrefTypeAlign(PtrTy));
10238
// Record where device pointers/addresses can be retrieved after the
// runtime call, and notify the caller through DeviceAddrCB.
10239 if (Info.requiresDevicePointerInfo()) {
10240 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10241 CodeGenIP = Builder.saveIP();
10242 Builder.restoreIP(AllocaIP);
10243 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10244 Builder.restoreIP(CodeGenIP);
10245 if (DeviceAddrCB)
10246 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10247 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10248 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10249 if (DeviceAddrCB)
10250 DeviceAddrCB(I, BP);
10251 }
10252 }
10253
10254 Value *PVal = CombinedInfo.Pointers[I];
10255 Value *P = Builder.CreateConstInBoundsGEP2_32(
10256 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10257 I);
10258 // TODO: Check alignment correct.
10259 Builder.CreateAlignedStore(PVal, P,
10260 M.getDataLayout().getPrefTypeAlign(PtrTy));
10261
10262 if (RuntimeSizes.test(I)) {
10263 Value *S = Builder.CreateConstInBoundsGEP2_32(
10264 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10265 /*Idx0=*/0,
10266 /*Idx1=*/I);
10267 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10268 Int64Ty,
10269 /*isSigned=*/true),
10270 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10271 }
10272 // Fill up the mapper array.
10273 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10274 Value *MFunc = ConstantPointerNull::get(PtrTy);
10275
10276 auto CustomMFunc = CustomMapperCB(I);
10277 if (!CustomMFunc)
10278 return CustomMFunc.takeError();
10279 if (*CustomMFunc)
10280 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10281
10282 Value *MAddr = Builder.CreateInBoundsGEP(
10283 PointerArrayType, MappersArray,
10284 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10285 Builder.CreateAlignedStore(
10286 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10287 }
10288
// Non-contiguous maps additionally need the descriptor_dim records.
10289 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10290 Info.NumberOfPtrs == 0)
10291 return Error::success();
10292 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10293 return Error::success();
10294}
10295
// Terminate the current block with a fall-through branch into the target
// block (only when the builder has an insertion point and the block is not
// already terminated), then clear the insertion point so callers must
// reposition the builder explicitly (typically via emitBlock()).
// NOTE(review): the signature line (doxygen line 10296) is missing from this
// excerpt; confirm the parameter list against OMPIRBuilder.h.
10297 BasicBlock *CurBB = Builder.GetInsertBlock();
10298
10299 if (!CurBB || CurBB->hasTerminator()) {
10300 // If there is no insert point or the previous block is already
10301 // terminated, don't touch it.
10302 } else {
10303 // Otherwise, create a fall-through branch.
10304 Builder.CreateBr(Target);
10305 }
10306
10307 Builder.ClearInsertionPoint();
10308}
10309
// Append \p BB to the current function and make it the builder's insertion
// point. First falls out of the current block via emitBranch(BB); if the
// block is marked finished and ended up with no uses, it is erased instead
// of emitting dead IR. The block is placed immediately after the current
// block when possible, otherwise at the end of the function.
// NOTE(review): the first signature line (doxygen line 10310) is missing
// from this excerpt.
10311 bool IsFinished) {
10312 BasicBlock *CurBB = Builder.GetInsertBlock();
10313
10314 // Fall out of the current block (if necessary).
10315 emitBranch(BB);
10316
// A finished, unreferenced block would be dead code -- drop it.
10317 if (IsFinished && BB->use_empty()) {
10318 BB->eraseFromParent();
10319 return;
10320 }
10321
10322 // Place the block after the current block, if possible, or else at
10323 // the end of the function.
10324 if (CurBB && CurBB->getParent())
10325 CurFn->insert(std::next(CurBB->getIterator()), BB);
10326 else
10327 CurFn->insert(CurFn->end(), BB);
10328 Builder.SetInsertPoint(BB);
10329}
10330
// Emit an if/else region guarded by \p Cond, invoking ThenGen/ElseGen to
// populate the two arms. When Cond is a ConstantInt the dead arm is elided
// entirely and only the live callback runs at the current insertion point.
// Errors from either callback are propagated to the caller.
// NOTE(review): the first signature line (return type, name, Cond/ThenGen
// parameters; doxygen line 10331) is missing from this excerpt.
10332 BodyGenCallbackTy ElseGen,
10333 InsertPointTy AllocaIP) {
10334 // If the condition constant folds and can be elided, try to avoid emitting
10335 // the condition and the dead arm of the if/else.
10336 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10337 auto CondConstant = CI->getSExtValue();
10338 if (CondConstant)
10339 return ThenGen(AllocaIP, Builder.saveIP());
10340
10341 return ElseGen(AllocaIP, Builder.saveIP());
10342 }
10343
10344 Function *CurFn = Builder.GetInsertBlock()->getParent();
10345
10346 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10347 // emit the conditional branch.
10348 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10349 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10350 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10351 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10352 // Emit the 'then' code.
10353 emitBlock(ThenBlock, CurFn);
10354 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10355 return Err;
10356 emitBranch(ContBlock);
10357 // Emit the 'else' code if present.
10358 // There is no need to emit line number for unconditional branch.
10359 emitBlock(ElseBlock, CurFn);
10360 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10361 return Err;
10362 // There is no need to emit line number for unconditional branch.
10363 emitBranch(ContBlock);
10364 // Emit the continuation block for code after the if.
10365 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10366 return Error::success();
10367}
10368
// Decide whether an implicit flush must be emitted after an atomic construct
// of kind \p AK with memory ordering \p AO, emit it via emitFlush(Loc) when
// required, and return whether a flush was emitted. The intended flush
// ordering is recorded in FlushAO but not yet passed to the runtime (see the
// TODO below).
// NOTE(review): several lines are missing from this excerpt (the assert
// condition, the FlushAO declaration, and the ordering predicates guarding
// each case); confirm the exact checks against the upstream source before
// relying on the control flow shown here.
10369bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10370 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10373 "Unexpected Atomic Ordering.");
10374
10375 bool Flush = false;
10377
10378 switch (AK) {
10379 case Read:
10382 FlushAO = AtomicOrdering::Acquire;
10383 Flush = true;
10384 }
10385 break;
10386 case Write:
10387 case Compare:
10388 case Update:
10391 FlushAO = AtomicOrdering::Release;
10392 Flush = true;
10393 }
10394 break;
10395 case Capture:
10396 switch (AO) {
10398 FlushAO = AtomicOrdering::Acquire;
10399 Flush = true;
10400 break;
10402 FlushAO = AtomicOrdering::Release;
10403 Flush = true;
10404 break;
10408 Flush = true;
10409 break;
10410 default:
10411 // do nothing - leave silently.
10412 break;
10413 }
10414 }
10415
10416 if (Flush) {
10417 // Currently Flush RT call still doesn't take memory_ordering, so for when
10418 // that happens, this tries to do the resolution of which atomic ordering
10419 // to use with but issue the flush call
10420 // TODO: pass `FlushAO` after memory ordering support is added
10421 (void)FlushAO;
10422 emitFlush(Loc);
10423 }
10424
10425 // for AO == AtomicOrdering::Monotonic and all other case combinations
10426 // do nothing
10427 return Flush;
10428}
10429
// Emit `#pragma omp atomic read`: atomically load *X.Var with ordering AO
// and store the result into V.Var. Integer element types load directly with
// an atomic LoadInst; struct types go through the AtomicInfo libcall helper;
// float/pointer types are loaded as a same-width integer and cast back.
// Finishes with a conditional flush per checkAndEmitFlushAfterAtomic.
// NOTE(review): the leading signature lines (return type, name, Loc/X/V
// parameters; doxygen lines 10430-10432) are missing from this excerpt.
10433 AtomicOrdering AO, InsertPointTy AllocaIP) {
10434 if (!updateToLocation(Loc))
10435 return Loc.IP;
10436
10437 assert(X.Var->getType()->isPointerTy() &&
10438 "OMP Atomic expects a pointer to target memory");
10439 Type *XElemTy = X.ElemTy;
10440 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10441 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10442 "OMP atomic read expected a scalar type");
10443
10444 Value *XRead = nullptr;
10445
10446 if (XElemTy->isIntegerTy()) {
10447 LoadInst *XLD =
10448 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10449 XLD->setAtomic(AO);
10450 XRead = cast<Value>(XLD);
10451 } else if (XElemTy->isStructTy()) {
10452 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10453 // target does not support `atomicrmw` of the size of the struct
10454 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10455 OldVal->setAtomic(AO);
10456 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10457 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10458 OpenMPIRBuilder::AtomicInfo atomicInfo(
10459 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10460 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10461 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10462 XRead = AtomicLoadRes.first;
// The plain load only served to query layout/alignment; drop it now that
// the libcall result is available.
10463 OldVal->eraseFromParent();
10464 } else {
10465 // We need to perform atomic op as integer
10466 IntegerType *IntCastTy =
10467 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10468 LoadInst *XLoad =
10469 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10470 XLoad->setAtomic(AO);
10471 if (XElemTy->isFloatingPointTy()) {
10472 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10473 } else {
10474 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10475 }
10476 }
10477 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10478 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10479 return Builder.saveIP();
10480}
10481
// Emit `#pragma omp atomic write`: atomically store \p Expr into *X.Var with
// ordering AO. Integer types store directly with an atomic StoreInst; struct
// types use the AtomicInfo libcall helper; float/pointer types are bitcast
// to a same-width integer before the atomic store. Finishes with a
// conditional flush per checkAndEmitFlushAfterAtomic.
// NOTE(review): the leading signature lines (return type, name, Loc
// parameter; doxygen lines 10482-10483) are missing from this excerpt.
10484 AtomicOpValue &X, Value *Expr,
10485 AtomicOrdering AO, InsertPointTy AllocaIP) {
10486 if (!updateToLocation(Loc))
10487 return Loc.IP;
10488
10489 assert(X.Var->getType()->isPointerTy() &&
10490 "OMP Atomic expects a pointer to target memory");
10491 Type *XElemTy = X.ElemTy;
10492 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10493 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10494 "OMP atomic write expected a scalar type");
10495
10496 if (XElemTy->isIntegerTy()) {
10497 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10498 XSt->setAtomic(AO);
10499 } else if (XElemTy->isStructTy()) {
// The plain load exists only to obtain layout/alignment for the libcall
// helper; it is erased once the libcall store has been emitted.
10500 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10501 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10502 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10503 OpenMPIRBuilder::AtomicInfo atomicInfo(
10504 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10505 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10506 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10507 OldVal->eraseFromParent();
10508 } else {
10509 // We need to bitcast and perform atomic op as integers
10510 IntegerType *IntCastTy =
10511 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10512 Value *ExprCast =
10513 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10514 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10515 XSt->setAtomic(AO);
10516 }
10517
10518 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10519 return Builder.saveIP();
10520}
10521
// Emit `#pragma omp atomic update`: delegates the actual read-modify-write
// to emitAtomicUpdate() and then emits any flush required by the ordering.
// Debug-only checks assert X is a pointer to a supported scalar/struct type
// and reject min/max operations.
// NOTE(review): the leading signature lines (return type, name, Loc/AllocaIP
// and X parameters; doxygen lines 10522-10523) are missing from this
// excerpt.
10524 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10525 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10526 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10527 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10528 if (!updateToLocation(Loc))
10529 return Loc.IP;
10530
10531 LLVM_DEBUG({
10532 Type *XTy = X.Var->getType();
10533 assert(XTy->isPointerTy() &&
10534 "OMP Atomic expects a pointer to target memory");
10535 Type *XElemTy = X.ElemTy;
10536 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10537 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10538 "OMP atomic update expected a scalar or struct type");
10539 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10540 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10541 "OpenMP atomic does not support LT or GT operations");
10542 });
10543
10544 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10545 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10546 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10547 if (!AtomicResult)
10548 return AtomicResult.takeError();
10549 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10550 return Builder.saveIP();
10551}
10552
10553// FIXME: Duplicating AtomicExpand
// Recompute the result of an atomicrmw-style binary operation as ordinary
// (non-atomic) instructions on Src1/Src2; used to materialize the "new"
// value that capture clauses need after an `atomicrmw` was emitted.
// NOTE(review): several case labels (doxygen lines 10563, 10569-10586) are
// missing from this excerpt; the CreateNeg(CreateAnd(...)) arm presumably
// belongs to the Nand case -- confirm against the upstream source.
10554Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10555 AtomicRMWInst::BinOp RMWOp) {
10556 switch (RMWOp) {
10557 case AtomicRMWInst::Add:
10558 return Builder.CreateAdd(Src1, Src2);
10559 case AtomicRMWInst::Sub:
10560 return Builder.CreateSub(Src1, Src2);
10561 case AtomicRMWInst::And:
10562 return Builder.CreateAnd(Src1, Src2);
10564 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10565 case AtomicRMWInst::Or:
10566 return Builder.CreateOr(Src1, Src2);
10567 case AtomicRMWInst::Xor:
10568 return Builder.CreateXor(Src1, Src2);
10573 case AtomicRMWInst::Max:
10574 case AtomicRMWInst::Min:
10587 llvm_unreachable("Unsupported atomic update operation");
10588 }
10589 llvm_unreachable("Unsupported atomic update operation");
10590}
10591
// Core helper shared by atomic update/capture: returns the pair {old value,
// updated value} of *X after applying RMWOp (or the UpdateOp callback) with
// ordering AO. Three code paths:
//  * integer ops expressible as a single `atomicrmw` -> emit it directly,
//    attaching AMDGPU memory-scope metadata when targeting AMDGPU;
//  * struct element types -> libcall-based atomic load plus a
//    compare-exchange retry loop via the AtomicInfo helper;
//  * other scalar types -> reinterpret as a same-width integer and run a
//    `cmpxchg` retry loop, invoking UpdateOp on each iteration.
// NOTE(review): several lines are missing from this excerpt (the AO/RMWOp
// parameter line 10594, two case labels 10602/10605, the failure-ordering
// computations 10670-71 and 10731-32, and dyn_cast lines 10682/10746);
// confirm against the upstream source before editing this function.
10592Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10593 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10595 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10596 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10597 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2.
// Decide whether the operation can be lowered to a single `atomicrmw`
// instruction (only integer types; Sub additionally requires x on the LHS).
10598 bool emitRMWOp = false;
10599 switch (RMWOp) {
10600 case AtomicRMWInst::Add:
10601 case AtomicRMWInst::And:
10603 case AtomicRMWInst::Or:
10604 case AtomicRMWInst::Xor:
10606 emitRMWOp = XElemTy;
10607 break;
10608 case AtomicRMWInst::Sub:
10609 emitRMWOp = (IsXBinopExpr && XElemTy);
10610 break;
10611 default:
10612 emitRMWOp = false;
10613 }
10614 emitRMWOp &= XElemTy->isIntegerTy();
10615
10616 std::pair<Value *, Value *> Res;
10617 if (emitRMWOp) {
10618 AtomicRMWInst *RMWInst =
10619 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10620 if (T.isAMDGPU()) {
10621 if (IsIgnoreDenormalMode)
10622 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10623 llvm::MDNode::get(Builder.getContext(), {}));
10624 if (!IsFineGrainedMemory)
10625 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10626 llvm::MDNode::get(Builder.getContext(), {}));
10627 if (!IsRemoteMemory)
10628 RMWInst->setMetadata("amdgpu.no.remote.memory",
10629 llvm::MDNode::get(Builder.getContext(), {}));
10630 }
10631 Res.first = RMWInst;
10632 // not needed except in case of postfix captures. Generate anyway for
10633 // consistency with the else part. Will be removed with any DCE pass.
10634 // AtomicRMWInst::Xchg does not have a corresponding instruction.
10635 if (RMWOp == AtomicRMWInst::Xchg)
10636 Res.second = Res.first;
10637 else
10638 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10639 } else if (XElemTy->isStructTy()) {
// Struct path: libcall atomic load, then loop { UpdateOp; libcall
// compare-exchange } until the exchange succeeds.
10640 LoadInst *OldVal =
10641 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10642 OldVal->setAtomic(AO);
10643 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10644 unsigned LoadSize = LoadDL.getTypeStoreSize(XElemTy);
10645
10646 OpenMPIRBuilder::AtomicInfo atomicInfo(
10647 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10648 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10649 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10650 BasicBlock *CurBB = Builder.GetInsertBlock();
10651 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10652 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10653 BasicBlock *ExitBB =
10654 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10655 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10656 X->getName() + ".atomic.cont");
10657 ContBB->getTerminator()->eraseFromParent();
10658 Builder.restoreIP(AllocaIP);
10659 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10660 NewAtomicAddr->setName(X->getName() + "x.new.val");
10661 Builder.SetInsertPoint(ContBB);
10662 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10663 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10664 Value *OldExprVal = PHI;
10665 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10666 if (!CBResult)
10667 return CBResult.takeError();
10668 Value *Upd = *CBResult;
10669 Builder.CreateStore(Upd, NewAtomicAddr);
10672 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10673 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10674 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10675 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10676 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10677 OldVal->eraseFromParent();
10678 Res.first = OldExprVal;
10679 Res.second = Upd;
10680
10681 if (UnreachableInst *ExitTI =
10683 CurBBTI->eraseFromParent();
10684 Builder.SetInsertPoint(ExitBB);
10685 } else {
10686 Builder.SetInsertPoint(ExitTI);
10687 }
10688 } else {
// Scalar fallback: load as a same-width integer and retry a `cmpxchg`
// loop, casting back to the element type before each UpdateOp call.
10689 IntegerType *IntCastTy =
10690 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10691 LoadInst *OldVal =
10692 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10693 OldVal->setAtomic(AO);
10694 // CurBB
10695 // | /---\
10696 // ContBB |
10697 // | \---/
10698 // ExitBB
10699 BasicBlock *CurBB = Builder.GetInsertBlock();
10700 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10701 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10702 BasicBlock *ExitBB =
10703 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10704 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10705 X->getName() + ".atomic.cont");
10706 ContBB->getTerminator()->eraseFromParent();
10707 Builder.restoreIP(AllocaIP);
10708 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10709 NewAtomicAddr->setName(X->getName() + "x.new.val");
10710 Builder.SetInsertPoint(ContBB);
10711 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10712 PHI->addIncoming(OldVal, CurBB);
10713 bool IsIntTy = XElemTy->isIntegerTy();
10714 Value *OldExprVal = PHI;
10715 if (!IsIntTy) {
10716 if (XElemTy->isFloatingPointTy()) {
10717 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10718 X->getName() + ".atomic.fltCast");
10719 } else {
10720 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10721 X->getName() + ".atomic.ptrCast");
10722 }
10723 }
10724
10725 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10726 if (!CBResult)
10727 return CBResult.takeError();
10728 Value *Upd = *CBResult;
10729 Builder.CreateStore(Upd, NewAtomicAddr);
10730 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10733 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10734 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10735 Result->setVolatile(VolatileX);
10736 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10737 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10738 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10739 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10740
10741 Res.first = OldExprVal;
10742 Res.second = Upd;
10743
10744 // set Insertion point in exit block
10745 if (UnreachableInst *ExitTI =
10747 CurBBTI->eraseFromParent();
10748 Builder.SetInsertPoint(ExitBB);
10749 } else {
10750 Builder.SetInsertPoint(ExitTI);
10751 }
10752 }
10753
10754 return Res;
10755}
10756
// Emit `#pragma omp atomic capture`: performs the read-modify-write via
// emitAtomicUpdate() -- degrading to an Xchg when UpdateExpr says the new
// value does not depend on x -- and stores either the old value (postfix
// capture) or the updated value into V.Var, followed by any required flush.
// NOTE(review): the leading signature lines (return type, name,
// Loc/AllocaIP/X parameters; doxygen lines 10757-10758) are missing from
// this excerpt.
10759 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10760 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10761 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10762 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10763 if (!updateToLocation(Loc))
10764 return Loc.IP;
10765
10766 LLVM_DEBUG({
10767 Type *XTy = X.Var->getType();
10768 assert(XTy->isPointerTy() &&
10769 "OMP Atomic expects a pointer to target memory");
10770 Type *XElemTy = X.ElemTy;
10771 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10772 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10773 "OMP atomic capture expected a scalar or struct type");
10774 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10775 "OpenMP atomic does not support LT or GT operations");
10776 });
10777
10778 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10779 // 'x' is simply atomically rewritten with 'expr'.
10780 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10781 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10782 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10783 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10784 if (!AtomicResult)
10785 return AtomicResult.takeError();
// Postfix capture (`v = x; x op= expr`) keeps the old value; prefix capture
// keeps the updated one.
10786 Value *CapturedVal =
10787 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10788 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10789
10790 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10791 return Builder.saveIP();
10792}
10793
10797 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10798 bool IsFailOnly) {
10799
10801 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10802 IsPostfixUpdate, IsFailOnly, Failure);
10803}
10804
10808 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10809 bool IsFailOnly, AtomicOrdering Failure) {
10810
10811 if (!updateToLocation(Loc))
10812 return Loc.IP;
10813
10814 assert(X.Var->getType()->isPointerTy() &&
10815 "OMP atomic expects a pointer to target memory");
10816 // compare capture
10817 if (V.Var) {
10818 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10819 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10820 }
10821
10822 bool IsInteger = E->getType()->isIntegerTy();
10823
10824 if (Op == OMPAtomicCompareOp::EQ) {
10825 AtomicCmpXchgInst *Result = nullptr;
10826 if (!IsInteger) {
10827 IntegerType *IntCastTy =
10828 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10829 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10830 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10831 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10832 AO, Failure);
10833 } else {
10834 Result =
10835 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10836 }
10837
10838 if (V.Var) {
10839 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10840 if (!IsInteger)
10841 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10842 assert(OldValue->getType() == V.ElemTy &&
10843 "OldValue and V must be of same type");
10844 if (IsPostfixUpdate) {
10845 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10846 } else {
10847 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10848 if (IsFailOnly) {
10849 // CurBB----
10850 // | |
10851 // v |
10852 // ContBB |
10853 // | |
10854 // v |
10855 // ExitBB <-
10856 //
10857 // where ContBB only contains the store of old value to 'v'.
10858 BasicBlock *CurBB = Builder.GetInsertBlock();
10859 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10860 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10861 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10862 CurBBTI, X.Var->getName() + ".atomic.exit");
10863 BasicBlock *ContBB = CurBB->splitBasicBlock(
10864 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10865 ContBB->getTerminator()->eraseFromParent();
10866 CurBB->getTerminator()->eraseFromParent();
10867
10868 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10869
10870 Builder.SetInsertPoint(ContBB);
10871 Builder.CreateStore(OldValue, V.Var);
10872 Builder.CreateBr(ExitBB);
10873
10874 if (UnreachableInst *ExitTI =
10876 CurBBTI->eraseFromParent();
10877 Builder.SetInsertPoint(ExitBB);
10878 } else {
10879 Builder.SetInsertPoint(ExitTI);
10880 }
10881 } else {
10882 Value *CapturedValue =
10883 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10884 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10885 }
10886 }
10887 }
10888 // The comparison result has to be stored.
10889 if (R.Var) {
10890 assert(R.Var->getType()->isPointerTy() &&
10891 "r.var must be of pointer type");
10892 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10893
10894 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10895 Value *ResultCast = R.IsSigned
10896 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10897 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10898 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10899 }
10900 } else {
10901 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10902 "Op should be either max or min at this point");
10903 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10904
10905 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10906 // Let's take max as example.
10907 // OpenMP form:
10908 // x = x > expr ? expr : x;
10909 // LLVM form:
10910 // *ptr = *ptr > val ? *ptr : val;
10911 // We need to transform to LLVM form.
10912 // x = x <= expr ? x : expr;
10914 if (IsXBinopExpr) {
10915 if (IsInteger) {
10916 if (X.IsSigned)
10917 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10919 else
10920 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10922 } else {
10923 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10925 }
10926 } else {
10927 if (IsInteger) {
10928 if (X.IsSigned)
10929 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10931 else
10932 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10934 } else {
10935 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10937 }
10938 }
10939
10940 AtomicRMWInst *OldValue =
10941 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10942 if (V.Var) {
10943 Value *CapturedValue = nullptr;
10944 if (IsPostfixUpdate) {
10945 CapturedValue = OldValue;
10946 } else {
10947 CmpInst::Predicate Pred;
10948 switch (NewOp) {
10949 case AtomicRMWInst::Max:
10950 Pred = CmpInst::ICMP_SGT;
10951 break;
10953 Pred = CmpInst::ICMP_UGT;
10954 break;
10956 Pred = CmpInst::FCMP_OGT;
10957 break;
10958 case AtomicRMWInst::Min:
10959 Pred = CmpInst::ICMP_SLT;
10960 break;
10962 Pred = CmpInst::ICMP_ULT;
10963 break;
10965 Pred = CmpInst::FCMP_OLT;
10966 break;
10967 default:
10968 llvm_unreachable("unexpected comparison op");
10969 }
10970 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10971 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10972 }
10973 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10974 }
10975 }
10976
10977 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10978
10979 return Builder.saveIP();
10980}
10981
10984 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10985 Value *NumTeamsUpper, Value *ThreadLimit,
10986 Value *IfExpr) {
10987 if (!updateToLocation(Loc))
10988 return InsertPointTy();
10989
10990 uint32_t SrcLocStrSize;
10991 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10992 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10993 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10994
10995 // Outer allocation basicblock is the entry block of the current function.
10996 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10997 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10998 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10999 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11000 }
11001
11002 // The current basic block is split into four basic blocks. After outlining,
11003 // they will be mapped as follows:
11004 // ```
11005 // def current_fn() {
11006 // current_basic_block:
11007 // br label %teams.exit
11008 // teams.exit:
11009 // ; instructions after teams
11010 // }
11011 //
11012 // def outlined_fn() {
11013 // teams.alloca:
11014 // br label %teams.body
11015 // teams.body:
11016 // ; instructions within teams body
11017 // }
11018 // ```
11019 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
11020 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
11021 BasicBlock *AllocaBB =
11022 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
11023
11024 bool SubClausesPresent =
11025 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
11026 // Push num_teams
11027 if (!Config.isTargetDevice() && SubClausesPresent) {
11028 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
11029 "if lowerbound is non-null, then upperbound must also be non-null "
11030 "for bounds on num_teams");
11031
11032 if (NumTeamsUpper == nullptr)
11033 NumTeamsUpper = Builder.getInt32(0);
11034
11035 if (NumTeamsLower == nullptr)
11036 NumTeamsLower = NumTeamsUpper;
11037
11038 if (IfExpr) {
11039 assert(IfExpr->getType()->isIntegerTy() &&
11040 "argument to if clause must be an integer value");
11041
11042 // upper = ifexpr ? upper : 1
11043 if (IfExpr->getType() != Int1)
11044 IfExpr = Builder.CreateICmpNE(IfExpr,
11045 ConstantInt::get(IfExpr->getType(), 0));
11046 NumTeamsUpper = Builder.CreateSelect(
11047 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
11048
11049 // lower = ifexpr ? lower : 1
11050 NumTeamsLower = Builder.CreateSelect(
11051 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
11052 }
11053
11054 if (ThreadLimit == nullptr)
11055 ThreadLimit = Builder.getInt32(0);
11056
11057 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
11058 // truncate or sign extend the passed values to match the int32 parameters.
11059 Value *NumTeamsLowerInt32 =
11060 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
11061 Value *NumTeamsUpperInt32 =
11062 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
11063 Value *ThreadLimitInt32 =
11064 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
11065
11066 Value *ThreadNum = getOrCreateThreadID(Ident);
11067
11069 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
11070 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
11071 ThreadLimitInt32});
11072 }
11073 // Generate the body of teams.
11074 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11075 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11076 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11077 return Err;
11078
11079 OutlineInfo OI;
11080 OI.EntryBB = AllocaBB;
11081 OI.ExitBB = ExitBB;
11082 OI.OuterAllocaBB = &OuterAllocaBB;
11083
11084 // Insert fake values for global tid and bound tid.
11086 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
11088 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
11090 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
11091
11092 auto HostPostOutlineCB = [this, Ident,
11093 ToBeDeleted](Function &OutlinedFn) mutable {
11094 // The stale call instruction will be replaced with a new call instruction
11095 // for runtime call with the outlined function.
11096
11097 assert(OutlinedFn.hasOneUse() &&
11098 "there must be a single user for the outlined function");
11099 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
11100 ToBeDeleted.push_back(StaleCI);
11101
11102 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
11103 "Outlined function must have two or three arguments only");
11104
11105 bool HasShared = OutlinedFn.arg_size() == 3;
11106
11107 OutlinedFn.getArg(0)->setName("global.tid.ptr");
11108 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
11109 if (HasShared)
11110 OutlinedFn.getArg(2)->setName("data");
11111
11112 // Call to the runtime function for teams in the current function.
11113 assert(StaleCI && "Error while outlining - no CallInst user found for the "
11114 "outlined function.");
11115 Builder.SetInsertPoint(StaleCI);
11116 SmallVector<Value *> Args = {
11117 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
11118 if (HasShared)
11119 Args.push_back(StaleCI->getArgOperand(2));
11122 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
11123 Args);
11124
11125 for (Instruction *I : llvm::reverse(ToBeDeleted))
11126 I->eraseFromParent();
11127 };
11128
11129 if (!Config.isTargetDevice())
11130 OI.PostOutlineCB = HostPostOutlineCB;
11131
11132 addOutlineInfo(std::move(OI));
11133
11134 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11135
11136 return Builder.saveIP();
11137}
11138
11141 InsertPointTy OuterAllocaIP,
11142 BodyGenCallbackTy BodyGenCB) {
  // Nothing to do if the location/insertion point is unusable.
11143 if (!updateToLocation(Loc))
11144 return InsertPointTy();
11145
11146 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
11147
  // If we would start emitting directly into the outer alloca block, split
  // off an entry block first so allocas stay separate from region code.
11148 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11149 BasicBlock *BodyBB =
11150 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11151 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11152 }
  // Carve the region into three blocks, falling through in this order:
  //   distribute.alloca -> distribute.body -> distribute.exit
11153 BasicBlock *ExitBB =
11154 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11155 BasicBlock *BodyBB =
11156 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11157 BasicBlock *AllocaBB =
11158 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11159
11160 // Generate the body of distribute clause
11161 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11162 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11163 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11164 return Err;
11165
11166 // When using target we use different runtime functions which require a
11167 // callback.
11168 if (Config.isTargetDevice()) {
  // On the device, only record the region for deferred outlining
  // (addOutlineInfo); no runtime call is emitted here.
11169 OutlineInfo OI;
11170 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
11171 OI.EntryBB = AllocaBB;
11172 OI.ExitBB = ExitBB;
11173
11174 addOutlineInfo(std::move(OI));
11175 }
  // Continue emission after the distribute region.
11176 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11177
11178 return Builder.saveIP();
11179}
11180
11183 std::string VarName) {
11184 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11186 Names.size()),
11187 Names);
11188 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11189 M, MapNamesArrayInit->getType(),
11190 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11191 VarName);
11192 return MapNamesArrayGlobal;
11193}
11194
11195// Create all simple and struct types exposed by the runtime and remember
11196// the llvm::PointerTypes of them for easy access later.
11197void OpenMPIRBuilder::initializeTypes(Module &M) {
11198 LLVMContext &Ctx = M.getContext();
11199 StructType *T;
  // Address spaces used below: DefaultTargetAS for data pointers,
  // ProgramAS for function pointers (taken from the module's data layout).
11200 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
11201 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
  // X-macro pattern: each OMP_*_TYPE macro below defines how one entry of
  // OMPKinds.def initializes the corresponding member (plus its pointer
  // type, where applicable).
11202#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
11203#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11204 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11205 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
11206#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11207 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11208 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
  // Reuse a named struct type if it already exists in this context;
  // otherwise create it from the field list provided by OMPKinds.def.
11209#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11210 T = StructType::getTypeByName(Ctx, StructName); \
11211 if (!T) \
11212 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11213 VarName = T; \
11214 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
  // Expanding the .def header runs all of the initializers defined above.
11215#include "llvm/Frontend/OpenMP/OMPKinds.def"
11216}
11217
11220 SmallVectorImpl<BasicBlock *> &BlockVector) {
11222 BlockSet.insert(EntryBB);
11223 BlockSet.insert(ExitBB);
11224
11225 Worklist.push_back(EntryBB);
11226 while (!Worklist.empty()) {
11227 BasicBlock *BB = Worklist.pop_back_val();
11228 BlockVector.push_back(BB);
11229 for (BasicBlock *SuccBB : successors(BB))
11230 if (BlockSet.insert(SuccBB).second)
11231 Worklist.push_back(SuccBB);
11232 }
11233}
11234
11236 uint64_t Size, int32_t Flags,
11238 StringRef Name) {
11239 if (!Config.isGPU()) {
11242 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11243 return;
11244 }
11245 // TODO: Add support for global variables on the device after declare target
11246 // support.
11247 Function *Fn = dyn_cast<Function>(Addr);
11248 if (!Fn)
11249 return;
11250
11251 // Add a function attribute for the kernel.
11252 Fn->addFnAttr("kernel");
11253 if (T.isAMDGCN())
11254 Fn->addFnAttr("uniform-work-group-size");
11255 Fn->addFnAttr(Attribute::MustProgress);
11256}
11257
11258// We only generate metadata for function that contain target regions.
11261
11262 // If there are no entries, we don't need to do anything.
11263 if (OffloadInfoManager.empty())
11264 return;
11265
11266 LLVMContext &C = M.getContext();
11269 16>
11270 OrderedEntries(OffloadInfoManager.size());
11271
11272 // Auxiliary methods to create metadata values and strings.
11273 auto &&GetMDInt = [this](unsigned V) {
11274 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11275 };
11276
11277 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11278
11279 // Create the offloading info metadata node.
11280 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11281 auto &&TargetRegionMetadataEmitter =
11282 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11283 const TargetRegionEntryInfo &EntryInfo,
11285 // Generate metadata for target regions. Each entry of this metadata
11286 // contains:
11287 // - Entry 0 -> Kind of this type of metadata (0).
11288 // - Entry 1 -> Device ID of the file where the entry was identified.
11289 // - Entry 2 -> File ID of the file where the entry was identified.
11290 // - Entry 3 -> Mangled name of the function where the entry was
11291 // identified.
11292 // - Entry 4 -> Line in the file where the entry was identified.
11293 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11294 // - Entry 6 -> Order the entry was created.
11295 // The first element of the metadata node is the kind.
11296 Metadata *Ops[] = {
11297 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11298 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11299 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11300 GetMDInt(E.getOrder())};
11301
11302 // Save this entry in the right position of the ordered entries array.
11303 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11304
11305 // Add metadata to the named metadata node.
11306 MD->addOperand(MDNode::get(C, Ops));
11307 };
11308
11309 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11310
11311 // Create function that emits metadata for each device global variable entry;
11312 auto &&DeviceGlobalVarMetadataEmitter =
11313 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11314 StringRef MangledName,
11316 // Generate metadata for global variables. Each entry of this metadata
11317 // contains:
11318 // - Entry 0 -> Kind of this type of metadata (1).
11319 // - Entry 1 -> Mangled name of the variable.
11320 // - Entry 2 -> Declare target kind.
11321 // - Entry 3 -> Order the entry was created.
11322 // The first element of the metadata node is the kind.
11323 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11324 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11325
11326 // Save this entry in the right position of the ordered entries array.
11327 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11328 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11329
11330 // Add metadata to the named metadata node.
11331 MD->addOperand(MDNode::get(C, Ops));
11332 };
11333
11334 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11335 DeviceGlobalVarMetadataEmitter);
11336
11337 for (const auto &E : OrderedEntries) {
11338 assert(E.first && "All ordered entries must exist!");
11339 if (const auto *CE =
11341 E.first)) {
11342 if (!CE->getID() || !CE->getAddress()) {
11343 // Do not blame the entry if the parent function is not emitted.
11344 TargetRegionEntryInfo EntryInfo = E.second;
11345 StringRef FnName = EntryInfo.ParentName;
11346 if (!M.getNamedValue(FnName))
11347 continue;
11348 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11349 continue;
11350 }
11351 createOffloadEntry(CE->getID(), CE->getAddress(),
11352 /*Size=*/0, CE->getFlags(),
11354 } else if (const auto *CE = dyn_cast<
11356 E.first)) {
11359 CE->getFlags());
11360 switch (Flags) {
11363 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11364 continue;
11365 if (!CE->getAddress()) {
11366 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11367 continue;
11368 }
11369 // The variable has no definition - no need to add the entry.
11370 if (CE->getVarSize() == 0)
11371 continue;
11372 break;
11374 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11375 (!Config.isTargetDevice() && CE->getAddress())) &&
11376 "Declaret target link address is set.");
11377 if (Config.isTargetDevice())
11378 continue;
11379 if (!CE->getAddress()) {
11381 continue;
11382 }
11383 break;
11386 if (!CE->getAddress()) {
11387 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11388 continue;
11389 }
11390 break;
11391 default:
11392 break;
11393 }
11394
11395 // Hidden or internal symbols on the device are not externally visible.
11396 // We should not attempt to register them by creating an offloading
11397 // entry. Indirect variables are handled separately on the device.
11398 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11399 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11400 (Flags !=
11402 Flags != OffloadEntriesInfoManager::
11403 OMPTargetGlobalVarEntryIndirectVTable))
11404 continue;
11405
11406 // Indirect globals need to use a special name that doesn't match the name
11407 // of the associated host global.
11409 Flags ==
11411 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11412 Flags, CE->getLinkage(), CE->getVarName());
11413 else
11414 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11415 Flags, CE->getLinkage());
11416
11417 } else {
11418 llvm_unreachable("Unsupported entry kind.");
11419 }
11420 }
11421
11422 // Emit requires directive globals to a special entry so the runtime can
11423 // register them when the device image is loaded.
11424 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11425 // entries should be redesigned to better suit this use-case.
11426 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11430 ".requires", /*Size=*/0,
11432 Config.getRequiresFlags());
11433}
11434
11437 unsigned FileID, unsigned Line, unsigned Count) {
  // Compose the entry name as:
  //   <KernelNamePrefix><DeviceID:hex>_<FileID:hex>_<ParentName>_l<Line>
  // with an optional trailing "_<Count>" when several regions share the
  // same source line.
11438 raw_svector_ostream OS(Name);
11439 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11440 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11441 if (Count)
11442 OS << "_" << Count;
11443}
11444
11446 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11447 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11449 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11450 EntryInfo.Line, NewCount);
11451}
11452
11455 vfs::FileSystem &VFS,
11456 StringRef ParentName) {
  // Fallback device id, used when the file's unique ID cannot be obtained
  // from the VFS below.
11457 sys::fs::UniqueID ID(0xdeadf17e, 0);
  // CallBack() supplies a tuple: element 0 is the file name (queried via the
  // VFS), element 1 is forwarded as the final TargetRegionEntryInfo argument
  // (the line number, per that constructor).
11458 auto FileIDInfo = CallBack();
11459 uint64_t FileID = 0;
11460 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11461 ID = Status->getUniqueID();
11462 FileID = Status->getUniqueID().getFile();
11463 } else {
11464 // If the inode ID could not be determined, create a hash value
11465 // from the current file name and use that as an ID.
11466 FileID = hash_value(std::get<0>(FileIDInfo));
11467 }
11468
11469 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11470 std::get<1>(FileIDInfo));
11471}
11472
  // Count the trailing zero bits of the mask being scanned — i.e. the bit
  // position at which the MEMBER_OF field starts within the map-type flags
  // (see the shift in getMemberOfFlag below).
11474 unsigned Offset = 0;
11475 for (uint64_t Remain =
11476 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11478 !(Remain & 1); Remain = Remain >> 1)
11479 Offset++;
11480 return Offset;
11481}
11482
11485 // Rotate by getFlagMemberOffset() bits.
  // The stored value is Position + 1, shifted into the MEMBER_OF bit-field;
  // the +1 presumably reserves an all-zero field to mean "no member-of" —
  // TODO(review): confirm against the flag's consumers.
11486 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11487 << getFlagMemberOffset());
11488}
11489
11492 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11493 // If the entry is PTR_AND_OBJ but has not been marked with the special
11494 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11495 // marked as MEMBER_OF.
11496 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11498 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11501 return;
11502
11503 // Entries with ATTACH are not members-of anything. They are handled
11504 // separately by the runtime after other maps have been handled.
11505 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11507 return;
11508
11509 // Reset the placeholder value to prepare the flag for the assignment of the
11510 // proper MEMBER_OF value.
11511 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11512 Flags |= MemberOfFlag;
11513}
11514
11518 bool IsDeclaration, bool IsExternallyVisible,
11519 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11520 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11521 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11522 std::function<Constant *()> GlobalInitializer,
11523 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11524 // TODO: convert this to utilise the IRBuilder Config rather than
11525 // a passed down argument.
11526 if (OpenMPSIMD)
11527 return nullptr;
11528
11531 CaptureClause ==
11533 Config.hasRequiresUnifiedSharedMemory())) {
11534 SmallString<64> PtrName;
11535 {
11536 raw_svector_ostream OS(PtrName);
11537 OS << MangledName;
11538 if (!IsExternallyVisible)
11539 OS << format("_%x", EntryInfo.FileID);
11540 OS << "_decl_tgt_ref_ptr";
11541 }
11542
11543 Value *Ptr = M.getNamedValue(PtrName);
11544
11545 if (!Ptr) {
11546 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11547 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11548
11549 auto *GV = cast<GlobalVariable>(Ptr);
11550 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11551
11552 if (!Config.isTargetDevice()) {
11553 if (GlobalInitializer)
11554 GV->setInitializer(GlobalInitializer());
11555 else
11556 GV->setInitializer(GlobalValue);
11557 }
11558
11560 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11561 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11562 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11563 }
11564
11565 return cast<Constant>(Ptr);
11566 }
11567
11568 return nullptr;
11569}
11570
11574 bool IsDeclaration, bool IsExternallyVisible,
11575 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11576 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11577 std::vector<Triple> TargetTriple,
11578 std::function<Constant *()> GlobalInitializer,
11579 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11580 Constant *Addr) {
11582 (TargetTriple.empty() && !Config.isTargetDevice()))
11583 return;
11584
11586 StringRef VarName;
11587 int64_t VarSize;
11589
11591 CaptureClause ==
11593 !Config.hasRequiresUnifiedSharedMemory()) {
11595 VarName = MangledName;
11596 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11597
11598 if (!IsDeclaration)
11599 VarSize = divideCeil(
11600 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11601 else
11602 VarSize = 0;
11603 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11604
11605 // This is a workaround carried over from Clang which prevents undesired
11606 // optimisation of internal variables.
11607 if (Config.isTargetDevice() &&
11608 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11609 // Do not create a "ref-variable" if the original is not also available
11610 // on the host.
11611 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11612 return;
11613
11614 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11615
11616 if (!M.getNamedValue(RefName)) {
11617 Constant *AddrRef =
11618 getOrCreateInternalVariable(Addr->getType(), RefName);
11619 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11620 GvAddrRef->setConstant(true);
11621 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11622 GvAddrRef->setInitializer(Addr);
11623 GeneratedRefs.push_back(GvAddrRef);
11624 }
11625 }
11626 } else {
11629 else
11631
11632 if (Config.isTargetDevice()) {
11633 VarName = (Addr) ? Addr->getName() : "";
11634 Addr = nullptr;
11635 } else {
11637 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11638 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11639 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11640 VarName = (Addr) ? Addr->getName() : "";
11641 }
11642 VarSize = M.getDataLayout().getPointerSize();
11644 }
11645
11646 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11647 Flags, Linkage);
11648}
11649
11650/// Loads all the offload entries information from the host IR
11651/// metadata.
11653 // If we are in target mode, load the metadata from the host IR. This code has
11654 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11655
11656 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11657 if (!MD)
11658 return;
11659
11660 for (MDNode *MN : MD->operands()) {
11661 auto &&GetMDInt = [MN](unsigned Idx) {
11662 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11663 return cast<ConstantInt>(V->getValue())->getZExtValue();
11664 };
11665
11666 auto &&GetMDString = [MN](unsigned Idx) {
11667 auto *V = cast<MDString>(MN->getOperand(Idx));
11668 return V->getString();
11669 };
11670
11671 switch (GetMDInt(0)) {
11672 default:
11673 llvm_unreachable("Unexpected metadata!");
11674 break;
11675 case OffloadEntriesInfoManager::OffloadEntryInfo::
11676 OffloadingEntryInfoTargetRegion: {
11677 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11678 /*DeviceID=*/GetMDInt(1),
11679 /*FileID=*/GetMDInt(2),
11680 /*Line=*/GetMDInt(4),
11681 /*Count=*/GetMDInt(5));
11682 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11683 /*Order=*/GetMDInt(6));
11684 break;
11685 }
11686 case OffloadEntriesInfoManager::OffloadEntryInfo::
11687 OffloadingEntryInfoDeviceGlobalVar:
11688 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11689 /*MangledName=*/GetMDString(1),
11691 /*Flags=*/GetMDInt(2)),
11692 /*Order=*/GetMDInt(3));
11693 break;
11694 }
11695 }
11696}
11697
11699 StringRef HostFilePath) {
11700 if (HostFilePath.empty())
11701 return;
11702
11703 auto Buf = VFS.getBufferForFile(HostFilePath);
11704 if (std::error_code Err = Buf.getError()) {
11705 report_fatal_error(("error opening host file from host file path inside of "
11706 "OpenMPIRBuilder: " +
11707 Err.message())
11708 .c_str());
11709 }
11710
11711 LLVMContext Ctx;
11713 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11714 if (std::error_code Err = M.getError()) {
11716 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11717 .c_str());
11718 }
11719
11720 loadOffloadInfoMetadata(*M.get());
11721}
11722
// NOTE(review): the signature lines (old 11723-11724) and the error-return
// lines (11762, 11765 -- presumably `return createStringError(...)`) were
// dropped by the doxygen extraction. From the body: builds a canonical loop
// around createLoopSkeleton at the current insert point, runs a BodyGen
// callback with the body insert point and induction variable, and returns the
// continuation insert point (or an Error) -- confirm exact signature upstream.
11725 llvm::StringRef Name) {
11726 Builder.restoreIP(Loc.IP);
11727
11728 BasicBlock *CurBB = Builder.GetInsertBlock();
11729 assert(CurBB &&
11730 "expected a valid insertion block for creating an iterator loop");
11731 Function *F = CurBB->getParent();
11732
// If inserting at the block's end but a terminator exists, split before the
// terminator instead so it stays with the continuation block.
11733 InsertPointTy SplitIP = Builder.saveIP();
11734 if (SplitIP.getPoint() == CurBB->end())
11735 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
11736 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
11737
11738 BasicBlock *ContBB =
11739 splitBB(SplitIP, /*CreateBranch=*/false,
11740 Builder.getCurrentDebugLocation(), "omp.it.cont");
11741
11742 CanonicalLoopInfo *CLI =
11743 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
11744 /*PreInsertBefore=*/ContBB,
11745 /*PostInsertBefore=*/ContBB, Name);
11746
11747 // Enter loop from original block.
11748 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
11749
11750 // Remove the unconditional branch inserted by createLoopSkeleton in the body
11751 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
11752 T->eraseFromParent();
11753
// Let the caller populate the loop body; propagate its failure.
11754 InsertPointTy BodyIP = CLI->getBodyIP();
11755 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
11756 return Err;
11757
11758 // Body must either fallthrough to the latch or branch directly to it.
11759 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
11760 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
11761 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
11763 "iterator bodygen must terminate the canonical body with an "
11764 "unconditional branch to the loop latch",
11766 }
11767 } else {
11768 // Ensure we end the loop body by jumping to the latch.
11769 Builder.SetInsertPoint(CLI->getBody());
11770 Builder.CreateBr(CLI->getLatch());
11771 }
11772
11773 // Link After -> ContBB
11774 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
11775 if (!CLI->getAfter()->hasTerminator())
11776 Builder.CreateBr(ContBB);
11777
// Resume emission in the continuation block after the loop.
11778 return InsertPointTy{ContBB, ContBB->begin()};
11779}
11780
11781 /// Mangle the parameter part of the vector function name according to
11782 /// their OpenMP classification. The mangling function is defined in
11783 /// section 4.5 of the AAVFABI(2021Q1).
// NOTE(review): the `case <ParamKind>:` labels (old lines 11790/11793/11796/
// 11799/11802/11805) and the enum references in the else-if chain (11812-11818)
// were hyperlinks and got dropped by the extraction. The emitted letters follow
// the AAVFABI encoding (l/R/U/L = linear variants, u = uniform, v = vector) --
// confirm the exact enumerator names against upstream OMPIRBuilder.cpp.
11784 static std::string mangleVectorParameters(
11786 SmallString<256> Buffer;
11787 llvm::raw_svector_ostream Out(Buffer);
11788 for (const auto &ParamAttr : ParamAttrs) {
11789 switch (ParamAttr.Kind) {
11791 Out << 'l';
11792 break;
11794 Out << 'R';
11795 break;
11797 Out << 'U';
11798 break;
11800 Out << 'L';
11801 break;
11803 Out << 'u';
11804 break;
11806 Out << 'v';
11807 break;
11808 }
// Variable strides are mangled as "s<argpos>"; constant linear steps print
// their value (negated with an 'n' prefix), omitting the default step of 1.
11809 if (ParamAttr.HasVarStride)
11810 Out << "s" << ParamAttr.StrideOrArg;
11811 else if (ParamAttr.Kind ==
11813 ParamAttr.Kind ==
11815 ParamAttr.Kind ==
11817 ParamAttr.Kind ==
11819 // Don't print the step value if it is not present or if it is
11820 // equal to 1.
11821 if (ParamAttr.StrideOrArg < 0)
11822 Out << 'n' << -ParamAttr.StrideOrArg;
11823 else if (ParamAttr.StrideOrArg != 1)
11824 Out << ParamAttr.StrideOrArg;
11825 }
11826
// A non-zero alignment is appended as "a<align>".
11827 if (!!ParamAttr.Alignment)
11828 Out << 'a' << ParamAttr.Alignment;
11829 }
11830
11831 return std::string(Out.str());
11832}
11833
// NOTE(review): the extraction dropped this function's name line (old 11834 --
// presumably the x86 declare-simd attribute emitter), a parameter line (11836),
// the `SmallVector` declaration of `Masked` (11847), the `case` labels of the
// Branch switch (11849/11853/11856), and the `SmallString` declaration of
// `Buffer` (11862). Confirm against upstream OMPIRBuilder.cpp.
11835 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
// Table of x86 vector ISA letters and their register widths in bits, used to
// derive a default VLEN when none was specified.
11837 struct ISADataTy {
11838 char ISA;
11839 unsigned VecRegSize;
11840 };
11841 ISADataTy ISAData[] = {
11842 {'b', 128}, // SSE
11843 {'c', 256}, // AVX
11844 {'d', 256}, // AVX2
11845 {'e', 512}, // AVX512
11846 };
// Decide which mask variants ('N' unmasked, 'M' masked) to emit based on the
// branch/inbranch clause.
11848 switch (Branch) {
11850 Masked.push_back('N');
11851 Masked.push_back('M');
11852 break;
11854 Masked.push_back('N');
11855 break;
11857 Masked.push_back('M');
11858 break;
11859 }
// Emit one "_ZGV..." vector-variant attribute per (mask, ISA) combination.
11860 for (char Mask : Masked) {
11861 for (const ISADataTy &Data : ISAData) {
11863 llvm::raw_svector_ostream Out(Buffer);
11864 Out << "_ZGV" << Data.ISA << Mask;
11865 if (!VLENVal) {
11866 assert(NumElts && "Non-zero simdlen/cdtsize expected");
11867 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
11868 } else {
11869 Out << VLENVal;
11870 }
11871 Out << mangleVectorParameters(ParamAttrs);
11872 Out << '_' << Fn->getName();
11873 Fn->addFnAttr(Out.str());
11874 }
11875 }
11876}
11877
11878// Function used to add the attribute. The parameter `VLEN` is templated to
11879// allow the use of `x` when targeting scalable functions for SVE.
11880template <typename T>
11881static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
11882 char ISA, StringRef ParSeq,
11883 StringRef MangledName, bool OutputBecomesInput,
11884 llvm::Function *Fn) {
11885 SmallString<256> Buffer;
11886 llvm::raw_svector_ostream Out(Buffer);
11887 Out << Prefix << ISA << LMask << VLEN;
11888 if (OutputBecomesInput)
11889 Out << 'v';
11890 Out << ParSeq << '_' << MangledName;
11891 Fn->addFnAttr(Out.str());
11892}
11893
11894// Helper function to generate the Advanced SIMD names depending on the value
11895// of the NDS when simdlen is not present.
11896static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
11897 StringRef Prefix, char ISA,
11898 StringRef ParSeq, StringRef MangledName,
11899 bool OutputBecomesInput,
11900 llvm::Function *Fn) {
11901 switch (NDS) {
11902 case 8:
11903 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11904 OutputBecomesInput, Fn);
11905 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
11906 OutputBecomesInput, Fn);
11907 break;
11908 case 16:
11909 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11910 OutputBecomesInput, Fn);
11911 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11912 OutputBecomesInput, Fn);
11913 break;
11914 case 32:
11915 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11916 OutputBecomesInput, Fn);
11917 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11918 OutputBecomesInput, Fn);
11919 break;
11920 case 64:
11921 case 128:
11922 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11923 OutputBecomesInput, Fn);
11924 break;
11925 default:
11926 llvm_unreachable("Scalar type is too wide.");
11927 }
11928}
11929
11930 /// Emit vector function attributes for AArch64, as defined in the AAVFABI.
// NOTE(review): the extraction dropped the function name line (old 11931), one
// parameter line (11933, presumably the ParamAttrs array used below), and the
// `case` labels of both Branch switches (11952/11958/11962 and
// 11978/11984/11988). Confirm against upstream OMPIRBuilder.cpp.
11932 llvm::Function *Fn, unsigned UserVLEN,
11934 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
// 'n' = Advanced SIMD (NEON), 's' = SVE.
11935 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
11936
11937 // Sort out parameter sequence.
11938 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
11939 StringRef Prefix = "_ZGV";
11940 StringRef MangledName = Fn->getName();
11941
11942 // Generate simdlen from user input (if any).
11943 if (UserVLEN) {
11944 if (ISA == 's') {
11945 // SVE generates only a masked function.
11946 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11947 OutputBecomesInput, Fn);
11948 return;
11949 }
11950
// Advanced SIMD with an explicit simdlen: emit masked/unmasked variants
// according to the branch clause.
11951 switch (Branch) {
11953 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11954 OutputBecomesInput, Fn);
11955 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11956 OutputBecomesInput, Fn);
11957 break;
11959 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11960 OutputBecomesInput, Fn);
11961 break;
11963 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11964 OutputBecomesInput, Fn);
11965 break;
11966 }
11967 return;
11968 }
11969
11970 if (ISA == 's') {
11971 // SVE, section 3.4.1, item 1.
11972 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
11973 OutputBecomesInput, Fn);
11974 return;
11975 }
11976
// No user simdlen: derive vector lengths from the narrowest data size.
11977 switch (Branch) {
11979 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11980 MangledName, OutputBecomesInput, Fn);
11981 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11982 MangledName, OutputBecomesInput, Fn);
11983 break;
11985 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11986 MangledName, OutputBecomesInput, Fn);
11987 break;
11989 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11990 MangledName, OutputBecomesInput, Fn);
11991 break;
11992 }
11993}
11994
11995//===----------------------------------------------------------------------===//
11996// OffloadEntriesInfoManager
11997//===----------------------------------------------------------------------===//
11998
12000 return OffloadEntriesTargetRegion.empty() &&
12001 OffloadEntriesDeviceGlobalVar.empty();
12002}
12003
12004unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
12005 const TargetRegionEntryInfo &EntryInfo) const {
12006 auto It = OffloadEntriesTargetRegionCount.find(
12007 getTargetRegionEntryCountKey(EntryInfo));
12008 if (It == OffloadEntriesTargetRegionCount.end())
12009 return 0;
12010 return It->second;
12011}
12012
12013void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
12014 const TargetRegionEntryInfo &EntryInfo) {
12015 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
12016 EntryInfo.Count + 1;
12017}
12018
12019/// Initialize target region entry.
12021 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
12022 OffloadEntriesTargetRegion[EntryInfo] =
12023 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
12025 ++OffloadingEntriesNum;
12026}
12027
// NOTE(review): the extraction dropped this member's name line (old 12028),
// the final parameter line (12030, presumably the entry-kind Flags), and the
// first line of the `if` condition at old 12048 in the else branch. Confirm
// against upstream OMPIRBuilder.cpp.
12029 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
// Callers pass a default-counted EntryInfo; the count is assigned here.
12031 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
12032
12033 // Update the EntryInfo with the next available count for this location.
12034 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12035
12036 // If we are emitting code for a target, the entry is already initialized,
12037 // only has to be registered.
12038 if (OMPBuilder->Config.isTargetDevice()) {
12039 // This could happen if the device compilation is invoked standalone.
12040 if (!hasTargetRegionEntryInfo(EntryInfo)) {
12041 return;
12042 }
// Fill in the previously initialized placeholder entry.
12043 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
12044 Entry.setAddress(Addr);
12045 Entry.setID(ID);
12046 Entry.setFlags(Flags);
12047 } else {
12049 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
12050 return;
12051 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
12052 "Target region entry already registered!");
// Host side: create a brand-new entry at the next emission order.
12053 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
12054 OffloadEntriesTargetRegion[EntryInfo] = Entry;
12055 ++OffloadingEntriesNum;
12056 }
// Reserve the next count for further regions at the same source location.
12057 incrementTargetRegionEntryInfoCount(EntryInfo);
12058}
12059
12061 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
12062
12063 // Update the EntryInfo with the next available count for this location.
12064 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
12065
12066 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
12067 if (It == OffloadEntriesTargetRegion.end()) {
12068 return false;
12069 }
12070 // Fail if this entry is already registered.
12071 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
12072 return false;
12073 return true;
12074}
12075
12077 const OffloadTargetRegionEntryInfoActTy &Action) {
12078 // Scan all target region entries and perform the provided action.
12079 for (const auto &It : OffloadEntriesTargetRegion) {
12080 Action(It.first, It.second);
12081 }
12082}
12083
12085 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
12086 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
12087 ++OffloadingEntriesNum;
12088}
12089
// NOTE(review): the extraction dropped this member's name line (old 12090),
// the final parameter line (12092, presumably the entry-kind Flags and
// GlobalValue linkage), and the flag-comparison lines of the `if` at old
// 12119/12121 deciding whether the var name is recorded with the entry.
// Confirm against upstream OMPIRBuilder.cpp.
12091 StringRef VarName, Constant *Addr, int64_t VarSize,
12093 if (OMPBuilder->Config.isTargetDevice()) {
12094 // This could happen if the device compilation is invoked standalone.
12095 if (!hasDeviceGlobalVarEntryInfo(VarName))
12096 return;
12097 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
// Entry already has an address: only back-fill a missing size/linkage.
12098 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
12099 if (Entry.getVarSize() == 0) {
12100 Entry.setVarSize(VarSize);
12101 Entry.setLinkage(Linkage);
12102 }
12103 return;
12104 }
12105 Entry.setVarSize(VarSize);
12106 Entry.setLinkage(Linkage);
12107 Entry.setAddress(Addr);
12108 } else {
12109 if (hasDeviceGlobalVarEntryInfo(VarName)) {
12110 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
12111 assert(Entry.isValid() && Entry.getFlags() == Flags &&
12112 "Entry not initialized!");
12113 if (Entry.getVarSize() == 0) {
12114 Entry.setVarSize(VarSize);
12115 Entry.setLinkage(Linkage);
12116 }
12117 return;
12118 }
12120 Flags ==
12122 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
12123 Addr, VarSize, Flags, Linkage,
12124 VarName.str());
12125 else
12126 OffloadEntriesDeviceGlobalVar.try_emplace(
12127 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12128 ++OffloadingEntriesNum;
12129 }
12130}
12131
12134 // Scan all target region entries and perform the provided action.
12135 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12136 Action(E.getKey(), E.getValue());
12137}
12138
12139//===----------------------------------------------------------------------===//
12140// CanonicalLoopInfo
12141//===----------------------------------------------------------------------===//
12142
12143void CanonicalLoopInfo::collectControlBlocks(
12145 // We only count those BBs as control block for which we do not need to
12146 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12147 // flow. For consistency, this also means we do not add the Body block, which
12148 // is just the entry to the body code.
12149 BBs.reserve(BBs.size() + 6);
12150 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12151}
12152
12154 assert(isValid() && "Requires a valid canonical loop");
12155 for (BasicBlock *Pred : predecessors(Header)) {
12156 if (Pred != Latch)
12157 return Pred;
12158 }
12159 llvm_unreachable("Missing preheader");
12160}
12161
12162void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12163 assert(isValid() && "Requires a valid canonical loop");
12164
12165 Instruction *CmpI = &getCond()->front();
12166 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12167 CmpI->setOperand(1, TripCount);
12168
12169#ifndef NDEBUG
12170 assertOK();
12171#endif
12172}
12173
12174void CanonicalLoopInfo::mapIndVar(
12175 llvm::function_ref<Value *(Instruction *)> Updater) {
12176 assert(isValid() && "Requires a valid canonical loop");
12177
12178 Instruction *OldIV = getIndVar();
12179
12180 // Record all uses excluding those introduced by the updater. Uses by the
12181 // CanonicalLoopInfo itself to keep track of the number of iterations are
12182 // excluded.
12183 SmallVector<Use *> ReplacableUses;
12184 for (Use &U : OldIV->uses()) {
12185 auto *User = dyn_cast<Instruction>(U.getUser());
12186 if (!User)
12187 continue;
12188 if (User->getParent() == getCond())
12189 continue;
12190 if (User->getParent() == getLatch())
12191 continue;
12192 ReplacableUses.push_back(&U);
12193 }
12194
12195 // Run the updater that may introduce new uses
12196 Value *NewIV = Updater(OldIV);
12197
12198 // Replace the old uses with the value returned by the updater.
12199 for (Use *U : ReplacableUses)
12200 U->set(NewIV);
12201
12202#ifndef NDEBUG
12203 assertOK();
12204#endif
12205}
12206
12208#ifndef NDEBUG
12209 // No constraints if this object currently does not describe a loop.
12210 if (!isValid())
12211 return;
12212
12213 BasicBlock *Preheader = getPreheader();
12214 BasicBlock *Body = getBody();
12215 BasicBlock *After = getAfter();
12216
12217 // Verify standard control-flow we use for OpenMP loops.
12218 assert(Preheader);
12219 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12220 "Preheader must terminate with unconditional branch");
12221 assert(Preheader->getSingleSuccessor() == Header &&
12222 "Preheader must jump to header");
12223
12224 assert(Header);
12225 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12226 "Header must terminate with unconditional branch");
12227 assert(Header->getSingleSuccessor() == Cond &&
12228 "Header must jump to exiting block");
12229
12230 assert(Cond);
12231 assert(Cond->getSinglePredecessor() == Header &&
12232 "Exiting block only reachable from header");
12233
12234 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12235 "Exiting block must terminate with conditional branch");
12236 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12237 "Exiting block's first successor jump to the body");
12238 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12239 "Exiting block's second successor must exit the loop");
12240
12241 assert(Body);
12242 assert(Body->getSinglePredecessor() == Cond &&
12243 "Body only reachable from exiting block");
12244 assert(!isa<PHINode>(Body->front()));
12245
12246 assert(Latch);
12247 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12248 "Latch must terminate with unconditional branch");
12249 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12250 // TODO: To support simple redirecting of the end of the body code that has
12251 // multiple; introduce another auxiliary basic block like preheader and after.
12252 assert(Latch->getSinglePredecessor() != nullptr);
12253 assert(!isa<PHINode>(Latch->front()));
12254
12255 assert(Exit);
12256 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12257 "Exit block must terminate with unconditional branch");
12258 assert(Exit->getSingleSuccessor() == After &&
12259 "Exit block must jump to after block");
12260
12261 assert(After);
12262 assert(After->getSinglePredecessor() == Exit &&
12263 "After block only reachable from exit block");
12264 assert(After->empty() || !isa<PHINode>(After->front()));
12265
12266 Instruction *IndVar = getIndVar();
12267 assert(IndVar && "Canonical induction variable not found?");
12268 assert(isa<IntegerType>(IndVar->getType()) &&
12269 "Induction variable must be an integer");
12270 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12271 "Induction variable must be a PHI in the loop header");
12272 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12273 assert(
12274 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12275 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12276
12277 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12278 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12279 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12280 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12281 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12282 ->isOne());
12283
12284 Value *TripCount = getTripCount();
12285 assert(TripCount && "Loop trip count not found?");
12286 assert(IndVar->getType() == TripCount->getType() &&
12287 "Trip count and induction variable must have the same type");
12288
12289 auto *CmpI = cast<CmpInst>(&Cond->front());
12290 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12291 "Exit condition must be a signed less-than comparison");
12292 assert(CmpI->getOperand(0) == IndVar &&
12293 "Exit condition must compare the induction variable");
12294 assert(CmpI->getOperand(1) == TripCount &&
12295 "Exit condition must compare with the trip count");
12296#endif
12297}
12298
12300 Header = nullptr;
12301 Cond = nullptr;
12302 Latch = nullptr;
12303 Exit = nullptr;
12304}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:851
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const OpenMPIRBuilder::DependenciesInfo &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:109
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:150
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
LLVM_ABI void insertDbgRecordBefore(DbgRecord *DR, InstListType::iterator Here)
Insert a DbgRecord into a block at the position given by Here.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this GlobalObject has any metadata attached to it.
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:298
BasicBlock * getBlock() const
Definition IRBuilder.h:313
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:311
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:314
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:318
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:330
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2847
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:990
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
brief Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const DependenciesInfo &Dependencies={}, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitTaskDependency(IRBuilderBase &Builder, Value *Entry, const DependData &Dep)
Store one kmp_depend_info entry at the given Entry pointer.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI CallInst * createOMPAlignedAlloc(const LocationDescription &Loc, Value *Align, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_align_alloc.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, const DependenciesInfo &Dependencies={}, const AffinityData &Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp taskloop
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const DependenciesInfo &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1047
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1107
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1121
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:151
LLVM_ABI bool canUnroll(OptimizationRemarkEmitter *ORE=nullptr, const Loop *L=nullptr) const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:169
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:393
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
User * user_back()
Definition Value.h:412
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:964
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:146
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:184
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:709
bool use_empty() const
Definition Value.h:346
user_iterator user_end()
Definition Value.h:410
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:557
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:318
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
Flag
These should be considered private to the implementation of the MCInstrDesc class.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:87
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their definition in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:378
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
omp::RTLDependenceKindTy DepKind
A struct to pack static and dynamic dependency information for a task.
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...