LLVM 23.0.0git
OMPIRBuilder.cpp
Go to the documentation of this file.
1//===- OpenMPIRBuilder.cpp - Builder for LLVM-IR for OpenMP directives ----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9///
10/// This file implements the OpenMPIRBuilder class, which is used as a
11/// convenient way to create LLVM instructions for OpenMP directives.
12///
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/SmallSet.h"
19#include "llvm/ADT/StringRef.h"
30#include "llvm/IR/Attributes.h"
31#include "llvm/IR/BasicBlock.h"
32#include "llvm/IR/CFG.h"
33#include "llvm/IR/CallingConv.h"
34#include "llvm/IR/Constant.h"
35#include "llvm/IR/Constants.h"
36#include "llvm/IR/DIBuilder.h"
39#include "llvm/IR/Function.h"
41#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/LLVMContext.h"
45#include "llvm/IR/MDBuilder.h"
46#include "llvm/IR/Metadata.h"
48#include "llvm/IR/PassManager.h"
50#include "llvm/IR/Value.h"
53#include "llvm/Support/Error.h"
65
66#include <cstdint>
67#include <optional>
68
69#define DEBUG_TYPE "openmp-ir-builder"
70
71using namespace llvm;
72using namespace omp;
73
// Command-line switch: when enabled, runtime calls are decorated with
// optimistic 'as-if' attributes. Off by default (cl::init(false)).
static cl::opt<bool>
    OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
                         cl::desc("Use optimistic attributes describing "
                                  "'as-if' properties of runtime calls."),
                         cl::init(false));

// NOTE(review): the declarator that owns the arguments below (presumably a
// cl::opt<double> for the unroll threshold factor, original line 80) is
// missing from this listing — extraction artifact.
    "openmp-ir-builder-unroll-threshold-factor", cl::Hidden,
    cl::desc("Factor for the unroll threshold to account for code "
             "simplifications still taking place"),
    cl::init(1.5));
85
#ifndef NDEBUG
/// Return whether IP1 and IP2 are ambiguous, i.e. that inserting instructions
/// at position IP1 may change the meaning of IP2 or vice-versa. This is because
/// an InsertPoint stores the instruction before something is inserted. For
/// instance, if both point to the same instruction, two IRBuilders alternating
/// creating instruction will cause the instructions to be interleaved.
// NOTE(review): the function signature (presumably taking two insert points
// IP1 and IP2, original lines 92-93) is missing from this listing —
// extraction artifact.
  // An unset insert point cannot conflict with anything.
  if (!IP1.isSet() || !IP2.isSet())
    return false;
  // Conflict iff both name exactly the same position in the same block.
  return IP1.getBlock() == IP2.getBlock() && IP1.getPoint() == IP2.getPoint();
}

// NOTE(review): the signature of the following debug-only validity check
// (presumably taking an OMPScheduleType SchedType, original line 99) is
// missing from this listing — extraction artifact.
  // Valid ordered/unordered and base algorithm combinations.
  switch (SchedType & ~OMPScheduleType::MonotonicityMask) {
  case OMPScheduleType::UnorderedStaticChunked:
  case OMPScheduleType::UnorderedStatic:
  case OMPScheduleType::UnorderedDynamicChunked:
  case OMPScheduleType::UnorderedGuidedChunked:
  case OMPScheduleType::UnorderedRuntime:
  case OMPScheduleType::UnorderedAuto:
  case OMPScheduleType::UnorderedTrapezoidal:
  case OMPScheduleType::UnorderedGreedy:
  case OMPScheduleType::UnorderedBalanced:
  case OMPScheduleType::UnorderedGuidedIterativeChunked:
  case OMPScheduleType::UnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::UnorderedSteal:
  case OMPScheduleType::UnorderedStaticBalancedChunked:
  case OMPScheduleType::UnorderedGuidedSimd:
  case OMPScheduleType::UnorderedRuntimeSimd:
  case OMPScheduleType::OrderedStaticChunked:
  case OMPScheduleType::OrderedStatic:
  case OMPScheduleType::OrderedDynamicChunked:
  case OMPScheduleType::OrderedGuidedChunked:
  case OMPScheduleType::OrderedRuntime:
  case OMPScheduleType::OrderedAuto:
  // NOTE(review): 'OrderdTrapezoidal' (sic) matches the enumerator spelling
  // in the OpenMP constants header; do not "fix" it here.
  case OMPScheduleType::OrderdTrapezoidal:
  case OMPScheduleType::NomergeUnorderedStaticChunked:
  case OMPScheduleType::NomergeUnorderedStatic:
  case OMPScheduleType::NomergeUnorderedDynamicChunked:
  case OMPScheduleType::NomergeUnorderedGuidedChunked:
  case OMPScheduleType::NomergeUnorderedRuntime:
  case OMPScheduleType::NomergeUnorderedAuto:
  case OMPScheduleType::NomergeUnorderedTrapezoidal:
  case OMPScheduleType::NomergeUnorderedGreedy:
  case OMPScheduleType::NomergeUnorderedBalanced:
  case OMPScheduleType::NomergeUnorderedGuidedIterativeChunked:
  case OMPScheduleType::NomergeUnorderedGuidedAnalyticalChunked:
  case OMPScheduleType::NomergeUnorderedSteal:
  case OMPScheduleType::NomergeOrderedStaticChunked:
  case OMPScheduleType::NomergeOrderedStatic:
  case OMPScheduleType::NomergeOrderedDynamicChunked:
  case OMPScheduleType::NomergeOrderedGuidedChunked:
  case OMPScheduleType::NomergeOrderedRuntime:
  case OMPScheduleType::NomergeOrderedAuto:
  case OMPScheduleType::NomergeOrderedTrapezoidal:
  case OMPScheduleType::OrderedDistributeChunked:
  case OMPScheduleType::OrderedDistribute:
    break;
  default:
    return false;
  }

  // Must not set both monotonicity modifiers at the same time.
  OMPScheduleType MonotonicityFlags =
      SchedType & OMPScheduleType::MonotonicityMask;
  if (MonotonicityFlags == OMPScheduleType::MonotonicityMask)
    return false;

  return true;
}
#endif
159
/// This is wrapper over IRBuilderBase::restoreIP that also restores the current
/// debug location to the last instruction in the specified basic block if the
/// insert point points to the end of the block.
// NOTE(review): the signature (presumably taking a Builder and an insert
// point IP, original lines 163-164) is missing from this listing —
// extraction artifact.
  Builder.restoreIP(IP);
  llvm::BasicBlock *BB = Builder.GetInsertBlock();
  llvm::BasicBlock::iterator I = Builder.GetInsertPoint();
  // When inserting at the end of a non-empty block, inherit the debug
  // location of the block's final instruction.
  if (!BB->empty() && I == BB->end())
    Builder.SetCurrentDebugLocation(BB->back().getStableDebugLoc());
}
171
172static bool hasGridValue(const Triple &T) {
173 return T.isAMDGPU() || T.isNVPTX() || T.isSPIRV();
174}
175
static const omp::GV &getGridValue(const Triple &T, Function *Kernel) {
  if (T.isAMDGPU()) {
    // On AMDGPU the wavefront size is a per-kernel target feature, so the
    // kernel's attribute string is consulted.
    StringRef Features =
        Kernel->getFnAttribute("target-features").getValueAsString();
    if (Features.count("+wavefrontsize64"))
    // NOTE(review): the return statements for the AMDGPU cases (original
    // lines 181-182) are missing from this listing — extraction artifact.
  }
  if (T.isNVPTX())
  // NOTE(review): the NVPTX return (original line 185) is missing from this
  // listing — extraction artifact.
  if (T.isSPIRV())
  // NOTE(review): the SPIR-V return (original line 187) is missing from this
  // listing — extraction artifact.
  llvm_unreachable("No grid value available for this architecture!");
}
190
/// Determine which scheduling algorithm to use, determined from schedule clause
/// arguments.
static OMPScheduleType
getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasDistScheduleChunks) {
  // Currently, the default schedule is static.
  switch (ClauseKind) {
  case OMP_SCHEDULE_Default:
  case OMP_SCHEDULE_Static:
    return HasChunks ? OMPScheduleType::BaseStaticChunked
                     : OMPScheduleType::BaseStatic;
  case OMP_SCHEDULE_Dynamic:
    return OMPScheduleType::BaseDynamicChunked;
  case OMP_SCHEDULE_Guided:
    return HasSimdModifier ? OMPScheduleType::BaseGuidedSimd
                           : OMPScheduleType::BaseGuidedChunked;
  case OMP_SCHEDULE_Auto:
  // NOTE(review): the return for the Auto case (original line 208) is
  // missing from this listing — extraction artifact.
  case OMP_SCHEDULE_Runtime:
    return HasSimdModifier ? OMPScheduleType::BaseRuntimeSimd
                           : OMPScheduleType::BaseRuntime;
  case OMP_SCHEDULE_Distribute:
    return HasDistScheduleChunks ? OMPScheduleType::BaseDistributeChunked
                                 : OMPScheduleType::BaseDistribute;
  }
  llvm_unreachable("unhandled schedule clause argument");
}
218
/// Adds ordering modifier flags to schedule type.
static OMPScheduleType
// NOTE(review): the declarator line (presumably
// getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType,) —
// original line 221 — is missing from this listing.
                              bool HasOrderedClause) {
  assert((BaseScheduleType & OMPScheduleType::ModifierMask) ==
             OMPScheduleType::None &&
         "Must not have ordering nor monotonicity flags already set");

  // Pick the ordering bit from the presence of an 'ordered' clause.
  OMPScheduleType OrderingModifier = HasOrderedClause
                                         ? OMPScheduleType::ModifierOrdered
                                         : OMPScheduleType::ModifierUnordered;
  OMPScheduleType OrderingScheduleType = BaseScheduleType | OrderingModifier;

  // Unsupported combinations
  if (OrderingScheduleType ==
      (OMPScheduleType::BaseGuidedSimd | OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedGuidedChunked;
  else if (OrderingScheduleType == (OMPScheduleType::BaseRuntimeSimd |
                                    OMPScheduleType::ModifierOrdered))
    return OMPScheduleType::OrderedRuntime;

  return OrderingScheduleType;
}
242
/// Adds monotonicity modifier flags to schedule type.
static OMPScheduleType
// NOTE(review): the declarator line (presumably
// getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType,) —
// original line 245 — is missing from this listing.
                                  bool HasSimdModifier, bool HasMonotonic,
                                  bool HasNonmonotonic, bool HasOrderedClause) {
  assert((ScheduleType & OMPScheduleType::MonotonicityMask) ==
             OMPScheduleType::None &&
         "Must not have monotonicity flags already set");
  assert((!HasMonotonic || !HasNonmonotonic) &&
         "Monotonic and Nonmonotonic are contradicting each other");

  if (HasMonotonic) {
    return ScheduleType | OMPScheduleType::ModifierMonotonic;
  } else if (HasNonmonotonic) {
    return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
  } else {
    // OpenMP 5.1, 2.11.4 Worksharing-Loop Construct, Description.
    // If the static schedule kind is specified or if the ordered clause is
    // specified, and if the nonmonotonic modifier is not specified, the
    // effect is as if the monotonic modifier is specified. Otherwise, unless
    // the monotonic modifier is specified, the effect is as if the
    // nonmonotonic modifier is specified.
    OMPScheduleType BaseScheduleType =
        ScheduleType & ~OMPScheduleType::ModifierMask;
    if ((BaseScheduleType == OMPScheduleType::BaseStatic) ||
        (BaseScheduleType == OMPScheduleType::BaseStaticChunked) ||
        HasOrderedClause) {
      // The monotonic is used by default in openmp runtime library, so no need
      // to set it.
      return ScheduleType;
    } else {
      return ScheduleType | OMPScheduleType::ModifierNonmonotonic;
    }
  }
}
278
/// Determine the schedule type using schedule and ordering clause arguments.
static OMPScheduleType
computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks,
                          bool HasSimdModifier, bool HasMonotonicModifier,
                          bool HasNonmonotonicModifier, bool HasOrderedClause,
                          bool HasDistScheduleChunks) {
  // Pipeline: base algorithm -> ordering bits -> monotonicity bits.
  // NOTE(review): the declarations that begin the first and third statements
  // (original lines 285 and 289) and a trailing line (293, presumably a
  // validity assertion) are missing from this listing — extraction artifact.
      ClauseKind, HasChunks, HasSimdModifier, HasDistScheduleChunks);
  OMPScheduleType OrderedSchedule =
      getOpenMPOrderingScheduleType(BaseSchedule, HasOrderedClause);
      OrderedSchedule, HasSimdModifier, HasMonotonicModifier,
      HasNonmonotonicModifier, HasOrderedClause);

  return Result;
}
296
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
/// * \p Source already has an unconditional branch.
/// * \p Source is a degenerate block (no terminator because the BB is
///   the current head of the IR construction).
// NOTE(review): the signature (presumably taking Source, Target and a
// DebugLoc DL — original line 303) is missing from this listing —
// extraction artifact.
  if (Instruction *Term = Source->getTerminatorOrNull()) {
    auto *Br = cast<UncondBrInst>(Term);
    BasicBlock *Succ = Br->getSuccessor();
    // Retarget the existing unconditional branch, keeping the old
    // successor's PHI nodes consistent.
    Succ->removePredecessor(Source, /*KeepOneInputPHIs=*/true);
    Br->setSuccessor(Target);
    return;
  }

  // Degenerate block: append a fresh unconditional branch.
  auto *NewBr = UncondBrInst::Create(Target, Source);
  NewBr->setDebugLoc(DL);
}
315
// NOTE(review): the first signature line (presumably spliceBB taking an
// insert point IP and the destination block New — original line 316) is
// missing from this listing — extraction artifact.
                    bool CreateBranch, DebugLoc DL) {
  assert(New->getFirstInsertionPt() == New->begin() &&
         "Target BB must not have PHI nodes");

  // Move instructions to new block.
  BasicBlock *Old = IP.getBlock();
  // If the `Old` block is empty then there are no instructions to move. But in
  // the new debug scheme, it could have trailing debug records which will be
  // moved to `New` in `spliceDebugInfoEmptyBlock`. We dont want that for 2
  // reasons:
  // 1. If `New` is also empty, `BasicBlock::splice` crashes.
  // 2. Even if `New` is not empty, the rationale to move those records to `New`
  // (in `spliceDebugInfoEmptyBlock`) does not apply here. That function
  // assumes that `Old` is optimized out and is going away. This is not the case
  // here. The `Old` block is still being used e.g. a branch instruction is
  // added to it later in this function.
  // So we call `BasicBlock::splice` only when `Old` is not empty.
  if (!Old->empty())
    New->splice(New->begin(), Old, IP.getPoint(), Old->end());

  if (CreateBranch) {
    auto *NewBr = UncondBrInst::Create(New, Old);
    NewBr->setDebugLoc(DL);
  }
}
342
343void llvm::spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch) {
344 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
345 BasicBlock *Old = Builder.GetInsertBlock();
346
347 spliceBB(Builder.saveIP(), New, CreateBranch, DebugLoc);
348 if (CreateBranch)
349 Builder.SetInsertPoint(Old->getTerminator());
350 else
351 Builder.SetInsertPoint(Old);
352
353 // SetInsertPoint also updates the Builder's debug location, but we want to
354 // keep the one the Builder was configured to use.
355 Builder.SetCurrentDebugLocation(DebugLoc);
356}
357
// NOTE(review): the signature (presumably splitBB taking an insert point and
// CreateBranch — original line 358) and the start of the BasicBlock::Create
// call (line 361) are missing from this listing — extraction artifact.
                             DebugLoc DL, llvm::Twine Name) {
  BasicBlock *Old = IP.getBlock();
  // Name the new block after the old one unless an explicit name was given.
      Old->getContext(), Name.isTriviallyEmpty() ? Old->getName() : Name,
      Old->getParent(), Old->getNextNode());
  spliceBB(IP, New, CreateBranch, DL);
  New->replaceSuccessorsPhiUsesWith(Old, New);
  return New;
}
368
369BasicBlock *llvm::splitBB(IRBuilderBase &Builder, bool CreateBranch,
370 llvm::Twine Name) {
371 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
372 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
373 if (CreateBranch)
374 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
375 else
376 Builder.SetInsertPoint(Builder.GetInsertBlock());
377 // SetInsertPoint also updates the Builder's debug location, but we want to
378 // keep the one the Builder was configured to use.
379 Builder.SetCurrentDebugLocation(DebugLoc);
380 return New;
381}
382
383BasicBlock *llvm::splitBB(IRBuilder<> &Builder, bool CreateBranch,
384 llvm::Twine Name) {
385 DebugLoc DebugLoc = Builder.getCurrentDebugLocation();
386 BasicBlock *New = splitBB(Builder.saveIP(), CreateBranch, DebugLoc, Name);
387 if (CreateBranch)
388 Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
389 else
390 Builder.SetInsertPoint(Builder.GetInsertBlock());
391 // SetInsertPoint also updates the Builder's debug location, but we want to
392 // keep the one the Builder was configured to use.
393 Builder.SetCurrentDebugLocation(DebugLoc);
394 return New;
395}
396
// NOTE(review): the first signature line (presumably splitBBWithSuffix
// taking a builder and CreateBranch — original line 397) is missing from
// this listing — extraction artifact.
                                  llvm::Twine Suffix) {
  BasicBlock *Old = Builder.GetInsertBlock();
  // Derive the new block's name from the current block name plus \p Suffix.
  return splitBB(Builder, CreateBranch, Old->getName() + Suffix);
}
402
// This function creates a fake integer value and a fake use for the integer
// value. It returns the fake value created. This is useful in modeling the
// extra arguments to the outlined functions.
// NOTE(review): the first signature lines (function name, the Builder, and —
// judging by the body — a ToBeDeleted container parameter; original lines
// 406 and 408) are missing from this listing — extraction artifact.
                                OpenMPIRBuilder::InsertPointTy OuterAllocaIP,
                                OpenMPIRBuilder::InsertPointTy InnerAllocaIP,
                                const Twine &Name = "", bool AsPtr = true,
                                bool Is64Bit = false) {
  // Allocate the fake value in the outer alloca scope.
  Builder.restoreIP(OuterAllocaIP);
  IntegerType *IntTy = Is64Bit ? Builder.getInt64Ty() : Builder.getInt32Ty();
  Instruction *FakeVal;
  AllocaInst *FakeValAddr =
      Builder.CreateAlloca(IntTy, nullptr, Name + ".addr");
  // Everything created here is queued for deletion after outlining.
  ToBeDeleted.push_back(FakeValAddr);

  if (AsPtr) {
    FakeVal = FakeValAddr;
  } else {
    FakeVal = Builder.CreateLoad(IntTy, FakeValAddr, Name + ".val");
    ToBeDeleted.push_back(FakeVal);
  }

  // Generate a fake use of this value
  Builder.restoreIP(InnerAllocaIP);
  Instruction *UseFakeVal;
  if (AsPtr) {
    UseFakeVal = Builder.CreateLoad(IntTy, FakeVal, Name + ".use");
  } else {
    UseFakeVal = cast<BinaryOperator>(Builder.CreateAdd(
        FakeVal, Is64Bit ? Builder.getInt64(10) : Builder.getInt32(10)));
  }
  ToBeDeleted.push_back(UseFakeVal);
  return FakeVal;
}
438
//===----------------------------------------------------------------------===//
// OpenMPIRBuilderConfig
//===----------------------------------------------------------------------===//

namespace {
/// Values for bit flags for marking which requires clauses have been used.
/// Combined into the OpenMPIRBuilderConfig::RequiresFlags bitmask below.
enum OpenMPOffloadingRequiresDirFlags {
  /// flag undefined.
  OMP_REQ_UNDEFINED = 0x000,
  /// no requires directive present.
  OMP_REQ_NONE = 0x001,
  /// reverse_offload clause.
  OMP_REQ_REVERSE_OFFLOAD = 0x002,
  /// unified_address clause.
  OMP_REQ_UNIFIED_ADDRESS = 0x004,
  /// unified_shared_memory clause.
  OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008,
  /// dynamic_allocators clause.
  OMP_REQ_DYNAMIC_ALLOCATORS = 0x010,
  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/OMP_REQ_DYNAMIC_ALLOCATORS)
};

} // anonymous namespace
463
// Default configuration: no 'requires' clauses registered yet.
// NOTE(review): the constructor header (original line 464) is missing from
// this listing — extraction artifact.
    : RequiresFlags(OMP_REQ_UNDEFINED) {}

// Flag-based constructor: folds each 'requires' clause boolean into the
// RequiresFlags bitmask.
// NOTE(review): the first header lines (original 467-468) and part of the
// member-initializer list (471-472) are missing from this listing.
    bool HasRequiresReverseOffload, bool HasRequiresUnifiedAddress,
    bool HasRequiresUnifiedSharedMemory, bool HasRequiresDynamicAllocators)
      RequiresFlags(OMP_REQ_UNDEFINED) {
  if (HasRequiresReverseOffload)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  if (HasRequiresUnifiedAddress)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  if (HasRequiresUnifiedSharedMemory)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  if (HasRequiresDynamicAllocators)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
}
483
// NOTE(review): the declarator line of each of the following five accessors
// (original lines 484, 488, 492, 496, 500) is missing from this listing —
// extraction artifact. Each tests one bit of the RequiresFlags bitmask.

// Tests the reverse_offload bit.
  return RequiresFlags & OMP_REQ_REVERSE_OFFLOAD;
}

// Tests the unified_address bit.
  return RequiresFlags & OMP_REQ_UNIFIED_ADDRESS;
}

// Tests the unified_shared_memory bit.
  return RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY;
}

// Tests the dynamic_allocators bit.
  return RequiresFlags & OMP_REQ_DYNAMIC_ALLOCATORS;
}

// Returns the raw flags, or OMP_REQ_NONE when no requires directive was
// registered.
  return hasRequiresFlags() ? RequiresFlags
                            : static_cast<int64_t>(OMP_REQ_NONE);
}
504
// NOTE(review): the declarator line of each of the following four setters
// (original lines 505, 512, 519, 526) is missing from this listing —
// extraction artifact. Each sets or clears one RequiresFlags bit from the
// boolean 'Value' parameter.

// Set/clear the reverse_offload bit.
  if (Value)
    RequiresFlags |= OMP_REQ_REVERSE_OFFLOAD;
  else
    RequiresFlags &= ~OMP_REQ_REVERSE_OFFLOAD;
}

// Set/clear the unified_address bit.
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_ADDRESS;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_ADDRESS;
}

// Set/clear the unified_shared_memory bit.
  if (Value)
    RequiresFlags |= OMP_REQ_UNIFIED_SHARED_MEMORY;
  else
    RequiresFlags &= ~OMP_REQ_UNIFIED_SHARED_MEMORY;
}

// Set/clear the dynamic_allocators bit.
  if (Value)
    RequiresFlags |= OMP_REQ_DYNAMIC_ALLOCATORS;
  else
    RequiresFlags &= ~OMP_REQ_DYNAMIC_ALLOCATORS;
}
532
//===----------------------------------------------------------------------===//
// OpenMPIRBuilder
//===----------------------------------------------------------------------===//

// NOTE(review): the first signature lines (original 537-538) and the line
// defining 'Version' (540) are missing from this listing — extraction
// artifact. The body packs the kernel-launch arguments into ArgsVector.
                                           SmallVector<Value *> &ArgsVector) {
  Value *PointerNum = Builder.getInt32(KernelArgs.NumTargetItems);
  auto Int32Ty = Type::getInt32Ty(Builder.getContext());
  constexpr size_t MaxDim = 3;
  Value *ZeroArray = Constant::getNullValue(ArrayType::get(Int32Ty, MaxDim));

  Value *HasNoWaitFlag = Builder.getInt64(KernelArgs.HasNoWait);

  // The fallback policy is shifted left by 2 and OR'd with the nowait bit to
  // form the combined flags word.
  Value *DynCGroupMemFallbackFlag =
      Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
  DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
  Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);

  assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());

  // Expand num_teams/num_threads into 3-element arrays; unused trailing
  // dimensions stay zero (from ZeroArray).
  Value *NumTeams3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumTeams[0], {0});
  Value *NumThreads3D =
      Builder.CreateInsertValue(ZeroArray, KernelArgs.NumThreads[0], {0});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumTeams.size(), MaxDim)))
    NumTeams3D =
        Builder.CreateInsertValue(NumTeams3D, KernelArgs.NumTeams[I], {I});
  for (unsigned I :
       seq<unsigned>(1, std::min(KernelArgs.NumThreads.size(), MaxDim)))
    NumThreads3D =
        Builder.CreateInsertValue(NumThreads3D, KernelArgs.NumThreads[I], {I});

  ArgsVector = {Version,
                PointerNum,
                KernelArgs.RTArgs.BasePointersArray,
                KernelArgs.RTArgs.PointersArray,
                KernelArgs.RTArgs.SizesArray,
                KernelArgs.RTArgs.MapTypesArray,
                KernelArgs.RTArgs.MapNamesArray,
                KernelArgs.RTArgs.MappersArray,
                KernelArgs.NumIterations,
                Flags,
                NumTeams3D,
                NumThreads3D,
                KernelArgs.DynCGroupMem};
}
582
// NOTE(review): the signature (presumably addAttributes(RuntimeFunction FnID,
// Function &Fn) — original line 583) is missing from this listing —
// extraction artifact. Applies the attribute sets declared in OMPKinds.def
// to the given runtime-function declaration.
  LLVMContext &Ctx = Fn.getContext();

  // Get the function's current attributes.
  auto Attrs = Fn.getAttributes();
  auto FnAttrs = Attrs.getFnAttrs();
  auto RetAttrs = Attrs.getRetAttrs();
  // NOTE(review): the declaration of ArgAttrs (original line 590) is missing
  // from this listing.
  for (size_t ArgNo = 0; ArgNo < Fn.arg_size(); ++ArgNo)
    ArgAttrs.emplace_back(Attrs.getParamAttrs(ArgNo));

  // Add AS to FnAS while taking special care with integer extensions.
  auto addAttrSet = [&](AttributeSet &FnAS, const AttributeSet &AS,
                        bool Param = true) -> void {
    bool HasSignExt = AS.hasAttribute(Attribute::SExt);
    bool HasZeroExt = AS.hasAttribute(Attribute::ZExt);
    if (HasSignExt || HasZeroExt) {
      assert(AS.getNumAttributes() == 1 &&
             "Currently not handling extension attr combined with others.");
      // Query the target's preferred i32 extension attribute instead of
      // copying the declared one verbatim.
      if (Param) {
        if (auto AK = TargetLibraryInfo::getExtAttrForI32Param(T, HasSignExt))
          FnAS = FnAS.addAttribute(Ctx, AK);
      } else if (auto AK =
                     TargetLibraryInfo::getExtAttrForI32Return(T, HasSignExt))
        FnAS = FnAS.addAttribute(Ctx, AK);
    } else {
      FnAS = FnAS.addAttributes(Ctx, AS);
    }
  };

#define OMP_ATTRS_SET(VarName, AttrSet) AttributeSet VarName = AttrSet;
#include "llvm/Frontend/OpenMP/OMPKinds.def"

  // Add attributes to the function declaration.
  switch (FnID) {
#define OMP_RTL_ATTRS(Enum, FnAttrSet, RetAttrSet, ArgAttrSets)                \
  case Enum:                                                                   \
    FnAttrs = FnAttrs.addAttributes(Ctx, FnAttrSet);                           \
    addAttrSet(RetAttrs, RetAttrSet, /*Param*/ false);                         \
    for (size_t ArgNo = 0; ArgNo < ArgAttrSets.size(); ++ArgNo)                \
      addAttrSet(ArgAttrs[ArgNo], ArgAttrSets[ArgNo]);                         \
    Fn.setAttributes(AttributeList::get(Ctx, FnAttrs, RetAttrs, ArgAttrs));    \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  default:
    // Attributes are optional.
    break;
  }
}
632
// NOTE(review): the signature (presumably getOrCreateRuntimeFunction —
// original lines 633-634) is missing from this listing — extraction
// artifact. Looks up (or declares) an OpenMP runtime function by FnID.
  FunctionType *FnTy = nullptr;
  Function *Fn = nullptr;

  // Try to find the declation in the module first.
  switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...)                          \
  case Enum:                                                                   \
    FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__},        \
                             IsVarArg);                                        \
    Fn = M.getFunction(Str);                                                   \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
  }

  if (!Fn) {
    // Create a new declaration if we need one.
    switch (FnID) {
#define OMP_RTL(Enum, Str, ...)                                                \
  case Enum:                                                                   \
    Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M);         \
    break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
    }
    Fn->setCallingConv(Config.getRuntimeCC());
    // Add information if the runtime function takes a callback function
    if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
      if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
        LLVMContext &Ctx = Fn->getContext();
        MDBuilder MDB(Ctx);
        // Annotate the callback behavior of the runtime function:
        // - The callback callee is argument number 2 (microtask).
        // - The first two arguments of the callback callee are unknown (-1).
        // - All variadic arguments to the runtime function are passed to the
        //   callback callee.
        Fn->addMetadata(
            LLVMContext::MD_callback,
            // NOTE(review): the start of the metadata-node expression
            // (original line 671) is missing from this listing.
                2, {-1, -1}, /* VarArgsArePassed */ true)}));
      }
    }

    LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
    addAttributes(FnID, *Fn);

  } else {
    LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
                      << " with type " << *Fn->getFunctionType() << "\n");
  }

  assert(Fn && "Failed to create OpenMP runtime function");

  return {FnTy, Fn};
}
689
// NOTE(review): the signature (original lines 690-691) is missing from this
// listing — extraction artifact. Lazily creates FiniBB and runs the
// finalization callback FiniCB inside it.
  if (!FiniBB) {
    Function *ParentFunc = Builder.GetInsertBlock()->getParent();
    // NOTE(review): original line 694 is missing from this listing.
    FiniBB = BasicBlock::Create(Builder.getContext(), ".fini", ParentFunc);
    Builder.SetInsertPoint(FiniBB);
    // FiniCB adds the branch to the exit stub.
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;
  }
  return FiniBB;
}
703
// NOTE(review): the first signature line (original line 704) is missing from
// this listing — extraction artifact. Merges FiniBB into OtherFiniBB.
                                           BasicBlock *OtherFiniBB) {
  // Simple case: FiniBB does not exist yet: re-use OtherFiniBB.
  if (!FiniBB) {
    FiniBB = OtherFiniBB;

    Builder.SetInsertPoint(FiniBB->getFirstNonPHIIt());
    if (Error Err = FiniCB(Builder.saveIP()))
      return Err;

    return Error::success();
  }

  // Move instructions from FiniBB to the start of OtherFiniBB.
  auto EndIt = FiniBB->end();
  // Leave FiniBB's terminator (if any) behind so only the body is spliced.
  if (FiniBB->size() >= 1)
    if (auto Prev = std::prev(EndIt); Prev->isTerminator())
      EndIt = Prev;
  OtherFiniBB->splice(OtherFiniBB->getFirstNonPHIIt(), FiniBB, FiniBB->begin(),
                      EndIt);

  // Redirect all uses of the now-empty FiniBB and delete it.
  FiniBB->replaceAllUsesWith(OtherFiniBB);
  FiniBB->eraseFromParent();
  FiniBB = OtherFiniBB;
  return Error::success();
}
730
// NOTE(review): the signature (original lines 731-732; presumably
// getOrCreateRuntimeFunctionPtr) is missing from this listing — extraction
// artifact. Returns the runtime callee as a Function*.
  auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
  assert(Fn && "Failed to create OpenMP runtime function pointer");
  return Fn;
}
737
// NOTE(review): the first signature lines (original 738-739) are missing
// from this listing — extraction artifact. Emits a call to an OpenMP runtime
// function and applies the configured runtime calling convention.
                                            StringRef Name) {
  CallInst *Call = Builder.CreateCall(Callee, Args, Name);
  Call->setCallingConv(Config.getRuntimeCC());
  return Call;
}
745
/// One-time setup: populates the cached LLVM types for module \p M.
void OpenMPIRBuilder::initialize() { initializeTypes(M); }
747
// NOTE(review): the function header (original lines 748-749; a helper taking
// Function *Function, judging by the body) is missing from this listing —
// extraction artifact.
  BasicBlock &EntryBlock = Function->getEntryBlock();
  BasicBlock::iterator MoveLocInst = EntryBlock.getFirstNonPHIIt();

  // Loop over blocks looking for constant allocas, skipping the entry block
  // as any allocas there are already in the desired location.
  for (auto Block = std::next(Function->begin(), 1); Block != Function->end();
       Block++) {
    for (auto Inst = Block->getReverseIterator()->begin();
         Inst != Block->getReverseIterator()->end();) {
      // NOTE(review): the dyn_cast and its condition (original lines 759 and
      // 761) are missing from this listing — extraction artifact; the
      // visible statements below belong to the elided if/else.
      Inst++;
      continue;
      AllocaInst->moveBeforePreserving(MoveLocInst);
      } else {
        Inst++;
      }
    }
  }
}
770
// NOTE(review): the function header and the declaration of AllocasToMove
// (original lines 771-772, 778, 782 and 790) are missing from this listing —
// extraction artifact. The visible code collects hoistable allocas from
// \c Block and moves them before the entry block's terminator.

  auto ShouldHoistAlloca = [](const llvm::AllocaInst &AllocaInst) {
    // TODO: For now, we support simple static allocations, we might need to
    // move non-static ones as well. However, this will need further analysis to
    // move the length arguments as well.
  };

  for (llvm::Instruction &Inst : Block)
    if (ShouldHoistAlloca(*AllocaInst))
      AllocasToMove.push_back(AllocaInst);

  auto InsertPoint =
      Block.getParent()->getEntryBlock().getTerminator()->getIterator();

  for (llvm::Instruction *AllocaInst : AllocasToMove)
}
792
// NOTE(review): the function header (original line 793) is missing from this
// listing — extraction artifact. Walks all blocks of \c Func that properly
// post-dominate the entry block.
  PostDominatorTree PostDomTree(*Func);
  for (llvm::BasicBlock &BB : *Func)
    if (PostDomTree.properlyDominates(&BB, &Func->getEntryBlock()))
    // NOTE(review): the statement guarded by this 'if' (original line 797)
    // is missing from this listing — extraction artifact.
}
799
// NOTE(review): the signature (presumably OpenMPIRBuilder::finalize with an
// optional Function *Fn filter — original line 800) and the declaration of
// 'Blocks' (line 802) are missing from this listing — extraction artifact.
// Outlines every pending region recorded in OutlineInfos, then emits
// module-level finalization (offload metadata, llvm.compiler.used).
  SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
  SmallVector<OutlineInfo, 16> DeferredOutlines;
  for (OutlineInfo &OI : OutlineInfos) {
    // Skip functions that have not finalized yet; may happen with nested
    // function generation.
    if (Fn && OI.getFunction() != Fn) {
      DeferredOutlines.push_back(OI);
      continue;
    }

    ParallelRegionBlockSet.clear();
    Blocks.clear();
    OI.collectBlocks(ParallelRegionBlockSet, Blocks);

    Function *OuterFn = OI.getFunction();
    CodeExtractorAnalysisCache CEAC(*OuterFn);
    // If we generate code for the target device, we need to allocate
    // struct for aggregate params in the device default alloca address space.
    // OpenMP runtime requires that the params of the extracted functions are
    // passed as zero address space pointers. This flag ensures that
    // CodeExtractor generates correct code for extracted functions
    // which are used by OpenMP runtime.
    bool ArgsInZeroAddressSpace = Config.isTargetDevice();
    CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
                            /* AggregateArgs */ true,
                            /* BlockFrequencyInfo */ nullptr,
                            /* BranchProbabilityInfo */ nullptr,
                            /* AssumptionCache */ nullptr,
                            /* AllowVarArgs */ true,
                            /* AllowAlloca */ true,
                            /* AllocaBlock*/ OI.OuterAllocaBB,
                            /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);

    LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
                      << " Exit: " << OI.ExitBB->getName() << "\n");
    assert(Extractor.isEligible() &&
           "Expected OpenMP outlining to be possible!");

    for (auto *V : OI.ExcludeArgsFromAggregate)
      Extractor.excludeArgFromAggregate(V);

    Function *OutlinedFn =
        Extractor.extractCodeRegion(CEAC, OI.Inputs, OI.Outputs);

    // Forward target-cpu, target-features attributes to the outlined function.
    auto TargetCpuAttr = OuterFn->getFnAttribute("target-cpu");
    if (TargetCpuAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetCpuAttr);

    auto TargetFeaturesAttr = OuterFn->getFnAttribute("target-features");
    if (TargetFeaturesAttr.isStringAttribute())
      OutlinedFn->addFnAttr(TargetFeaturesAttr);

    LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
    LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
    assert(OutlinedFn->getReturnType()->isVoidTy() &&
           "OpenMP outlined functions should not return a value!");

    // For compability with the clang CG we move the outlined function after the
    // one with the parallel region.
    OutlinedFn->removeFromParent();
    M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);

    // Remove the artificial entry introduced by the extractor right away, we
    // made our own entry block after all.
    {
      BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
      assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
      assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
      // Move instructions from the to-be-deleted ArtificialEntry to the entry
      // basic block of the parallel region. CodeExtractor generates
      // instructions to unwrap the aggregate argument and may sink
      // allocas/bitcasts for values that are solely used in the outlined region
      // and do not escape.
      assert(!ArtificialEntry.empty() &&
             "Expected instructions to add in the outlined region entry");
      for (BasicBlock::reverse_iterator It = ArtificialEntry.rbegin(),
                                        End = ArtificialEntry.rend();
           It != End;) {
        Instruction &I = *It;
        It++;

        if (I.isTerminator()) {
          // Absorb any debug value that terminator may have
          if (Instruction *TI = OI.EntryBB->getTerminatorOrNull())
            TI->adoptDbgRecords(&ArtificialEntry, I.getIterator(), false);
          continue;
        }

        I.moveBeforePreserving(*OI.EntryBB, OI.EntryBB->getFirstInsertionPt());
      }

      OI.EntryBB->moveBefore(&ArtificialEntry);
      ArtificialEntry.eraseFromParent();
    }
    assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
    assert(OutlinedFn && OutlinedFn->hasNUses(1));

    // Run a user callback, e.g. to add attributes.
    if (OI.PostOutlineCB)
      OI.PostOutlineCB(*OutlinedFn);

    if (OI.FixUpNonEntryAllocas)
    // NOTE(review): the statement guarded by this 'if' (original line 906)
    // is missing from this listing — extraction artifact.
  }

  // Remove work items that have been completed.
  OutlineInfos = std::move(DeferredOutlines);

  // The createTarget functions embeds user written code into
  // the target region which may inject allocas which need to
  // be moved to the entry block of our target or risk malformed
  // optimisations by later passes, this is only relevant for
  // the device pass which appears to be a little more delicate
  // when it comes to optimisations (however, we do not block on
  // that here, it's up to the inserter to the list to do so).
  // This notbaly has to occur after the OutlinedInfo candidates
  // have been extracted so we have an end product that will not
  // be implicitly adversely affected by any raises unless
  // intentionally appended to the list.
  // NOTE: This only does so for ConstantData, it could be extended
  // to ConstantExpr's with further effort, however, they should
  // largely be folded when they get here. Extending it to runtime
  // defined/read+writeable allocation sizes would be non-trivial
  // (need to factor in movement of any stores to variables the
  // allocation size depends on, as well as the usual loads,
  // otherwise it'll yield the wrong result after movement) and
  // likely be more suitable as an LLVM optimisation pass.
  // NOTE(review): the loop implementing the above (original lines 931-932)
  // is missing from this listing — extraction artifact.

  EmitMetadataErrorReportFunctionTy &&ErrorReportFn =
      [](EmitMetadataErrorKind Kind,
         const TargetRegionEntryInfo &EntryInfo) -> void {
    errs() << "Error of kind: " << Kind
           << " when emitting offload entries and metadata during "
              "OMPIRBuilder finalization \n";
  };

  if (!OffloadInfoManager.empty())
  // NOTE(review): the guarded call (original line 943, presumably emitting
  // the offload entries/metadata with ErrorReportFn) is missing from this
  // listing — extraction artifact.

  if (Config.EmitLLVMUsedMetaInfo.value_or(false)) {
    std::vector<WeakTrackingVH> LLVMCompilerUsed = {
        M.getGlobalVariable("__openmp_nvptx_data_transfer_temporary_storage")};
    emitUsed("llvm.compiler.used", LLVMCompilerUsed);
  }

  IsFinalized = true;
}
953
954bool OpenMPIRBuilder::isFinalized() { return IsFinalized; }
955
957 assert(OutlineInfos.empty() && "There must be no outstanding outlinings");
958}
959
961 IntegerType *I32Ty = Type::getInt32Ty(M.getContext());
962 auto *GV =
963 new GlobalVariable(M, I32Ty,
964 /* isConstant = */ true, GlobalValue::WeakODRLinkage,
965 ConstantInt::get(I32Ty, Value), Name);
966 GV->setVisibility(GlobalValue::HiddenVisibility);
967
968 return GV;
969}
970
972 if (List.empty())
973 return;
974
975 // Convert List to what ConstantArray needs.
977 UsedArray.resize(List.size());
978 for (unsigned I = 0, E = List.size(); I != E; ++I)
980 cast<Constant>(&*List[I]), Builder.getPtrTy());
981
982 if (UsedArray.empty())
983 return;
984 ArrayType *ATy = ArrayType::get(Builder.getPtrTy(), UsedArray.size());
985
986 auto *GV = new GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
987 ConstantArray::get(ATy, UsedArray), Name);
988
989 GV->setSection("llvm.metadata");
990}
991
994 OMPTgtExecModeFlags Mode) {
995 auto *Int8Ty = Builder.getInt8Ty();
996 auto *GVMode = new GlobalVariable(
997 M, Int8Ty, /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
998 ConstantInt::get(Int8Ty, Mode), Twine(KernelName, "_exec_mode"));
999 GVMode->setVisibility(GlobalVariable::ProtectedVisibility);
1000 return GVMode;
1001}
1002
1004 uint32_t SrcLocStrSize,
1005 IdentFlag LocFlags,
1006 unsigned Reserve2Flags) {
1007 // Enable "C-mode".
1008 LocFlags |= OMP_IDENT_FLAG_KMPC;
1009
1010 Constant *&Ident =
1011 IdentMap[{SrcLocStr, uint64_t(LocFlags) << 31 | Reserve2Flags}];
1012 if (!Ident) {
1013 Constant *I32Null = ConstantInt::getNullValue(Int32);
1014 Constant *IdentData[] = {I32Null,
1015 ConstantInt::get(Int32, uint32_t(LocFlags)),
1016 ConstantInt::get(Int32, Reserve2Flags),
1017 ConstantInt::get(Int32, SrcLocStrSize), SrcLocStr};
1018
1019 size_t SrcLocStrArgIdx = 4;
1020 if (OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx)
1022 IdentData[SrcLocStrArgIdx]->getType()->getPointerAddressSpace())
1023 IdentData[SrcLocStrArgIdx] = ConstantExpr::getAddrSpaceCast(
1024 SrcLocStr, OpenMPIRBuilder::Ident->getElementType(SrcLocStrArgIdx));
1025 Constant *Initializer =
1026 ConstantStruct::get(OpenMPIRBuilder::Ident, IdentData);
1027
1028 // Look for existing encoding of the location + flags, not needed but
1029 // minimizes the difference to the existing solution while we transition.
1030 for (GlobalVariable &GV : M.globals())
1031 if (GV.getValueType() == OpenMPIRBuilder::Ident && GV.hasInitializer())
1032 if (GV.getInitializer() == Initializer)
1033 Ident = &GV;
1034
1035 if (!Ident) {
1036 auto *GV = new GlobalVariable(
1037 M, OpenMPIRBuilder::Ident,
1038 /* isConstant = */ true, GlobalValue::PrivateLinkage, Initializer, "",
1040 M.getDataLayout().getDefaultGlobalsAddressSpace());
1041 GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
1042 GV->setAlignment(Align(8));
1043 Ident = GV;
1044 }
1045 }
1046
1047 return ConstantExpr::getPointerBitCastOrAddrSpaceCast(Ident, IdentPtr);
1048}
1049
1051 uint32_t &SrcLocStrSize) {
1052 SrcLocStrSize = LocStr.size();
1053 Constant *&SrcLocStr = SrcLocStrMap[LocStr];
1054 if (!SrcLocStr) {
1055 Constant *Initializer =
1056 ConstantDataArray::getString(M.getContext(), LocStr);
1057
1058 // Look for existing encoding of the location, not needed but minimizes the
1059 // difference to the existing solution while we transition.
1060 for (GlobalVariable &GV : M.globals())
1061 if (GV.isConstant() && GV.hasInitializer() &&
1062 GV.getInitializer() == Initializer)
1063 return SrcLocStr = ConstantExpr::getPointerCast(&GV, Int8Ptr);
1064
1065 SrcLocStr = Builder.CreateGlobalString(
1066 LocStr, /*Name=*/"", M.getDataLayout().getDefaultGlobalsAddressSpace(),
1067 &M);
1068 }
1069 return SrcLocStr;
1070}
1071
1073 StringRef FileName,
1074 unsigned Line, unsigned Column,
1075 uint32_t &SrcLocStrSize) {
1076 SmallString<128> Buffer;
1077 Buffer.push_back(';');
1078 Buffer.append(FileName);
1079 Buffer.push_back(';');
1080 Buffer.append(FunctionName);
1081 Buffer.push_back(';');
1082 Buffer.append(std::to_string(Line));
1083 Buffer.push_back(';');
1084 Buffer.append(std::to_string(Column));
1085 Buffer.push_back(';');
1086 Buffer.push_back(';');
1087 return getOrCreateSrcLocStr(Buffer.str(), SrcLocStrSize);
1088}
1089
1090Constant *
1092 StringRef UnknownLoc = ";unknown;unknown;0;0;;";
1093 return getOrCreateSrcLocStr(UnknownLoc, SrcLocStrSize);
1094}
1095
1097 uint32_t &SrcLocStrSize,
1098 Function *F) {
1099 DILocation *DIL = DL.get();
1100 if (!DIL)
1101 return getOrCreateDefaultSrcLocStr(SrcLocStrSize);
1102 StringRef FileName = M.getName();
1103 if (DIFile *DIF = DIL->getFile())
1104 if (std::optional<StringRef> Source = DIF->getSource())
1105 FileName = *Source;
1106 StringRef Function = DIL->getScope()->getSubprogram()->getName();
1107 if (Function.empty() && F)
1108 Function = F->getName();
1109 return getOrCreateSrcLocStr(Function, FileName, DIL->getLine(),
1110 DIL->getColumn(), SrcLocStrSize);
1111}
1112
1114 uint32_t &SrcLocStrSize) {
1115 return getOrCreateSrcLocStr(Loc.DL, SrcLocStrSize,
1116 Loc.IP.getBlock()->getParent());
1117}
1118
1121 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
1122 "omp_global_thread_num");
1123}
1124
1127 bool ForceSimpleCall, bool CheckCancelFlag) {
1128 if (!updateToLocation(Loc))
1129 return Loc.IP;
1130
1131 // Build call __kmpc_cancel_barrier(loc, thread_id) or
1132 // __kmpc_barrier(loc, thread_id);
1133
1134 IdentFlag BarrierLocFlags;
1135 switch (Kind) {
1136 case OMPD_for:
1137 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_FOR;
1138 break;
1139 case OMPD_sections:
1140 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SECTIONS;
1141 break;
1142 case OMPD_single:
1143 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL_SINGLE;
1144 break;
1145 case OMPD_barrier:
1146 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_EXPL;
1147 break;
1148 default:
1149 BarrierLocFlags = OMP_IDENT_FLAG_BARRIER_IMPL;
1150 break;
1151 }
1152
1153 uint32_t SrcLocStrSize;
1154 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1155 Value *Args[] = {
1156 getOrCreateIdent(SrcLocStr, SrcLocStrSize, BarrierLocFlags),
1157 getOrCreateThreadID(getOrCreateIdent(SrcLocStr, SrcLocStrSize))};
1158
1159 // If we are in a cancellable parallel region, barriers are cancellation
1160 // points.
1161 // TODO: Check why we would force simple calls or to ignore the cancel flag.
1162 bool UseCancelBarrier =
1163 !ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
1164
1166 getOrCreateRuntimeFunctionPtr(UseCancelBarrier
1167 ? OMPRTL___kmpc_cancel_barrier
1168 : OMPRTL___kmpc_barrier),
1169 Args);
1170
1171 if (UseCancelBarrier && CheckCancelFlag)
1172 if (Error Err = emitCancelationCheckImpl(Result, OMPD_parallel))
1173 return Err;
1174
1175 return Builder.saveIP();
1176}
1177
1180 Value *IfCondition,
1181 omp::Directive CanceledDirective) {
1182 if (!updateToLocation(Loc))
1183 return Loc.IP;
1184
1185 // LLVM utilities like blocks with terminators.
1186 auto *UI = Builder.CreateUnreachable();
1187
1188 Instruction *ThenTI = UI, *ElseTI = nullptr;
1189 if (IfCondition) {
1190 SplitBlockAndInsertIfThenElse(IfCondition, UI, &ThenTI, &ElseTI);
1191
1192 // Even if the if condition evaluates to false, this should count as a
1193 // cancellation point
1194 Builder.SetInsertPoint(ElseTI);
1195 auto ElseIP = Builder.saveIP();
1196
1198 LocationDescription{ElseIP, Loc.DL}, CanceledDirective);
1199 if (!IPOrErr)
1200 return IPOrErr;
1201 }
1202
1203 Builder.SetInsertPoint(ThenTI);
1204
1205 Value *CancelKind = nullptr;
1206 switch (CanceledDirective) {
1207#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1208 case DirectiveEnum: \
1209 CancelKind = Builder.getInt32(Value); \
1210 break;
1211#include "llvm/Frontend/OpenMP/OMPKinds.def"
1212 default:
1213 llvm_unreachable("Unknown cancel kind!");
1214 }
1215
1216 uint32_t SrcLocStrSize;
1217 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1218 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1219 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1221 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
1222
1223 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1224 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1225 return Err;
1226
1227 // Update the insertion point and remove the terminator we introduced.
1228 Builder.SetInsertPoint(UI->getParent());
1229 UI->eraseFromParent();
1230
1231 return Builder.saveIP();
1232}
1233
1236 omp::Directive CanceledDirective) {
1237 if (!updateToLocation(Loc))
1238 return Loc.IP;
1239
1240 // LLVM utilities like blocks with terminators.
1241 auto *UI = Builder.CreateUnreachable();
1242 Builder.SetInsertPoint(UI);
1243
1244 Value *CancelKind = nullptr;
1245 switch (CanceledDirective) {
1246#define OMP_CANCEL_KIND(Enum, Str, DirectiveEnum, Value) \
1247 case DirectiveEnum: \
1248 CancelKind = Builder.getInt32(Value); \
1249 break;
1250#include "llvm/Frontend/OpenMP/OMPKinds.def"
1251 default:
1252 llvm_unreachable("Unknown cancel kind!");
1253 }
1254
1255 uint32_t SrcLocStrSize;
1256 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1257 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1258 Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
1260 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancellationpoint), Args);
1261
1262 // The actual cancel logic is shared with others, e.g., cancel_barriers.
1263 if (Error Err = emitCancelationCheckImpl(Result, CanceledDirective))
1264 return Err;
1265
1266 // Update the insertion point and remove the terminator we introduced.
1267 Builder.SetInsertPoint(UI->getParent());
1268 UI->eraseFromParent();
1269
1270 return Builder.saveIP();
1271}
1272
1274 const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return,
1275 Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads,
1276 Value *HostPtr, ArrayRef<Value *> KernelArgs) {
1277 if (!updateToLocation(Loc))
1278 return Loc.IP;
1279
1280 Builder.restoreIP(AllocaIP);
1281 auto *KernelArgsPtr =
1282 Builder.CreateAlloca(OpenMPIRBuilder::KernelArgs, nullptr, "kernel_args");
1284
1285 for (unsigned I = 0, Size = KernelArgs.size(); I != Size; ++I) {
1286 llvm::Value *Arg =
1287 Builder.CreateStructGEP(OpenMPIRBuilder::KernelArgs, KernelArgsPtr, I);
1288 Builder.CreateAlignedStore(
1289 KernelArgs[I], Arg,
1290 M.getDataLayout().getPrefTypeAlign(KernelArgs[I]->getType()));
1291 }
1292
1293 SmallVector<Value *> OffloadingArgs{Ident, DeviceID, NumTeams,
1294 NumThreads, HostPtr, KernelArgsPtr};
1295
1297 getOrCreateRuntimeFunction(M, OMPRTL___tgt_target_kernel),
1298 OffloadingArgs);
1299
1300 return Builder.saveIP();
1301}
1302
1304 const LocationDescription &Loc, Value *OutlinedFnID,
1305 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
1306 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP) {
1307
1308 if (!updateToLocation(Loc))
1309 return Loc.IP;
1310
1311 // On top of the arrays that were filled up, the target offloading call
1312 // takes as arguments the device id as well as the host pointer. The host
1313 // pointer is used by the runtime library to identify the current target
1314 // region, so it only has to be unique and not necessarily point to
1315 // anything. It could be the pointer to the outlined function that
1316 // implements the target region, but we aren't using that so that the
1317 // compiler doesn't need to keep that, and could therefore inline the host
1318 // function if proven worthwhile during optimization.
1319
1320 // From this point on, we need to have an ID of the target region defined.
1321 assert(OutlinedFnID && "Invalid outlined function ID!");
1322 (void)OutlinedFnID;
1323
1324 // Return value of the runtime offloading call.
1325 Value *Return = nullptr;
1326
1327 // Arguments for the target kernel.
1328 SmallVector<Value *> ArgsVector;
1329 getKernelArgsVector(Args, Builder, ArgsVector);
1330
1331 // The target region is an outlined function launched by the runtime
1332 // via calls to __tgt_target_kernel().
1333 //
1334 // Note that on the host and CPU targets, the runtime implementation of
1335 // these calls simply call the outlined function without forking threads.
1336 // The outlined functions themselves have runtime calls to
1337 // __kmpc_fork_teams() and __kmpc_fork() for this purpose, codegen'd by
1338 // the compiler in emitTeamsCall() and emitParallelCall().
1339 //
1340 // In contrast, on the NVPTX target, the implementation of
1341 // __tgt_target_teams() launches a GPU kernel with the requested number
1342 // of teams and threads so no additional calls to the runtime are required.
1343 // Check the error code and execute the host version if required.
1344 Builder.restoreIP(emitTargetKernel(
1345 Builder, AllocaIP, Return, RTLoc, DeviceID, Args.NumTeams.front(),
1346 Args.NumThreads.front(), OutlinedFnID, ArgsVector));
1347
1348 BasicBlock *OffloadFailedBlock =
1349 BasicBlock::Create(Builder.getContext(), "omp_offload.failed");
1350 BasicBlock *OffloadContBlock =
1351 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
1352 Value *Failed = Builder.CreateIsNotNull(Return);
1353 Builder.CreateCondBr(Failed, OffloadFailedBlock, OffloadContBlock);
1354
1355 auto CurFn = Builder.GetInsertBlock()->getParent();
1356 emitBlock(OffloadFailedBlock, CurFn);
1357 InsertPointOrErrorTy AfterIP = EmitTargetCallFallbackCB(Builder.saveIP());
1358 if (!AfterIP)
1359 return AfterIP.takeError();
1360 Builder.restoreIP(*AfterIP);
1361 emitBranch(OffloadContBlock);
1362 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
1363 return Builder.saveIP();
1364}
1365
1367 Value *CancelFlag, omp::Directive CanceledDirective) {
1368 assert(isLastFinalizationInfoCancellable(CanceledDirective) &&
1369 "Unexpected cancellation!");
1370
1371 // For a cancel barrier we create two new blocks.
1372 BasicBlock *BB = Builder.GetInsertBlock();
1373 BasicBlock *NonCancellationBlock;
1374 if (Builder.GetInsertPoint() == BB->end()) {
1375 // TODO: This branch will not be needed once we moved to the
1376 // OpenMPIRBuilder codegen completely.
1377 NonCancellationBlock = BasicBlock::Create(
1378 BB->getContext(), BB->getName() + ".cont", BB->getParent());
1379 } else {
1380 NonCancellationBlock = SplitBlock(BB, &*Builder.GetInsertPoint());
1382 Builder.SetInsertPoint(BB);
1383 }
1384 BasicBlock *CancellationBlock = BasicBlock::Create(
1385 BB->getContext(), BB->getName() + ".cncl", BB->getParent());
1386
1387 // Jump to them based on the return value.
1388 Value *Cmp = Builder.CreateIsNull(CancelFlag);
1389 Builder.CreateCondBr(Cmp, NonCancellationBlock, CancellationBlock,
1390 /* TODO weight */ nullptr, nullptr);
1391
1392 // From the cancellation block we finalize all variables and go to the
1393 // post finalization block that is known to the FiniCB callback.
1394 auto &FI = FinalizationStack.back();
1395 Expected<BasicBlock *> FiniBBOrErr = FI.getFiniBB(Builder);
1396 if (!FiniBBOrErr)
1397 return FiniBBOrErr.takeError();
1398 Builder.SetInsertPoint(CancellationBlock);
1399 Builder.CreateBr(*FiniBBOrErr);
1400
1401 // The continuation block is where code generation continues.
1402 Builder.SetInsertPoint(NonCancellationBlock, NonCancellationBlock->begin());
1403 return Error::success();
1404}
1405
1406// Callback used to create OpenMP runtime calls to support
1407// omp parallel clause for the device.
1408// We need to use this callback to replace call to the OutlinedFn in OuterFn
1409// by the call to the OpenMP DeviceRTL runtime function (kmpc_parallel_60)
1411 OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn,
1412 BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition,
1413 Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1414 Value *ThreadID, const SmallVector<Instruction *, 4> &ToBeDeleted) {
1415 // Add some known attributes.
1416 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1417 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1418 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1419 OutlinedFn.addParamAttr(0, Attribute::NoUndef);
1420 OutlinedFn.addParamAttr(1, Attribute::NoUndef);
1421 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1422
1423 assert(OutlinedFn.arg_size() >= 2 &&
1424 "Expected at least tid and bounded tid as arguments");
1425 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1426
1427 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1428 assert(CI && "Expected call instruction to outlined function");
1429 CI->getParent()->setName("omp_parallel");
1430
1431 Builder.SetInsertPoint(CI);
1432 Type *PtrTy = OMPIRBuilder->VoidPtr;
1433 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1434
1435 // Add alloca for kernel args
1436 OpenMPIRBuilder ::InsertPointTy CurrentIP = Builder.saveIP();
1437 Builder.SetInsertPoint(OuterAllocaBB, OuterAllocaBB->getFirstInsertionPt());
1438 AllocaInst *ArgsAlloca =
1439 Builder.CreateAlloca(ArrayType::get(PtrTy, NumCapturedVars));
1440 Value *Args = ArgsAlloca;
1441 // Add address space cast if array for storing arguments is not allocated
1442 // in address space 0
1443 if (ArgsAlloca->getAddressSpace())
1444 Args = Builder.CreatePointerCast(ArgsAlloca, PtrTy);
1445 Builder.restoreIP(CurrentIP);
1446
1447 // Store captured vars which are used by kmpc_parallel_60
1448 for (unsigned Idx = 0; Idx < NumCapturedVars; Idx++) {
1449 Value *V = *(CI->arg_begin() + 2 + Idx);
1450 Value *StoreAddress = Builder.CreateConstInBoundsGEP2_64(
1451 ArrayType::get(PtrTy, NumCapturedVars), Args, 0, Idx);
1452 Builder.CreateStore(V, StoreAddress);
1453 }
1454
1455 Value *Cond =
1456 IfCondition ? Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32)
1457 : Builder.getInt32(1);
1458
1459 // Build kmpc_parallel_60 call
1460 Value *Parallel60CallArgs[] = {
1461 /* identifier*/ Ident,
1462 /* global thread num*/ ThreadID,
1463 /* if expression */ Cond,
1464 /* number of threads */ NumThreads ? NumThreads : Builder.getInt32(-1),
1465 /* Proc bind */ Builder.getInt32(-1),
1466 /* outlined function */ &OutlinedFn,
1467 /* wrapper function */ NullPtrValue,
1468 /* arguments of the outlined funciton*/ Args,
1469 /* number of arguments */ Builder.getInt64(NumCapturedVars),
1470 /* strict for number of threads */ Builder.getInt32(0)};
1471
1472 FunctionCallee RTLFn =
1473 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_parallel_60);
1474
1475 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, Parallel60CallArgs);
1476
1477 LLVM_DEBUG(dbgs() << "With kmpc_parallel_60 placed: "
1478 << *Builder.GetInsertBlock()->getParent() << "\n");
1479
1480 // Initialize the local TID stack location with the argument value.
1481 Builder.SetInsertPoint(PrivTID);
1482 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1483 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1484 PrivTIDAddr);
1485
1486 // Remove redundant call to the outlined function.
1487 CI->eraseFromParent();
1488
1489 for (Instruction *I : ToBeDeleted) {
1490 I->eraseFromParent();
1491 }
1492}
1493
1494// Callback used to create OpenMP runtime calls to support
1495// omp parallel clause for the host.
1496// We need to use this callback to replace call to the OutlinedFn in OuterFn
1497// by the call to the OpenMP host runtime function ( __kmpc_fork_call[_if])
1498static void
1500 Function *OuterFn, Value *Ident, Value *IfCondition,
1501 Instruction *PrivTID, AllocaInst *PrivTIDAddr,
1502 const SmallVector<Instruction *, 4> &ToBeDeleted) {
1503 IRBuilder<> &Builder = OMPIRBuilder->Builder;
1504 FunctionCallee RTLFn;
1505 if (IfCondition) {
1506 RTLFn =
1507 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call_if);
1508 } else {
1509 RTLFn =
1510 OMPIRBuilder->getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
1511 }
1512 if (auto *F = dyn_cast<Function>(RTLFn.getCallee())) {
1513 if (!F->hasMetadata(LLVMContext::MD_callback)) {
1514 LLVMContext &Ctx = F->getContext();
1515 MDBuilder MDB(Ctx);
1516 // Annotate the callback behavior of the __kmpc_fork_call:
1517 // - The callback callee is argument number 2 (microtask).
1518 // - The first two arguments of the callback callee are unknown (-1).
1519 // - All variadic arguments to the __kmpc_fork_call are passed to the
1520 // callback callee.
1521 F->addMetadata(LLVMContext::MD_callback,
1523 2, {-1, -1},
1524 /* VarArgsArePassed */ true)}));
1525 }
1526 }
1527 // Add some known attributes.
1528 OutlinedFn.addParamAttr(0, Attribute::NoAlias);
1529 OutlinedFn.addParamAttr(1, Attribute::NoAlias);
1530 OutlinedFn.addFnAttr(Attribute::NoUnwind);
1531
1532 assert(OutlinedFn.arg_size() >= 2 &&
1533 "Expected at least tid and bounded tid as arguments");
1534 unsigned NumCapturedVars = OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
1535
1536 CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
1537 CI->getParent()->setName("omp_parallel");
1538 Builder.SetInsertPoint(CI);
1539
1540 // Build call __kmpc_fork_call[_if](Ident, n, microtask, var1, .., varn);
1541 Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
1542 &OutlinedFn};
1543
1544 SmallVector<Value *, 16> RealArgs;
1545 RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
1546 if (IfCondition) {
1547 Value *Cond = Builder.CreateSExtOrTrunc(IfCondition, OMPIRBuilder->Int32);
1548 RealArgs.push_back(Cond);
1549 }
1550 RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
1551
1552 // __kmpc_fork_call_if always expects a void ptr as the last argument
1553 // If there are no arguments, pass a null pointer.
1554 auto PtrTy = OMPIRBuilder->VoidPtr;
1555 if (IfCondition && NumCapturedVars == 0) {
1556 Value *NullPtrValue = Constant::getNullValue(PtrTy);
1557 RealArgs.push_back(NullPtrValue);
1558 }
1559
1560 OMPIRBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
1561
1562 LLVM_DEBUG(dbgs() << "With fork_call placed: "
1563 << *Builder.GetInsertBlock()->getParent() << "\n");
1564
1565 // Initialize the local TID stack location with the argument value.
1566 Builder.SetInsertPoint(PrivTID);
1567 Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
1568 Builder.CreateStore(Builder.CreateLoad(OMPIRBuilder->Int32, OutlinedAI),
1569 PrivTIDAddr);
1570
1571 // Remove redundant call to the outlined function.
1572 CI->eraseFromParent();
1573
1574 for (Instruction *I : ToBeDeleted) {
1575 I->eraseFromParent();
1576 }
1577}
1578
1580 const LocationDescription &Loc, InsertPointTy OuterAllocaIP,
1581 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
1582 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
1583 omp::ProcBindKind ProcBind, bool IsCancellable) {
1584 assert(!isConflictIP(Loc.IP, OuterAllocaIP) && "IPs must not be ambiguous");
1585
1586 if (!updateToLocation(Loc))
1587 return Loc.IP;
1588
1589 uint32_t SrcLocStrSize;
1590 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1591 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1592 const bool NeedThreadID = NumThreads || Config.isTargetDevice() ||
1593 (ProcBind != OMP_PROC_BIND_default);
1594 Value *ThreadID = NeedThreadID ? getOrCreateThreadID(Ident) : nullptr;
1595 // If we generate code for the target device, we need to allocate
1596 // struct for aggregate params in the device default alloca address space.
1597 // OpenMP runtime requires that the params of the extracted functions are
1598 // passed as zero address space pointers. This flag ensures that extracted
1599 // function arguments are declared in zero address space
1600 bool ArgsInZeroAddressSpace = Config.isTargetDevice();
1601
1602 // Build call __kmpc_push_num_threads(&Ident, global_tid, num_threads)
1603 // only if we compile for host side.
1604 if (NumThreads && !Config.isTargetDevice()) {
1605 Value *Args[] = {
1606 Ident, ThreadID,
1607 Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
1609 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
1610 }
1611
1612 if (ProcBind != OMP_PROC_BIND_default) {
1613 // Build call __kmpc_push_proc_bind(&Ident, global_tid, proc_bind)
1614 Value *Args[] = {
1615 Ident, ThreadID,
1616 ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
1618 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
1619 }
1620
1621 BasicBlock *InsertBB = Builder.GetInsertBlock();
1622 Function *OuterFn = InsertBB->getParent();
1623
1624 // Save the outer alloca block because the insertion iterator may get
1625 // invalidated and we still need this later.
1626 BasicBlock *OuterAllocaBlock = OuterAllocaIP.getBlock();
1627
1628 // Vector to remember instructions we used only during the modeling but which
1629 // we want to delete at the end.
1631
1632 // Change the location to the outer alloca insertion point to create and
1633 // initialize the allocas we pass into the parallel region.
1634 InsertPointTy NewOuter(OuterAllocaBlock, OuterAllocaBlock->begin());
1635 Builder.restoreIP(NewOuter);
1636 AllocaInst *TIDAddrAlloca = Builder.CreateAlloca(Int32, nullptr, "tid.addr");
1637 AllocaInst *ZeroAddrAlloca =
1638 Builder.CreateAlloca(Int32, nullptr, "zero.addr");
1639 Instruction *TIDAddr = TIDAddrAlloca;
1640 Instruction *ZeroAddr = ZeroAddrAlloca;
1641 if (ArgsInZeroAddressSpace && M.getDataLayout().getAllocaAddrSpace() != 0) {
1642 // Add additional casts to enforce pointers in zero address space
1643 TIDAddr = new AddrSpaceCastInst(
1644 TIDAddrAlloca, PointerType ::get(M.getContext(), 0), "tid.addr.ascast");
1645 TIDAddr->insertAfter(TIDAddrAlloca->getIterator());
1646 ToBeDeleted.push_back(TIDAddr);
1647 ZeroAddr = new AddrSpaceCastInst(ZeroAddrAlloca,
1648 PointerType ::get(M.getContext(), 0),
1649 "zero.addr.ascast");
1650 ZeroAddr->insertAfter(ZeroAddrAlloca->getIterator());
1651 ToBeDeleted.push_back(ZeroAddr);
1652 }
1653
1654 // We only need TIDAddr and ZeroAddr for modeling purposes to get the
1655 // associated arguments in the outlined function, so we delete them later.
1656 ToBeDeleted.push_back(TIDAddrAlloca);
1657 ToBeDeleted.push_back(ZeroAddrAlloca);
1658
1659 // Create an artificial insertion point that will also ensure the blocks we
1660 // are about to split are not degenerated.
1661 auto *UI = new UnreachableInst(Builder.getContext(), InsertBB);
1662
1663 BasicBlock *EntryBB = UI->getParent();
1664 BasicBlock *PRegEntryBB = EntryBB->splitBasicBlock(UI, "omp.par.entry");
1665 BasicBlock *PRegBodyBB = PRegEntryBB->splitBasicBlock(UI, "omp.par.region");
1666 BasicBlock *PRegPreFiniBB =
1667 PRegBodyBB->splitBasicBlock(UI, "omp.par.pre_finalize");
1668 BasicBlock *PRegExitBB = PRegPreFiniBB->splitBasicBlock(UI, "omp.par.exit");
1669
1670 auto FiniCBWrapper = [&](InsertPointTy IP) {
1671 // Hide "open-ended" blocks from the given FiniCB by setting the right jump
1672 // target to the region exit block.
1673 if (IP.getBlock()->end() == IP.getPoint()) {
1675 Builder.restoreIP(IP);
1676 Instruction *I = Builder.CreateBr(PRegExitBB);
1677 IP = InsertPointTy(I->getParent(), I->getIterator());
1678 }
1679 assert(IP.getBlock()->getTerminator()->getNumSuccessors() == 1 &&
1680 IP.getBlock()->getTerminator()->getSuccessor(0) == PRegExitBB &&
1681 "Unexpected insertion point for finalization call!");
1682 return FiniCB(IP);
1683 };
1684
1685 FinalizationStack.push_back({FiniCBWrapper, OMPD_parallel, IsCancellable});
1686
1687 // Generate the privatization allocas in the block that will become the entry
1688 // of the outlined function.
1689 Builder.SetInsertPoint(PRegEntryBB->getTerminator());
1690 InsertPointTy InnerAllocaIP = Builder.saveIP();
1691
1692 AllocaInst *PrivTIDAddr =
1693 Builder.CreateAlloca(Int32, nullptr, "tid.addr.local");
1694 Instruction *PrivTID = Builder.CreateLoad(Int32, PrivTIDAddr, "tid");
1695
1696 // Add some fake uses for OpenMP provided arguments.
1697 ToBeDeleted.push_back(Builder.CreateLoad(Int32, TIDAddr, "tid.addr.use"));
1698 Instruction *ZeroAddrUse =
1699 Builder.CreateLoad(Int32, ZeroAddr, "zero.addr.use");
1700 ToBeDeleted.push_back(ZeroAddrUse);
1701
1702 // EntryBB
1703 // |
1704 // V
1705 // PRegionEntryBB <- Privatization allocas are placed here.
1706 // |
1707 // V
1708 // PRegionBodyBB <- BodeGen is invoked here.
1709 // |
1710 // V
1711 // PRegPreFiniBB <- The block we will start finalization from.
1712 // |
1713 // V
1714 // PRegionExitBB <- A common exit to simplify block collection.
1715 //
1716
1717 LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
1718
1719 // Let the caller create the body.
1720 assert(BodyGenCB && "Expected body generation callback!");
1721 InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
1722 if (Error Err = BodyGenCB(InnerAllocaIP, CodeGenIP))
1723 return Err;
1724
1725 LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
1726
1727 OutlineInfo OI;
1728 if (Config.isTargetDevice()) {
1729 // Generate OpenMP target specific runtime call
1730 OI.PostOutlineCB = [=, ToBeDeletedVec =
1731 std::move(ToBeDeleted)](Function &OutlinedFn) {
1732 targetParallelCallback(this, OutlinedFn, OuterFn, OuterAllocaBlock, Ident,
1733 IfCondition, NumThreads, PrivTID, PrivTIDAddr,
1734 ThreadID, ToBeDeletedVec);
1735 };
1736 OI.FixUpNonEntryAllocas = true;
1737 } else {
1738 // Generate OpenMP host runtime call
1739 OI.PostOutlineCB = [=, ToBeDeletedVec =
1740 std::move(ToBeDeleted)](Function &OutlinedFn) {
1741 hostParallelCallback(this, OutlinedFn, OuterFn, Ident, IfCondition,
1742 PrivTID, PrivTIDAddr, ToBeDeletedVec);
1743 };
1744 OI.FixUpNonEntryAllocas = true;
1745 }
1746
1747 OI.OuterAllocaBB = OuterAllocaBlock;
1748 OI.EntryBB = PRegEntryBB;
1749 OI.ExitBB = PRegExitBB;
1750
1751 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
1753 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
1754
1755 CodeExtractorAnalysisCache CEAC(*OuterFn);
1756 CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
1757 /* AggregateArgs */ false,
1758 /* BlockFrequencyInfo */ nullptr,
1759 /* BranchProbabilityInfo */ nullptr,
1760 /* AssumptionCache */ nullptr,
1761 /* AllowVarArgs */ true,
1762 /* AllowAlloca */ true,
1763 /* AllocationBlock */ OuterAllocaBlock,
1764 /* Suffix */ ".omp_par", ArgsInZeroAddressSpace);
1765
1766 // Find inputs to, outputs from the code region.
1767 BasicBlock *CommonExit = nullptr;
1768 SetVector<Value *> Inputs, Outputs, SinkingCands, HoistingCands;
1769 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
1770
1771 Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands,
1772 /*CollectGlobalInputs=*/true);
1773
1774 Inputs.remove_if([&](Value *I) {
1776 return GV->getValueType() == OpenMPIRBuilder::Ident;
1777
1778 return false;
1779 });
1780
1781 LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
1782
1783 FunctionCallee TIDRTLFn =
1784 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
1785
1786 auto PrivHelper = [&](Value &V) -> Error {
1787 if (&V == TIDAddr || &V == ZeroAddr) {
1789 return Error::success();
1790 }
1791
1793 for (Use &U : V.uses())
1794 if (auto *UserI = dyn_cast<Instruction>(U.getUser()))
1795 if (ParallelRegionBlockSet.count(UserI->getParent()))
1796 Uses.insert(&U);
1797
1798 // __kmpc_fork_call expects extra arguments as pointers. If the input
1799 // already has a pointer type, everything is fine. Otherwise, store the
1800 // value onto stack and load it back inside the to-be-outlined region. This
1801 // will ensure only the pointer will be passed to the function.
1802 // FIXME: if there are more than 15 trailing arguments, they must be
1803 // additionally packed in a struct.
1804 Value *Inner = &V;
1805 if (!V.getType()->isPointerTy()) {
1807 LLVM_DEBUG(llvm::dbgs() << "Forwarding input as pointer: " << V << "\n");
1808
1809 Builder.restoreIP(OuterAllocaIP);
1810 Value *Ptr =
1811 Builder.CreateAlloca(V.getType(), nullptr, V.getName() + ".reloaded");
1812
1813 // Store to stack at end of the block that currently branches to the entry
1814 // block of the to-be-outlined region.
1815 Builder.SetInsertPoint(InsertBB,
1816 InsertBB->getTerminator()->getIterator());
1817 Builder.CreateStore(&V, Ptr);
1818
1819 // Load back next to allocations in the to-be-outlined region.
1820 Builder.restoreIP(InnerAllocaIP);
1821 Inner = Builder.CreateLoad(V.getType(), Ptr);
1822 }
1823
1824 Value *ReplacementValue = nullptr;
1825 CallInst *CI = dyn_cast<CallInst>(&V);
1826 if (CI && CI->getCalledFunction() == TIDRTLFn.getCallee()) {
1827 ReplacementValue = PrivTID;
1828 } else {
1829 InsertPointOrErrorTy AfterIP =
1830 PrivCB(InnerAllocaIP, Builder.saveIP(), V, *Inner, ReplacementValue);
1831 if (!AfterIP)
1832 return AfterIP.takeError();
1833 Builder.restoreIP(*AfterIP);
1834 InnerAllocaIP = {
1835 InnerAllocaIP.getBlock(),
1836 InnerAllocaIP.getBlock()->getTerminator()->getIterator()};
1837
1838 assert(ReplacementValue &&
1839 "Expected copy/create callback to set replacement value!");
1840 if (ReplacementValue == &V)
1841 return Error::success();
1842 }
1843
1844 for (Use *UPtr : Uses)
1845 UPtr->set(ReplacementValue);
1846
1847 return Error::success();
1848 };
1849
1850 // Reset the inner alloca insertion as it will be used for loading the values
1851 // wrapped into pointers before passing them into the to-be-outlined region.
1852 // Configure it to insert immediately after the fake use of zero address so
1853 // that they are available in the generated body and so that the
1854 // OpenMP-related values (thread ID and zero address pointers) remain leading
1855 // in the argument list.
1856 InnerAllocaIP = IRBuilder<>::InsertPoint(
1857 ZeroAddrUse->getParent(), ZeroAddrUse->getNextNode()->getIterator());
1858
1859 // Reset the outer alloca insertion point to the entry of the relevant block
1860 // in case it was invalidated.
1861 OuterAllocaIP = IRBuilder<>::InsertPoint(
1862 OuterAllocaBlock, OuterAllocaBlock->getFirstInsertionPt());
1863
1864 for (Value *Input : Inputs) {
1865 LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
1866 if (Error Err = PrivHelper(*Input))
1867 return Err;
1868 }
1869 LLVM_DEBUG({
1870 for (Value *Output : Outputs)
1871 LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
1872 });
1873 assert(Outputs.empty() &&
1874 "OpenMP outlining should not produce live-out values!");
1875
1876 LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
1877 LLVM_DEBUG({
1878 for (auto *BB : Blocks)
1879 dbgs() << " PBR: " << BB->getName() << "\n";
1880 });
1881
1882 // Adjust the finalization stack, verify the adjustment, and call the
1883 // finalize function a last time to finalize values between the pre-fini
1884 // block and the exit block if we left the parallel "the normal way".
1885 auto FiniInfo = FinalizationStack.pop_back_val();
1886 (void)FiniInfo;
1887 assert(FiniInfo.DK == OMPD_parallel &&
1888 "Unexpected finalization stack state!");
1889
1890 Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
1891
1892 InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
1893 Expected<BasicBlock *> FiniBBOrErr = FiniInfo.getFiniBB(Builder);
1894 if (!FiniBBOrErr)
1895 return FiniBBOrErr.takeError();
1896 {
1898 Builder.restoreIP(PreFiniIP);
1899 Builder.CreateBr(*FiniBBOrErr);
1900 // There's currently a branch to omp.par.exit. Delete it. We will get there
1901 // via the fini block
1902 if (Instruction *Term = Builder.GetInsertBlock()->getTerminator())
1903 Term->eraseFromParent();
1904 }
1905
1906 // Register the outlined info.
1907 addOutlineInfo(std::move(OI));
1908
1909 InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
1910 UI->eraseFromParent();
1911
1912 return AfterIP;
1913}
1914
1916 // Build call void __kmpc_flush(ident_t *loc)
1917 uint32_t SrcLocStrSize;
1918 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1919 Value *Args[] = {getOrCreateIdent(SrcLocStr, SrcLocStrSize)};
1920
1922 Args);
1923}
1924
1926 if (!updateToLocation(Loc))
1927 return;
1928 emitFlush(Loc);
1929}
1930
1932 // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
1933 // global_tid);
1934 uint32_t SrcLocStrSize;
1935 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1936 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1937 Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
1938
1939 // Ignore return result until untied tasks are supported.
1941 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait), Args);
1942}
1943
1949
1951 // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
1952 uint32_t SrcLocStrSize;
1953 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
1954 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
1955 Constant *I32Null = ConstantInt::getNullValue(Int32);
1956 Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
1957
1959 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield), Args);
1960}
1961
1967
1968// Processes the dependencies in Dependencies and does the following
1969// - Allocates space on the stack of an array of DependInfo objects
1970// - Populates each DependInfo object with relevant information of
1971// the corresponding dependence.
1972// - All code is inserted in the entry block of the current function.
1974 OpenMPIRBuilder &OMPBuilder,
1976 // Early return if we have no dependencies to process
1977 if (Dependencies.empty())
1978 return nullptr;
1979
1980 // Given a vector of DependData objects, in this function we create an
1981 // array on the stack that holds kmp_dep_info objects corresponding
1982 // to each dependency. This is then passed to the OpenMP runtime.
1983 // For example, if there are 'n' dependencies then the following psedo
1984 // code is generated. Assume the first dependence is on a variable 'a'
1985 //
1986 // \code{c}
1987 // DepArray = alloc(n x sizeof(kmp_depend_info);
1988 // idx = 0;
1989 // DepArray[idx].base_addr = ptrtoint(&a);
1990 // DepArray[idx].len = 8;
1991 // DepArray[idx].flags = Dep.DepKind; /*(See OMPContants.h for DepKind)*/
1992 // ++idx;
1993 // DepArray[idx].base_addr = ...;
1994 // \endcode
1995
1996 IRBuilderBase &Builder = OMPBuilder.Builder;
1997 Type *DependInfo = OMPBuilder.DependInfo;
1998 Module &M = OMPBuilder.M;
1999
2000 Value *DepArray = nullptr;
2001 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
2002 Builder.SetInsertPoint(
2004
2005 Type *DepArrayTy = ArrayType::get(DependInfo, Dependencies.size());
2006 DepArray = Builder.CreateAlloca(DepArrayTy, nullptr, ".dep.arr.addr");
2007
2008 Builder.restoreIP(OldIP);
2009
2010 for (const auto &[DepIdx, Dep] : enumerate(Dependencies)) {
2011 Value *Base =
2012 Builder.CreateConstInBoundsGEP2_64(DepArrayTy, DepArray, 0, DepIdx);
2013 // Store the pointer to the variable
2014 Value *Addr = Builder.CreateStructGEP(
2015 DependInfo, Base,
2016 static_cast<unsigned int>(RTLDependInfoFields::BaseAddr));
2017 Value *DepValPtr = Builder.CreatePtrToInt(Dep.DepVal, Builder.getInt64Ty());
2018 Builder.CreateStore(DepValPtr, Addr);
2019 // Store the size of the variable
2020 Value *Size = Builder.CreateStructGEP(
2021 DependInfo, Base, static_cast<unsigned int>(RTLDependInfoFields::Len));
2022 Builder.CreateStore(
2023 Builder.getInt64(M.getDataLayout().getTypeStoreSize(Dep.DepValueType)),
2024 Size);
2025 // Store the dependency kind
2026 Value *Flags = Builder.CreateStructGEP(
2027 DependInfo, Base,
2028 static_cast<unsigned int>(RTLDependInfoFields::Flags));
2029 Builder.CreateStore(
2030 ConstantInt::get(Builder.getInt8Ty(),
2031 static_cast<unsigned int>(Dep.DepKind)),
2032 Flags);
2033 }
2034 return DepArray;
2035}
2036
/// Create the task duplication function passed to kmpc_taskloop.
///
/// The generated function matches the runtime's p_task_dup_t signature:
///   void omp_taskloop_dup(kmp_task_t *dest, kmp_task_t *src,
///                         kmp_int32 lastprivate_flag)
/// Its body computes pointers to the context field inside the privates area
/// of both the destination and source tasks and hands them to \p DupCB,
/// which emits the actual per-task copy/constructor code.
///
/// \param PrivatesTy    Struct type of the privates area that trails the
///                      kmp_task_t header inside each task descriptor.
/// \param PrivatesIndex Field index of the context pointer within
///                      \p PrivatesTy.
/// \param DupCB         Callback emitting the duplication body; when null,
///                      no function is generated (a null function pointer is
///                      passed to the runtime instead).
Expected<Value *> OpenMPIRBuilder::createTaskDuplicationFunction(
    Type *PrivatesTy, int32_t PrivatesIndex, TaskDupCallbackTy DupCB) {
  unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
  if (!DupCB)
        PointerType::get(Builder.getContext(), ProgramAddressSpace));

  // From OpenMP Runtime p_task_dup_t:
  // Routine optionally generated by the compiler for setting the lastprivate
  // flag and calling needed constructors for private/firstprivate objects (used
  // to form taskloop tasks from pattern task) Parameters: dest task, src task,
  // lastprivate flag.
  // typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);

  auto *VoidPtrTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);

  FunctionType *DupFuncTy = FunctionType::get(
      Builder.getVoidTy(), {VoidPtrTy, VoidPtrTy, Builder.getInt32Ty()},
      /*isVarArg=*/false);

  Function *DupFunction = Function::Create(DupFuncTy, Function::InternalLinkage,
                                           "omp_taskloop_dup", M);
  Value *DestTaskArg = DupFunction->getArg(0);
  Value *SrcTaskArg = DupFunction->getArg(1);
  Value *LastprivateFlagArg = DupFunction->getArg(2);
  DestTaskArg->setName("dest_task");
  SrcTaskArg->setName("src_task");
  LastprivateFlagArg->setName("lastprivate_flag");

  // Restore the caller's insertion point automatically when we return; all
  // code below is emitted into the new function's entry block.
  IRBuilderBase::InsertPointGuard Guard(Builder);
  Builder.SetInsertPoint(
      BasicBlock::Create(Builder.getContext(), "entry", DupFunction));

  // GEP from a raw kmp_task_t* argument to the context field inside the
  // task's privates area: first skip the kmp_task_t header (field 1 of
  // {Task, PrivatesTy}), then index PrivatesIndex within the privates.
  auto GetTaskContextPtrFromArg = [&](Value *Arg) -> Value * {
    Type *TaskWithPrivatesTy =
        StructType::get(Builder.getContext(), {Task, PrivatesTy});
    Value *TaskPrivates = Builder.CreateGEP(
        TaskWithPrivatesTy, Arg, {Builder.getInt32(0), Builder.getInt32(1)});
    Value *ContextPtr = Builder.CreateGEP(
        PrivatesTy, TaskPrivates,
        {Builder.getInt32(0), Builder.getInt32(PrivatesIndex)});
    return ContextPtr;
  };

  Value *DestTaskContextPtr = GetTaskContextPtrFromArg(DestTaskArg);
  Value *SrcTaskContextPtr = GetTaskContextPtrFromArg(SrcTaskArg);

  DestTaskContextPtr->setName("destPtr");
  SrcTaskContextPtr->setName("srcPtr");

  // Let the user callback fill in the function body; it may emit allocas at
  // AllocaIP (start of entry) and code at CodeGenIP (current position).
  InsertPointTy AllocaIP(&DupFunction->getEntryBlock(),
                         DupFunction->getEntryBlock().begin());
  InsertPointTy CodeGenIP = Builder.saveIP();
  Expected<IRBuilderBase::InsertPoint> AfterIPOrError =
      DupCB(AllocaIP, CodeGenIP, DestTaskContextPtr, SrcTaskContextPtr);
  if (!AfterIPOrError)
    return AfterIPOrError.takeError();
  Builder.restoreIP(*AfterIPOrError);

  Builder.CreateRetVoid();

  return DupFunction;
}
2101
/// Emit an OpenMP `taskloop` construct.
///
/// The region produced by \p BodyGenCB is outlined into a task entry
/// function and dispatched via the `__kmpc_taskloop` runtime call. Loop
/// bounds/step are smuggled through the outlining as "fake" values that are
/// forced to be the leading fields of the task's shareds struct, so the
/// runtime can chunk the iteration space; after outlining they are replaced
/// with the real (sign-extended to i64) bound values.
///
/// \param LoopInfo  Deferred accessor for the CanonicalLoopInfo of the loop
///                  built inside the region (it does not exist yet when this
///                  function is entered).
/// \param NumOfCollapseLoops  >1 when a `collapse` clause merged several
///                  loops; changes how the induction variable is rebased.
/// \param DupCB / TaskContextStructPtrVal  Must be both set or both null;
///                  used to build the task duplication function.
OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTaskloop(
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    BodyGenCallbackTy BodyGenCB,
    llvm::function_ref<llvm::Expected<llvm::CanonicalLoopInfo *>()> LoopInfo,
    Value *LBVal, Value *UBVal, Value *StepVal, bool Untied, Value *IfCond,
    Value *GrainSize, bool NoGroup, int Sched, Value *Final, bool Mergeable,
    Value *Priority, uint64_t NumOfCollapseLoops, TaskDupCallbackTy DupCB,
    Value *TaskContextStructPtrVal) {

  if (!updateToLocation(Loc))
    return InsertPointTy();

  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);

  // Split off, in reverse order, the blocks that will delimit the outlined
  // region: alloca -> body -> exit.
  BasicBlock *TaskloopExitBB =
      splitBB(Builder, /*CreateBranch=*/true, "taskloop.exit");
  BasicBlock *TaskloopBodyBB =
      splitBB(Builder, /*CreateBranch=*/true, "taskloop.body");
  BasicBlock *TaskloopAllocaBB =
      splitBB(Builder, /*CreateBranch=*/true, "taskloop.alloca");

  InsertPointTy TaskloopAllocaIP =
      InsertPointTy(TaskloopAllocaBB, TaskloopAllocaBB->begin());
  InsertPointTy TaskloopBodyIP =
      InsertPointTy(TaskloopBodyBB, TaskloopBodyBB->begin());

  if (Error Err = BodyGenCB(TaskloopAllocaIP, TaskloopBodyIP))
    return Err;

  // The canonical loop is created by the body callback, so it can only be
  // queried afterwards.
  llvm::Expected<llvm::CanonicalLoopInfo *> result = LoopInfo();
  if (!result) {
    return result.takeError();
  }

  llvm::CanonicalLoopInfo *CLI = result.get();
  OutlineInfo OI;
  OI.EntryBB = TaskloopAllocaBB;
  OI.OuterAllocaBB = AllocaIP.getBlock();
  OI.ExitBB = TaskloopExitBB;

  // Add the thread ID argument.
  SmallVector<Instruction *> ToBeDeleted;
  // dummy instruction to be used as a fake argument
  OI.ExcludeArgsFromAggregate.push_back(createFakeIntVal(
      Builder, AllocaIP, ToBeDeleted, TaskloopAllocaIP, "global.tid", false));
  // Fake lb/ub/step values; they are replaced with the real (casted) values
  // in the post-outline callback below.
  Value *FakeLB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
                                   TaskloopAllocaIP, "lb", false, true);
  Value *FakeUB = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
                                   TaskloopAllocaIP, "ub", false, true);
  Value *FakeStep = createFakeIntVal(Builder, AllocaIP, ToBeDeleted,
                                     TaskloopAllocaIP, "step", false, true);
  // For Taskloop, we want to force the bounds being the first 3 inputs in the
  // aggregate struct
  OI.Inputs.insert(FakeLB);
  OI.Inputs.insert(FakeUB);
  OI.Inputs.insert(FakeStep);
  if (TaskContextStructPtrVal)
    OI.Inputs.insert(TaskContextStructPtrVal);
  assert(((TaskContextStructPtrVal && DupCB) ||
          (!TaskContextStructPtrVal && !DupCB)) &&
         "Task context struct ptr and duplication callback must be both set "
         "or both null");

  // It isn't safe to run the duplication bodygen callback inside the post
  // outlining callback so this has to be run now before we know the real task
  // shareds structure type.
  unsigned ProgramAddressSpace = M.getDataLayout().getProgramAddressSpace();
  Type *PointerTy = PointerType::get(Builder.getContext(), ProgramAddressSpace);
  // Approximation of the shareds layout: lb, ub, step, then the context
  // pointer (matches the forced input order above).
  Type *FakeSharedsTy = StructType::get(
      Builder.getContext(),
      {FakeLB->getType(), FakeUB->getType(), FakeStep->getType(), PointerTy});
  Expected<Value *> TaskDupFnOrErr = createTaskDuplicationFunction(
      FakeSharedsTy,
      /*PrivatesIndex: the pointer after the three indices above*/ 3, DupCB);
  if (!TaskDupFnOrErr) {
    return TaskDupFnOrErr.takeError();
  }
  Value *TaskDupFn = *TaskDupFnOrErr;

  OI.PostOutlineCB = [this, Ident, LBVal, UBVal, StepVal, Untied,
                      TaskloopAllocaBB, CLI, Loc, TaskDupFn, ToBeDeleted,
                      IfCond, GrainSize, NoGroup, Sched, FakeLB, FakeUB,
                      FakeStep, FakeSharedsTy, Final, Mergeable, Priority,
                      NumOfCollapseLoops](Function &OutlinedFn) mutable {
    // Replace the Stale CI by appropriate RTL function call.
    assert(OutlinedFn.hasOneUse() &&
           "there must be a single user for the outlined function");
    CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());

    /* Create the casting for the Bounds Values that can be used when outlining
     * to replace the uses of the fakes with real values */
    BasicBlock *CodeReplBB = StaleCI->getParent();
    Builder.SetInsertPoint(CodeReplBB->getFirstInsertionPt());
    Value *CastedLBVal =
        Builder.CreateIntCast(LBVal, Builder.getInt64Ty(), true, "lb64");
    Value *CastedUBVal =
        Builder.CreateIntCast(UBVal, Builder.getInt64Ty(), true, "ub64");
    Value *CastedStepVal =
        Builder.CreateIntCast(StepVal, Builder.getInt64Ty(), true, "step64");

    Builder.SetInsertPoint(StaleCI);

    // Gather the arguments for emitting the runtime call for
    // @__kmpc_omp_task_alloc
    Function *TaskAllocFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);

    Value *ThreadID = getOrCreateThreadID(Ident);

    if (!NoGroup) {
      // Emit runtime call for @__kmpc_taskgroup
      Function *TaskgroupFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
      Builder.CreateCall(TaskgroupFn, {Ident, ThreadID});
    }

    // `flags` Argument Configuration
    // Task is tied if (Flags & 1) == 1.
    // Task is untied if (Flags & 1) == 0.
    // Task is final if (Flags & 2) == 2.
    // Task is not final if (Flags & 2) == 0.
    // Task is mergeable if (Flags & 4) == 4.
    // Task is not mergeable if (Flags & 4) == 0.
    // Task is priority if (Flags & 32) == 32.
    // Task is not priority if (Flags & 32) == 0.
    Value *Flags = Builder.getInt32(Untied ? 0 : 1);
    if (Final)
      Flags = Builder.CreateOr(Builder.getInt32(2), Flags);
    if (Mergeable)
      Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
    if (Priority)
      Flags = Builder.CreateOr(Builder.getInt32(32), Flags);

    // Size in bytes of the kmp_task_t descriptor (bits rounded up to bytes).
    Value *TaskSize = Builder.getInt64(
        divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));

    AllocaInst *ArgStructAlloca =
    assert(ArgStructAlloca &&
           "Unable to find the alloca instruction corresponding to arguments "
           "for extracted function");
    std::optional<TypeSize> ArgAllocSize =
        ArgStructAlloca->getAllocationSize(M.getDataLayout());
    assert(ArgAllocSize &&
           "Unable to determine size of arguments for extracted function");
    Value *SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());

    // Emit the @__kmpc_omp_task_alloc runtime call
    // The runtime call returns a pointer to an area where the task captured
    // variables must be copied before the task is run (TaskData)
    CallInst *TaskData = Builder.CreateCall(
        TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
                      /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
                      /*task_func=*/&OutlinedFn});

    // Copy the captured shareds into the task's shareds area.
    Value *Shareds = StaleCI->getArgOperand(1);
    Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
    Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
    Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
                         SharedsSize);
    // Get the pointer to loop lb, ub, step from task ptr
    // and set up the lowerbound,upperbound and step values
    llvm::Value *Lb = Builder.CreateGEP(
        FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(0)});

    llvm::Value *Ub = Builder.CreateGEP(
        FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(1)});

    llvm::Value *Step = Builder.CreateGEP(
        FakeSharedsTy, TaskShareds, {Builder.getInt32(0), Builder.getInt32(2)});
    llvm::Value *Loadstep = Builder.CreateLoad(Builder.getInt64Ty(), Step);

    // set up the arguments for emitting kmpc_taskloop runtime call
    // setting values for ifval, nogroup, sched, grainsize, task_dup
    Value *IfCondVal =
        IfCond ? Builder.CreateIntCast(IfCond, Builder.getInt32Ty(), true)
               : Builder.getInt32(1);
    // As __kmpc_taskgroup is called manually in OMPIRBuilder, NoGroupVal should
    // always be 1 when calling __kmpc_taskloop to ensure it is not called again
    Value *NoGroupVal = Builder.getInt32(1);
    Value *SchedVal = Builder.getInt32(Sched);
    Value *GrainSizeVal =
        GrainSize ? Builder.CreateIntCast(GrainSize, Builder.getInt64Ty(), true)
                  : Builder.getInt64(0);
    Value *TaskDup = TaskDupFn;

    Value *Args[] = {Ident, ThreadID, TaskData, IfCondVal, Lb, Ub,
                     Loadstep, NoGroupVal, SchedVal, GrainSizeVal, TaskDup};

    // taskloop runtime call
    Function *TaskloopFn =
        getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskloop);
    Builder.CreateCall(TaskloopFn, Args);

    // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup if
    // nogroup is not defined
    if (!NoGroup) {
      Function *EndTaskgroupFn =
          getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
      Builder.CreateCall(EndTaskgroupFn, {Ident, ThreadID});
    }

    StaleCI->eraseFromParent();

    Builder.SetInsertPoint(TaskloopAllocaBB, TaskloopAllocaBB->begin());

    // Inside the outlined function, arg 1 is the shareds pointer; load it
    // once and redirect all other uses through the load.
    LoadInst *SharedsOutlined =
        Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
    OutlinedFn.getArg(1)->replaceUsesWithIf(
        SharedsOutlined,
        [SharedsOutlined](Use &U) { return U.getUser() != SharedsOutlined; });

    Value *IV = CLI->getIndVar();
    Type *IVTy = IV->getType();
    Constant *One = ConstantInt::get(Builder.getInt64Ty(), 1);

    // When outlining, CodeExtractor will create GEP's to the LowerBound and
    // UpperBound. These GEP's can be reused for loading the tasks respective
    // bounds.
    Value *TaskLB = nullptr;
    Value *TaskUB = nullptr;
    Value *LoadTaskLB = nullptr;
    Value *LoadTaskUB = nullptr;
    // Scan the alloca block for GEPs at struct indices 0/1 (lb/ub) and the
    // loads through them.
    for (Instruction &I : *TaskloopAllocaBB) {
      if (I.getOpcode() == Instruction::GetElementPtr) {
        GetElementPtrInst &Gep = cast<GetElementPtrInst>(I);
        if (ConstantInt *CI = dyn_cast<ConstantInt>(Gep.getOperand(2))) {
          switch (CI->getZExtValue()) {
          case 0:
            TaskLB = &I;
            break;
          case 1:
            TaskUB = &I;
            break;
          }
        }
      } else if (I.getOpcode() == Instruction::Load) {
        LoadInst &Load = cast<LoadInst>(I);
        if (Load.getPointerOperand() == TaskLB) {
          assert(TaskLB != nullptr && "Expected value for TaskLB");
          LoadTaskLB = &I;
        } else if (Load.getPointerOperand() == TaskUB) {
          assert(TaskUB != nullptr && "Expected value for TaskUB");
          LoadTaskUB = &I;
        }
      }
    }

    Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());

    assert(LoadTaskLB != nullptr && "Expected value for LoadTaskLB");
    assert(LoadTaskUB != nullptr && "Expected value for LoadTaskUB");
    // trip count = (ub - lb) / step + 1. FakeStep is rewritten to the real
    // step via replaceAllUsesWith below.
    Value *TripCountMinusOne =
        Builder.CreateSDiv(Builder.CreateSub(LoadTaskUB, LoadTaskLB), FakeStep);
    Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One, "trip_cnt");
    Value *CastedTripCount = Builder.CreateIntCast(TripCount, IVTy, true);
    Value *CastedTaskLB = Builder.CreateIntCast(LoadTaskLB, IVTy, true);
    // set the trip count in the CLI
    CLI->setTripCount(CastedTripCount);

    Builder.SetInsertPoint(CLI->getBody(),
                           CLI->getBody()->getFirstInsertionPt());

    if (NumOfCollapseLoops > 1) {
      llvm::SmallVector<User *> UsersToReplace;
      // When using the collapse clause, the bounds of the loop have to be
      // adjusted to properly represent the iterator of the outer loop.
      Value *IVPlusTaskLB = Builder.CreateAdd(
          CLI->getIndVar(),
          Builder.CreateSub(CastedTaskLB, ConstantInt::get(IVTy, 1)));
      // To ensure every Use is correctly captured, we first want to record
      // which users to replace the value in, and then replace the value.
      for (auto IVUse = CLI->getIndVar()->uses().begin();
           IVUse != CLI->getIndVar()->uses().end(); IVUse++) {
        User *IVUser = IVUse->getUser();
        if (auto *Op = dyn_cast<BinaryOperator>(IVUser)) {
          if (Op->getOpcode() == Instruction::URem ||
              Op->getOpcode() == Instruction::UDiv) {
            UsersToReplace.push_back(IVUser);
          }
        }
      }
      for (User *User : UsersToReplace) {
        User->replaceUsesOfWith(CLI->getIndVar(), IVPlusTaskLB);
      }
    } else {
      // The canonical loop is generated with a fixed lower bound. We need to
      // update the index calculation code to use the task's lower bound. The
      // generated code looks like this:
      // %omp_loop.iv = phi ...
      // ...
      // %tmp = mul [type] %omp_loop.iv, step
      // %user_index = add [type] tmp, lb
      // OpenMPIRBuilder constructs canonical loops to have exactly three uses
      // of the normalised induction variable:
      // 1. This one: converting the normalised IV to the user IV
      // 2. The increment (add)
      // 3. The comparison against the trip count (icmp)
      // (1) is the only use that is a mul followed by an add so this cannot
      // match other IR.
      assert(CLI->getIndVar()->getNumUses() == 3 &&
             "Canonical loop should have exactly three uses of the ind var");
      for (User *IVUser : CLI->getIndVar()->users()) {
        if (auto *Mul = dyn_cast<BinaryOperator>(IVUser)) {
          if (Mul->getOpcode() == Instruction::Mul) {
            for (User *MulUser : Mul->users()) {
              if (auto *Add = dyn_cast<BinaryOperator>(MulUser)) {
                if (Add->getOpcode() == Instruction::Add) {
                  Add->setOperand(1, CastedTaskLB);
                }
              }
            }
          }
        }
      }
    }

    // Swap the fake placeholder bounds for the real casted values and drop
    // the placeholder instructions (in reverse creation order).
    FakeLB->replaceAllUsesWith(CastedLBVal);
    FakeUB->replaceAllUsesWith(CastedUBVal);
    FakeStep->replaceAllUsesWith(CastedStepVal);
    for (Instruction *I : llvm::reverse(ToBeDeleted)) {
      I->eraseFromParent();
    }
  };

  addOutlineInfo(std::move(OI));
  Builder.SetInsertPoint(TaskloopExitBB, TaskloopExitBB->begin());
  return Builder.saveIP();
}
2433
2436 M.getContext(), M.getDataLayout().getPointerSizeInBits());
2437 return llvm::StructType::get(IntPtrTy, IntPtrTy,
2438 llvm::Type::getInt32Ty(M.getContext()));
2439}
2440
2442 const LocationDescription &Loc, InsertPointTy AllocaIP,
2443 BodyGenCallbackTy BodyGenCB, bool Tied, Value *Final, Value *IfCondition,
2444 SmallVector<DependData> Dependencies, AffinityData Affinities,
2445 bool Mergeable, Value *EventHandle, Value *Priority) {
2446
2447 if (!updateToLocation(Loc))
2448 return InsertPointTy();
2449
2450 uint32_t SrcLocStrSize;
2451 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2452 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2453 // The current basic block is split into four basic blocks. After outlining,
2454 // they will be mapped as follows:
2455 // ```
2456 // def current_fn() {
2457 // current_basic_block:
2458 // br label %task.exit
2459 // task.exit:
2460 // ; instructions after task
2461 // }
2462 // def outlined_fn() {
2463 // task.alloca:
2464 // br label %task.body
2465 // task.body:
2466 // ret void
2467 // }
2468 // ```
2469 BasicBlock *TaskExitBB = splitBB(Builder, /*CreateBranch=*/true, "task.exit");
2470 BasicBlock *TaskBodyBB = splitBB(Builder, /*CreateBranch=*/true, "task.body");
2471 BasicBlock *TaskAllocaBB =
2472 splitBB(Builder, /*CreateBranch=*/true, "task.alloca");
2473
2474 InsertPointTy TaskAllocaIP =
2475 InsertPointTy(TaskAllocaBB, TaskAllocaBB->begin());
2476 InsertPointTy TaskBodyIP = InsertPointTy(TaskBodyBB, TaskBodyBB->begin());
2477 if (Error Err = BodyGenCB(TaskAllocaIP, TaskBodyIP))
2478 return Err;
2479
2480 OutlineInfo OI;
2481 OI.EntryBB = TaskAllocaBB;
2482 OI.OuterAllocaBB = AllocaIP.getBlock();
2483 OI.ExitBB = TaskExitBB;
2484
2485 // Add the thread ID argument.
2488 Builder, AllocaIP, ToBeDeleted, TaskAllocaIP, "global.tid", false));
2489
2490 OI.PostOutlineCB = [this, Ident, Tied, Final, IfCondition, Dependencies,
2491 Affinities, Mergeable, Priority, EventHandle,
2492 TaskAllocaBB, ToBeDeleted](Function &OutlinedFn) mutable {
2493 // Replace the Stale CI by appropriate RTL function call.
2494 assert(OutlinedFn.hasOneUse() &&
2495 "there must be a single user for the outlined function");
2496 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
2497
2498 // HasShareds is true if any variables are captured in the outlined region,
2499 // false otherwise.
2500 bool HasShareds = StaleCI->arg_size() > 1;
2501 Builder.SetInsertPoint(StaleCI);
2502
2503 // Gather the arguments for emitting the runtime call for
2504 // @__kmpc_omp_task_alloc
2505 Function *TaskAllocFn =
2506 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc);
2507
2508 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
2509 // call.
2510 Value *ThreadID = getOrCreateThreadID(Ident);
2511
2512 // Argument - `flags`
2513 // Task is tied iff (Flags & 1) == 1.
2514 // Task is untied iff (Flags & 1) == 0.
2515 // Task is final iff (Flags & 2) == 2.
2516 // Task is not final iff (Flags & 2) == 0.
2517 // Task is mergeable iff (Flags & 4) == 4.
2518 // Task is not mergeable iff (Flags & 4) == 0.
2519 // Task is priority iff (Flags & 32) == 32.
2520 // Task is not priority iff (Flags & 32) == 0.
2521 // TODO: Handle the other flags.
2522 Value *Flags = Builder.getInt32(Tied);
2523 if (Final) {
2524 Value *FinalFlag =
2525 Builder.CreateSelect(Final, Builder.getInt32(2), Builder.getInt32(0));
2526 Flags = Builder.CreateOr(FinalFlag, Flags);
2527 }
2528
2529 if (Mergeable)
2530 Flags = Builder.CreateOr(Builder.getInt32(4), Flags);
2531 if (Priority)
2532 Flags = Builder.CreateOr(Builder.getInt32(32), Flags);
2533
2534 // Argument - `sizeof_kmp_task_t` (TaskSize)
2535 // Tasksize refers to the size in bytes of kmp_task_t data structure
2536 // including private vars accessed in task.
2537 // TODO: add kmp_task_t_with_privates (privates)
2538 Value *TaskSize = Builder.getInt64(
2539 divideCeil(M.getDataLayout().getTypeSizeInBits(Task), 8));
2540
2541 // Argument - `sizeof_shareds` (SharedsSize)
2542 // SharedsSize refers to the shareds array size in the kmp_task_t data
2543 // structure.
2544 Value *SharedsSize = Builder.getInt64(0);
2545 if (HasShareds) {
2546 AllocaInst *ArgStructAlloca =
2548 assert(ArgStructAlloca &&
2549 "Unable to find the alloca instruction corresponding to arguments "
2550 "for extracted function");
2551 std::optional<TypeSize> ArgAllocSize =
2552 ArgStructAlloca->getAllocationSize(M.getDataLayout());
2553 assert(ArgAllocSize &&
2554 "Unable to determine size of arguments for extracted function");
2555 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
2556 }
2557 // Emit the @__kmpc_omp_task_alloc runtime call
2558 // The runtime call returns a pointer to an area where the task captured
2559 // variables must be copied before the task is run (TaskData)
2561 TaskAllocFn, {/*loc_ref=*/Ident, /*gtid=*/ThreadID, /*flags=*/Flags,
2562 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
2563 /*task_func=*/&OutlinedFn});
2564
2565 if (Affinities.Count && Affinities.Info) {
2567 OMPRTL___kmpc_omp_reg_task_with_affinity);
2568
2569 createRuntimeFunctionCall(RegAffFn, {Ident, ThreadID, TaskData,
2570 Affinities.Count, Affinities.Info});
2571 }
2572
2573 // Emit detach clause initialization.
2574 // evt = (typeof(evt))__kmpc_task_allow_completion_event(loc, tid,
2575 // task_descriptor);
2576 if (EventHandle) {
2578 OMPRTL___kmpc_task_allow_completion_event);
2579 llvm::Value *EventVal =
2580 createRuntimeFunctionCall(TaskDetachFn, {Ident, ThreadID, TaskData});
2581 llvm::Value *EventHandleAddr =
2582 Builder.CreatePointerBitCastOrAddrSpaceCast(EventHandle,
2583 Builder.getPtrTy(0));
2584 EventVal = Builder.CreatePtrToInt(EventVal, Builder.getInt64Ty());
2585 Builder.CreateStore(EventVal, EventHandleAddr);
2586 }
2587 // Copy the arguments for outlined function
2588 if (HasShareds) {
2589 Value *Shareds = StaleCI->getArgOperand(1);
2590 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
2591 Value *TaskShareds = Builder.CreateLoad(VoidPtr, TaskData);
2592 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
2593 SharedsSize);
2594 }
2595
2596 if (Priority) {
2597 //
2598 // The return type of "__kmpc_omp_task_alloc" is "kmp_task_t *",
2599 // we populate the priority information into the "kmp_task_t" here
2600 //
2601 // The struct "kmp_task_t" definition is available in kmp.h
2602 // kmp_task_t = { shareds, routine, part_id, data1, data2 }
2603 // data2 is used for priority
2604 //
2605 Type *Int32Ty = Builder.getInt32Ty();
2606 Constant *Zero = ConstantInt::get(Int32Ty, 0);
2607 // kmp_task_t* => { ptr }
2608 Type *TaskPtr = StructType::get(VoidPtr);
2609 Value *TaskGEP =
2610 Builder.CreateInBoundsGEP(TaskPtr, TaskData, {Zero, Zero});
2611 // kmp_task_t => { ptr, ptr, i32, ptr, ptr }
2612 Type *TaskStructType = StructType::get(
2613 VoidPtr, VoidPtr, Builder.getInt32Ty(), VoidPtr, VoidPtr);
2614 Value *PriorityData = Builder.CreateInBoundsGEP(
2615 TaskStructType, TaskGEP, {Zero, ConstantInt::get(Int32Ty, 4)});
2616 // kmp_cmplrdata_t => { ptr, ptr }
2617 Type *CmplrStructType = StructType::get(VoidPtr, VoidPtr);
2618 Value *CmplrData = Builder.CreateInBoundsGEP(CmplrStructType,
2619 PriorityData, {Zero, Zero});
2620 Builder.CreateStore(Priority, CmplrData);
2621 }
2622
2623 Value *DepArray = emitTaskDependencies(*this, Dependencies);
2624
2625 // In the presence of the `if` clause, the following IR is generated:
2626 // ...
2627 // %data = call @__kmpc_omp_task_alloc(...)
2628 // br i1 %if_condition, label %then, label %else
2629 // then:
2630 // call @__kmpc_omp_task(...)
2631 // br label %exit
2632 // else:
2633 // ;; Wait for resolution of dependencies, if any, before
2634 // ;; beginning the task
2635 // call @__kmpc_omp_wait_deps(...)
2636 // call @__kmpc_omp_task_begin_if0(...)
2637 // call @outlined_fn(...)
2638 // call @__kmpc_omp_task_complete_if0(...)
2639 // br label %exit
2640 // exit:
2641 // ...
2642 if (IfCondition) {
2643 // `SplitBlockAndInsertIfThenElse` requires the block to have a
2644 // terminator.
2645 splitBB(Builder, /*CreateBranch=*/true, "if.end");
2646 Instruction *IfTerminator =
2647 Builder.GetInsertPoint()->getParent()->getTerminator();
2648 Instruction *ThenTI = IfTerminator, *ElseTI = nullptr;
2649 Builder.SetInsertPoint(IfTerminator);
2650 SplitBlockAndInsertIfThenElse(IfCondition, IfTerminator, &ThenTI,
2651 &ElseTI);
2652 Builder.SetInsertPoint(ElseTI);
2653
2654 if (Dependencies.size()) {
2655 Function *TaskWaitFn =
2656 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
2658 TaskWaitFn,
2659 {Ident, ThreadID, Builder.getInt32(Dependencies.size()), DepArray,
2660 ConstantInt::get(Builder.getInt32Ty(), 0),
2662 }
2663 Function *TaskBeginFn =
2664 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
2665 Function *TaskCompleteFn =
2666 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
2667 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
2668 CallInst *CI = nullptr;
2669 if (HasShareds)
2670 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID, TaskData});
2671 else
2672 CI = createRuntimeFunctionCall(&OutlinedFn, {ThreadID});
2673 CI->setDebugLoc(StaleCI->getDebugLoc());
2674 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
2675 Builder.SetInsertPoint(ThenTI);
2676 }
2677
2678 if (Dependencies.size()) {
2679 Function *TaskFn =
2680 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
2682 TaskFn,
2683 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
2684 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
2686
2687 } else {
2688 // Emit the @__kmpc_omp_task runtime call to spawn the task
2689 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
2690 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
2691 }
2692
2693 StaleCI->eraseFromParent();
2694
2695 Builder.SetInsertPoint(TaskAllocaBB, TaskAllocaBB->begin());
2696 if (HasShareds) {
2697 LoadInst *Shareds = Builder.CreateLoad(VoidPtr, OutlinedFn.getArg(1));
2698 OutlinedFn.getArg(1)->replaceUsesWithIf(
2699 Shareds, [Shareds](Use &U) { return U.getUser() != Shareds; });
2700 }
2701
2702 for (Instruction *I : llvm::reverse(ToBeDeleted))
2703 I->eraseFromParent();
2704 };
2705
2706 addOutlineInfo(std::move(OI));
2707 Builder.SetInsertPoint(TaskExitBB, TaskExitBB->begin());
2708
2709 return Builder.saveIP();
2710}
2711
2714 InsertPointTy AllocaIP,
2715 BodyGenCallbackTy BodyGenCB) {
2716 if (!updateToLocation(Loc))
2717 return InsertPointTy();
2718
2719 uint32_t SrcLocStrSize;
2720 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
2721 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
2722 Value *ThreadID = getOrCreateThreadID(Ident);
2723
2724 // Emit the @__kmpc_taskgroup runtime call to start the taskgroup
2725 Function *TaskgroupFn =
2726 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_taskgroup);
2727 createRuntimeFunctionCall(TaskgroupFn, {Ident, ThreadID});
2728
2729 BasicBlock *TaskgroupExitBB = splitBB(Builder, true, "taskgroup.exit");
2730 if (Error Err = BodyGenCB(AllocaIP, Builder.saveIP()))
2731 return Err;
2732
2733 Builder.SetInsertPoint(TaskgroupExitBB);
2734 // Emit the @__kmpc_end_taskgroup runtime call to end the taskgroup
2735 Function *EndTaskgroupFn =
2736 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_taskgroup);
2737 createRuntimeFunctionCall(EndTaskgroupFn, {Ident, ThreadID});
2738
2739 return Builder.saveIP();
2740}
2741
2743 const LocationDescription &Loc, InsertPointTy AllocaIP,
2745 FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait) {
2746 assert(!isConflictIP(AllocaIP, Loc.IP) && "Dedicated IP allocas required");
2747
2748 if (!updateToLocation(Loc))
2749 return Loc.IP;
2750
2751 FinalizationStack.push_back({FiniCB, OMPD_sections, IsCancellable});
2752
2753 // Each section is emitted as a switch case
2754 // Each finalization callback is handled from clang.EmitOMPSectionDirective()
2755 // -> OMP.createSection() which generates the IR for each section
2756 // Iterate through all sections and emit a switch construct:
2757 // switch (IV) {
2758 // case 0:
2759 // <SectionStmt[0]>;
2760 // break;
2761 // ...
2762 // case <NumSection> - 1:
2763 // <SectionStmt[<NumSection> - 1]>;
2764 // break;
2765 // }
2766 // ...
2767 // section_loop.after:
2768 // <FiniCB>;
2769 auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IndVar) -> Error {
2770 Builder.restoreIP(CodeGenIP);
2772 splitBBWithSuffix(Builder, /*CreateBranch=*/false, ".sections.after");
2773 Function *CurFn = Continue->getParent();
2774 SwitchInst *SwitchStmt = Builder.CreateSwitch(IndVar, Continue);
2775
2776 unsigned CaseNumber = 0;
2777 for (auto SectionCB : SectionCBs) {
2779 M.getContext(), "omp_section_loop.body.case", CurFn, Continue);
2780 SwitchStmt->addCase(Builder.getInt32(CaseNumber), CaseBB);
2781 Builder.SetInsertPoint(CaseBB);
2782 UncondBrInst *CaseEndBr = Builder.CreateBr(Continue);
2783 if (Error Err = SectionCB(InsertPointTy(), {CaseEndBr->getParent(),
2784 CaseEndBr->getIterator()}))
2785 return Err;
2786 CaseNumber++;
2787 }
2788 // remove the existing terminator from body BB since there can be no
2789 // terminators after switch/case
2790 return Error::success();
2791 };
2792 // Loop body ends here
2793 // LowerBound, UpperBound, and STride for createCanonicalLoop
2794 Type *I32Ty = Type::getInt32Ty(M.getContext());
2795 Value *LB = ConstantInt::get(I32Ty, 0);
2796 Value *UB = ConstantInt::get(I32Ty, SectionCBs.size());
2797 Value *ST = ConstantInt::get(I32Ty, 1);
2799 Loc, LoopBodyGenCB, LB, UB, ST, true, false, AllocaIP, "section_loop");
2800 if (!LoopInfo)
2801 return LoopInfo.takeError();
2802
2803 InsertPointOrErrorTy WsloopIP =
2804 applyStaticWorkshareLoop(Loc.DL, *LoopInfo, AllocaIP,
2805 WorksharingLoopType::ForStaticLoop, !IsNowait);
2806 if (!WsloopIP)
2807 return WsloopIP.takeError();
2808 InsertPointTy AfterIP = *WsloopIP;
2809
2810 BasicBlock *LoopFini = AfterIP.getBlock()->getSinglePredecessor();
2811 assert(LoopFini && "Bad structure of static workshare loop finalization");
2812
2813 // Apply the finalization callback in LoopAfterBB
2814 auto FiniInfo = FinalizationStack.pop_back_val();
2815 assert(FiniInfo.DK == OMPD_sections &&
2816 "Unexpected finalization stack state!");
2817 if (Error Err = FiniInfo.mergeFiniBB(Builder, LoopFini))
2818 return Err;
2819
2820 return AfterIP;
2821}
2822
2825 BodyGenCallbackTy BodyGenCB,
2826 FinalizeCallbackTy FiniCB) {
2827 if (!updateToLocation(Loc))
2828 return Loc.IP;
2829
2830 auto FiniCBWrapper = [&](InsertPointTy IP) {
2831 if (IP.getBlock()->end() != IP.getPoint())
2832 return FiniCB(IP);
2833 // This must be done otherwise any nested constructs using FinalizeOMPRegion
2834 // will fail because that function requires the Finalization Basic Block to
2835 // have a terminator, which is already removed by EmitOMPRegionBody.
2836 // IP is currently at cancelation block.
2837 // We need to backtrack to the condition block to fetch
2838 // the exit block and create a branch from cancelation
2839 // to exit block.
2841 Builder.restoreIP(IP);
2842 auto *CaseBB = Loc.IP.getBlock();
2843 auto *CondBB = CaseBB->getSinglePredecessor()->getSinglePredecessor();
2844 auto *ExitBB = CondBB->getTerminator()->getSuccessor(1);
2845 Instruction *I = Builder.CreateBr(ExitBB);
2846 IP = InsertPointTy(I->getParent(), I->getIterator());
2847 return FiniCB(IP);
2848 };
2849
2850 Directive OMPD = Directive::OMPD_sections;
2851 // Since we are using Finalization Callback here, HasFinalize
2852 // and IsCancellable have to be true
2853 return EmitOMPInlinedRegion(OMPD, nullptr, nullptr, BodyGenCB, FiniCBWrapper,
2854 /*Conditional*/ false, /*hasFinalize*/ true,
2855 /*IsCancellable*/ true);
2856}
2857
2863
2864Value *OpenMPIRBuilder::getGPUThreadID() {
2867 OMPRTL___kmpc_get_hardware_thread_id_in_block),
2868 {});
2869}
2870
2871Value *OpenMPIRBuilder::getGPUWarpSize() {
2873 getOrCreateRuntimeFunction(M, OMPRTL___kmpc_get_warp_size), {});
2874}
2875
2876Value *OpenMPIRBuilder::getNVPTXWarpID() {
2877 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2878 return Builder.CreateAShr(getGPUThreadID(), LaneIDBits, "nvptx_warp_id");
2879}
2880
2881Value *OpenMPIRBuilder::getNVPTXLaneID() {
2882 unsigned LaneIDBits = Log2_32(Config.getGridValue().GV_Warp_Size);
2883 assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
2884 unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
2885 return Builder.CreateAnd(getGPUThreadID(), Builder.getInt32(LaneIDMask),
2886 "nvptx_lane_id");
2887}
2888
2889Value *OpenMPIRBuilder::castValueToType(InsertPointTy AllocaIP, Value *From,
2890 Type *ToType) {
2891 Type *FromType = From->getType();
2892 uint64_t FromSize = M.getDataLayout().getTypeStoreSize(FromType);
2893 uint64_t ToSize = M.getDataLayout().getTypeStoreSize(ToType);
2894 assert(FromSize > 0 && "From size must be greater than zero");
2895 assert(ToSize > 0 && "To size must be greater than zero");
2896 if (FromType == ToType)
2897 return From;
2898 if (FromSize == ToSize)
2899 return Builder.CreateBitCast(From, ToType);
2900 if (ToType->isIntegerTy() && FromType->isIntegerTy())
2901 return Builder.CreateIntCast(From, ToType, /*isSigned*/ true);
2902 InsertPointTy SaveIP = Builder.saveIP();
2903 Builder.restoreIP(AllocaIP);
2904 Value *CastItem = Builder.CreateAlloca(ToType);
2905 Builder.restoreIP(SaveIP);
2906
2907 Value *ValCastItem = Builder.CreatePointerBitCastOrAddrSpaceCast(
2908 CastItem, Builder.getPtrTy(0));
2909 Builder.CreateStore(From, ValCastItem);
2910 return Builder.CreateLoad(ToType, CastItem);
2911}
2912
2913Value *OpenMPIRBuilder::createRuntimeShuffleFunction(InsertPointTy AllocaIP,
2914 Value *Element,
2915 Type *ElementType,
2916 Value *Offset) {
2917 uint64_t Size = M.getDataLayout().getTypeStoreSize(ElementType);
2918 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction");
2919
2920 // Cast all types to 32- or 64-bit values before calling shuffle routines.
2921 Type *CastTy = Builder.getIntNTy(Size <= 4 ? 32 : 64);
2922 Value *ElemCast = castValueToType(AllocaIP, Element, CastTy);
2923 Value *WarpSize =
2924 Builder.CreateIntCast(getGPUWarpSize(), Builder.getInt16Ty(), true);
2926 Size <= 4 ? RuntimeFunction::OMPRTL___kmpc_shuffle_int32
2927 : RuntimeFunction::OMPRTL___kmpc_shuffle_int64);
2928 Value *WarpSizeCast =
2929 Builder.CreateIntCast(WarpSize, Builder.getInt16Ty(), /*isSigned=*/true);
2930 Value *ShuffleCall =
2931 createRuntimeFunctionCall(ShuffleFunc, {ElemCast, Offset, WarpSizeCast});
2932 return castValueToType(AllocaIP, ShuffleCall, CastTy);
2933}
2934
/// Copy the element at \p SrcAddr to \p DstAddr, routing the data through the
/// GPU shuffle runtime (\c createRuntimeShuffleFunction) so that the value
/// stored at the destination comes from the lane \p Offset away.
///
/// Elements wider than 8 bytes are moved piecewise in 8/4/2/1-byte chunks;
/// multi-chunk moves of a given width are emitted as an IR loop with PHIs
/// advancing the source and destination pointers.
///
/// NOTE(review): the \p ReductionArrayTy and \p IsByRefElem parameters are
/// not referenced in this body — confirm against callers whether they are
/// still needed.
void OpenMPIRBuilder::shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr,
                                      Value *DstAddr, Type *ElemType,
                                      Value *Offset, Type *ReductionArrayTy,
                                      bool IsByRefElem) {
  uint64_t Size = M.getDataLayout().getTypeStoreSize(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Value *ElemPtr = DstAddr;
  Value *Ptr = SrcAddr;
  // Try chunk widths 8, 4, 2, 1; each pass copies as many full chunks of
  // that width as remain.
  for (unsigned IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Nothing of this width left to copy.
    if (Size < IntSize)
      continue;
    Type *IntType = Builder.getIntNTy(IntSize * 8);
    Ptr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        Ptr, Builder.getPtrTy(0), Ptr->getName() + ".ascast");
    // One-past-the-end pointer of the source element, used as the loop bound.
    Value *SrcAddrGEP =
        Builder.CreateGEP(ElemType, SrcAddr, {ConstantInt::get(IndexTy, 1)});
    ElemPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        ElemPtr, Builder.getPtrTy(0), ElemPtr->getName() + ".ascast");

    Function *CurFunc = Builder.GetInsertBlock()->getParent();
    if ((Size / IntSize) > 1) {
      // Multiple chunks of this width: emit a pre-condition/body/exit loop
      // with PHIs carrying the advancing source and destination pointers.
      Value *PtrEnd = Builder.CreatePointerBitCastOrAddrSpaceCast(
          SrcAddrGEP, Builder.getPtrTy());
      BasicBlock *PreCondBB =
          BasicBlock::Create(M.getContext(), ".shuffle.pre_cond");
      BasicBlock *ThenBB = BasicBlock::Create(M.getContext(), ".shuffle.then");
      BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), ".shuffle.exit");
      BasicBlock *CurrentBB = Builder.GetInsertBlock();
      emitBlock(PreCondBB, CurFunc);
      PHINode *PhiSrc =
          Builder.CreatePHI(Ptr->getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr, CurrentBB);
      PHINode *PhiDest =
          Builder.CreatePHI(ElemPtr->getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr, CurrentBB);
      Ptr = PhiSrc;
      ElemPtr = PhiDest;
      // Continue while at least one full chunk remains before PtrEnd.
      Value *PtrDiff = Builder.CreatePtrDiff(
          Builder.getInt8Ty(), PtrEnd,
          Builder.CreatePointerBitCastOrAddrSpaceCast(Ptr, Builder.getPtrTy()));
      Builder.CreateCondBr(
          Builder.CreateICmpSGT(PtrDiff, Builder.getInt64(IntSize - 1)), ThenBB,
          ExitBB);
      emitBlock(ThenBB, CurFunc);
      // Shuffle one chunk and store it to the destination.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP,
          Builder.CreateAlignedLoad(
              IntType, Ptr, M.getDataLayout().getPrefTypeAlign(ElemType)),
          IntType, Offset);
      Builder.CreateAlignedStore(Res, ElemPtr,
                                 M.getDataLayout().getPrefTypeAlign(ElemType));
      // Advance both pointers by one chunk and loop.
      Value *LocalPtr =
          Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      Value *LocalElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
      PhiSrc->addIncoming(LocalPtr, ThenBB);
      PhiDest->addIncoming(LocalElemPtr, ThenBB);
      emitBranch(PreCondBB);
      emitBlock(ExitBB, CurFunc);
    } else {
      // Exactly one chunk of this width: shuffle and store it straight-line.
      Value *Res = createRuntimeShuffleFunction(
          AllocaIP, Builder.CreateLoad(IntType, Ptr), IntType, Offset);
      // The shuffle routine returns a 32/64-bit integer; narrow it back to
      // the element width if needed.
      if (ElemType->isIntegerTy() && ElemType->getScalarSizeInBits() <
                                         Res->getType()->getScalarSizeInBits())
        Res = Builder.CreateTrunc(Res, ElemType);
      Builder.CreateStore(Res, ElemPtr);
      Ptr = Builder.CreateGEP(IntType, Ptr, {ConstantInt::get(IndexTy, 1)});
      ElemPtr =
          Builder.CreateGEP(IntType, ElemPtr, {ConstantInt::get(IndexTy, 1)});
    }
    // Remaining bytes after copying all full chunks of this width.
    Size = Size % IntSize;
  }
}
3019
3020Error OpenMPIRBuilder::emitReductionListCopy(
3021 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
3022 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
3023 ArrayRef<bool> IsByRef, CopyOptionsTy CopyOptions) {
3024 Type *IndexTy = Builder.getIndexTy(
3025 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3026 Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
3027
3028 // Iterates, element-by-element, through the source Reduce list and
3029 // make a copy.
3030 for (auto En : enumerate(ReductionInfos)) {
3031 const ReductionInfo &RI = En.value();
3032 Value *SrcElementAddr = nullptr;
3033 AllocaInst *DestAlloca = nullptr;
3034 Value *DestElementAddr = nullptr;
3035 Value *DestElementPtrAddr = nullptr;
3036 // Should we shuffle in an element from a remote lane?
3037 bool ShuffleInElement = false;
3038 // Set to true to update the pointer in the dest Reduce list to a
3039 // newly created element.
3040 bool UpdateDestListPtr = false;
3041
3042 // Step 1.1: Get the address for the src element in the Reduce list.
3043 Value *SrcElementPtrAddr = Builder.CreateInBoundsGEP(
3044 ReductionArrayTy, SrcBase,
3045 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3046 SrcElementAddr = Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrAddr);
3047
3048 // Step 1.2: Create a temporary to store the element in the destination
3049 // Reduce list.
3050 DestElementPtrAddr = Builder.CreateInBoundsGEP(
3051 ReductionArrayTy, DestBase,
3052 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3053 bool IsByRefElem = (!IsByRef.empty() && IsByRef[En.index()]);
3054 switch (Action) {
3056 InsertPointTy CurIP = Builder.saveIP();
3057 Builder.restoreIP(AllocaIP);
3058
3059 Type *DestAllocaType =
3060 IsByRefElem ? RI.ByRefAllocatedType : RI.ElementType;
3061 DestAlloca = Builder.CreateAlloca(DestAllocaType, nullptr,
3062 ".omp.reduction.element");
3063 DestAlloca->setAlignment(
3064 M.getDataLayout().getPrefTypeAlign(DestAllocaType));
3065 DestElementAddr = DestAlloca;
3066 DestElementAddr =
3067 Builder.CreateAddrSpaceCast(DestElementAddr, Builder.getPtrTy(),
3068 DestElementAddr->getName() + ".ascast");
3069 Builder.restoreIP(CurIP);
3070 ShuffleInElement = true;
3071 UpdateDestListPtr = true;
3072 break;
3073 }
3075 DestElementAddr =
3076 Builder.CreateLoad(Builder.getPtrTy(), DestElementPtrAddr);
3077 break;
3078 }
3079 }
3080
3081 // Now that all active lanes have read the element in the
3082 // Reduce list, shuffle over the value from the remote lane.
3083 if (ShuffleInElement) {
3084 Type *ShuffleType = RI.ElementType;
3085 Value *ShuffleSrcAddr = SrcElementAddr;
3086 Value *ShuffleDestAddr = DestElementAddr;
3087 AllocaInst *LocalStorage = nullptr;
3088
3089 if (IsByRefElem) {
3090 assert(RI.ByRefElementType && "Expected by-ref element type to be set");
3091 assert(RI.ByRefAllocatedType &&
3092 "Expected by-ref allocated type to be set");
3093 // For by-ref reductions, we need to copy from the remote lane the
3094 // actual value of the partial reduction computed by that remote lane;
3095 // rather than, for example, a pointer to that data or, even worse, a
3096 // pointer to the descriptor of the by-ref reduction element.
3097 ShuffleType = RI.ByRefElementType;
3098
3099 InsertPointOrErrorTy GenResult =
3100 RI.DataPtrPtrGen(Builder.saveIP(), ShuffleSrcAddr, ShuffleSrcAddr);
3101
3102 if (!GenResult)
3103 return GenResult.takeError();
3104
3105 ShuffleSrcAddr = Builder.CreateLoad(Builder.getPtrTy(), ShuffleSrcAddr);
3106
3107 {
3108 InsertPointTy OldIP = Builder.saveIP();
3109 Builder.restoreIP(AllocaIP);
3110
3111 LocalStorage = Builder.CreateAlloca(ShuffleType);
3112 Builder.restoreIP(OldIP);
3113 ShuffleDestAddr = LocalStorage;
3114 }
3115 }
3116
3117 shuffleAndStore(AllocaIP, ShuffleSrcAddr, ShuffleDestAddr, ShuffleType,
3118 RemoteLaneOffset, ReductionArrayTy, IsByRefElem);
3119
3120 if (IsByRefElem) {
3121 // Copy descriptor from source and update base_ptr to shuffled data
3122 Value *DestDescriptorAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3123 DestAlloca, Builder.getPtrTy(), ".ascast");
3124
3125 InsertPointOrErrorTy GenResult = generateReductionDescriptor(
3126 DestDescriptorAddr, LocalStorage, SrcElementAddr,
3127 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3128
3129 if (!GenResult)
3130 return GenResult.takeError();
3131 }
3132 } else {
3133 switch (RI.EvaluationKind) {
3134 case EvalKind::Scalar: {
3135 Value *Elem = Builder.CreateLoad(RI.ElementType, SrcElementAddr);
3136 // Store the source element value to the dest element address.
3137 Builder.CreateStore(Elem, DestElementAddr);
3138 break;
3139 }
3140 case EvalKind::Complex: {
3141 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3142 RI.ElementType, SrcElementAddr, 0, 0, ".realp");
3143 Value *SrcReal = Builder.CreateLoad(
3144 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3145 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3146 RI.ElementType, SrcElementAddr, 0, 1, ".imagp");
3147 Value *SrcImg = Builder.CreateLoad(
3148 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3149
3150 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3151 RI.ElementType, DestElementAddr, 0, 0, ".realp");
3152 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3153 RI.ElementType, DestElementAddr, 0, 1, ".imagp");
3154 Builder.CreateStore(SrcReal, DestRealPtr);
3155 Builder.CreateStore(SrcImg, DestImgPtr);
3156 break;
3157 }
3158 case EvalKind::Aggregate: {
3159 Value *SizeVal = Builder.getInt64(
3160 M.getDataLayout().getTypeStoreSize(RI.ElementType));
3161 Builder.CreateMemCpy(
3162 DestElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3163 SrcElementAddr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
3164 SizeVal, false);
3165 break;
3166 }
3167 };
3168 }
3169
3170 // Step 3.1: Modify reference in dest Reduce list as needed.
3171 // Modifying the reference in Reduce list to point to the newly
3172 // created element. The element is live in the current function
3173 // scope and that of functions it invokes (i.e., reduce_function).
3174 // RemoteReduceData[i] = (void*)&RemoteElem
3175 if (UpdateDestListPtr) {
3176 Value *CastDestAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3177 DestElementAddr, Builder.getPtrTy(),
3178 DestElementAddr->getName() + ".ascast");
3179 Builder.CreateStore(CastDestAddr, DestElementPtrAddr);
3180 }
3181 }
3182
3183 return Error::success();
3184}
3185
3186Expected<Function *> OpenMPIRBuilder::emitInterWarpCopyFunction(
3187 const LocationDescription &Loc, ArrayRef<ReductionInfo> ReductionInfos,
3188 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3189 InsertPointTy SavedIP = Builder.saveIP();
3190 LLVMContext &Ctx = M.getContext();
3191 FunctionType *FuncTy = FunctionType::get(
3192 Builder.getVoidTy(), {Builder.getPtrTy(), Builder.getInt32Ty()},
3193 /* IsVarArg */ false);
3194 Function *WcFunc =
3196 "_omp_reduction_inter_warp_copy_func", &M);
3197 WcFunc->setAttributes(FuncAttrs);
3198 WcFunc->addParamAttr(0, Attribute::NoUndef);
3199 WcFunc->addParamAttr(1, Attribute::NoUndef);
3200 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", WcFunc);
3201 Builder.SetInsertPoint(EntryBB);
3202
3203 // ReduceList: thread local Reduce list.
3204 // At the stage of the computation when this function is called, partially
3205 // aggregated values reside in the first lane of every active warp.
3206 Argument *ReduceListArg = WcFunc->getArg(0);
3207 // NumWarps: number of warps active in the parallel region. This could
3208 // be smaller than 32 (max warps in a CTA) for partial block reduction.
3209 Argument *NumWarpsArg = WcFunc->getArg(1);
3210
3211 // This array is used as a medium to transfer, one reduce element at a time,
3212 // the data from the first lane of every warp to lanes in the first warp
3213 // in order to perform the final step of a reduction in a parallel region
3214 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3215 // for reduced latency, as well as to have a distinct copy for concurrently
3216 // executing target regions. The array is declared with common linkage so
3217 // as to be shared across compilation units.
3218 StringRef TransferMediumName =
3219 "__openmp_nvptx_data_transfer_temporary_storage";
3220 GlobalVariable *TransferMedium = M.getGlobalVariable(TransferMediumName);
3221 unsigned WarpSize = Config.getGridValue().GV_Warp_Size;
3222 ArrayType *ArrayTy = ArrayType::get(Builder.getInt32Ty(), WarpSize);
3223 if (!TransferMedium) {
3224 TransferMedium = new GlobalVariable(
3225 M, ArrayTy, /*isConstant=*/false, GlobalVariable::WeakAnyLinkage,
3226 UndefValue::get(ArrayTy), TransferMediumName,
3227 /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
3228 /*AddressSpace=*/3);
3229 }
3230
3231 // Get the CUDA thread id of the current OpenMP thread on the GPU.
3232 Value *GPUThreadID = getGPUThreadID();
3233 // nvptx_lane_id = nvptx_id % warpsize
3234 Value *LaneID = getNVPTXLaneID();
3235 // nvptx_warp_id = nvptx_id / warpsize
3236 Value *WarpID = getNVPTXWarpID();
3237
3238 InsertPointTy AllocaIP =
3239 InsertPointTy(Builder.GetInsertBlock(),
3240 Builder.GetInsertBlock()->getFirstInsertionPt());
3241 Type *Arg0Type = ReduceListArg->getType();
3242 Type *Arg1Type = NumWarpsArg->getType();
3243 Builder.restoreIP(AllocaIP);
3244 AllocaInst *ReduceListAlloca = Builder.CreateAlloca(
3245 Arg0Type, nullptr, ReduceListArg->getName() + ".addr");
3246 AllocaInst *NumWarpsAlloca =
3247 Builder.CreateAlloca(Arg1Type, nullptr, NumWarpsArg->getName() + ".addr");
3248 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3249 ReduceListAlloca, Arg0Type, ReduceListAlloca->getName() + ".ascast");
3250 Value *NumWarpsAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3251 NumWarpsAlloca, Builder.getPtrTy(0),
3252 NumWarpsAlloca->getName() + ".ascast");
3253 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3254 Builder.CreateStore(NumWarpsArg, NumWarpsAddrCast);
3255 AllocaIP = getInsertPointAfterInstr(NumWarpsAlloca);
3256 InsertPointTy CodeGenIP =
3257 getInsertPointAfterInstr(&Builder.GetInsertBlock()->back());
3258 Builder.restoreIP(CodeGenIP);
3259
3260 Value *ReduceList =
3261 Builder.CreateLoad(Builder.getPtrTy(), ReduceListAddrCast);
3262
3263 for (auto En : enumerate(ReductionInfos)) {
3264 //
3265 // Warp master copies reduce element to transfer medium in __shared__
3266 // memory.
3267 //
3268 const ReductionInfo &RI = En.value();
3269 bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
3270 unsigned RealTySize = M.getDataLayout().getTypeAllocSize(
3271 IsByRefElem ? RI.ByRefElementType : RI.ElementType);
3272 for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /= 2) {
3273 Type *CType = Builder.getIntNTy(TySize * 8);
3274
3275 unsigned NumIters = RealTySize / TySize;
3276 if (NumIters == 0)
3277 continue;
3278 Value *Cnt = nullptr;
3279 Value *CntAddr = nullptr;
3280 BasicBlock *PrecondBB = nullptr;
3281 BasicBlock *ExitBB = nullptr;
3282 if (NumIters > 1) {
3283 CodeGenIP = Builder.saveIP();
3284 Builder.restoreIP(AllocaIP);
3285 CntAddr =
3286 Builder.CreateAlloca(Builder.getInt32Ty(), nullptr, ".cnt.addr");
3287
3288 CntAddr = Builder.CreateAddrSpaceCast(CntAddr, Builder.getPtrTy(),
3289 CntAddr->getName() + ".ascast");
3290 Builder.restoreIP(CodeGenIP);
3291 Builder.CreateStore(Constant::getNullValue(Builder.getInt32Ty()),
3292 CntAddr,
3293 /*Volatile=*/false);
3294 PrecondBB = BasicBlock::Create(Ctx, "precond");
3295 ExitBB = BasicBlock::Create(Ctx, "exit");
3296 BasicBlock *BodyBB = BasicBlock::Create(Ctx, "body");
3297 emitBlock(PrecondBB, Builder.GetInsertBlock()->getParent());
3298 Cnt = Builder.CreateLoad(Builder.getInt32Ty(), CntAddr,
3299 /*Volatile=*/false);
3300 Value *Cmp = Builder.CreateICmpULT(
3301 Cnt, ConstantInt::get(Builder.getInt32Ty(), NumIters));
3302 Builder.CreateCondBr(Cmp, BodyBB, ExitBB);
3303 emitBlock(BodyBB, Builder.GetInsertBlock()->getParent());
3304 }
3305
3306 // kmpc_barrier.
3307 InsertPointOrErrorTy BarrierIP1 =
3308 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3309 omp::Directive::OMPD_unknown,
3310 /* ForceSimpleCall */ false,
3311 /* CheckCancelFlag */ true);
3312 if (!BarrierIP1)
3313 return BarrierIP1.takeError();
3314 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3315 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3316 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3317
3318 // if (lane_id == 0)
3319 Value *IsWarpMaster = Builder.CreateIsNull(LaneID, "warp_master");
3320 Builder.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3321 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3322
3323 // Reduce element = LocalReduceList[i]
3324 auto *RedListArrayTy =
3325 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3326 Type *IndexTy = Builder.getIndexTy(
3327 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3328 Value *ElemPtrPtr =
3329 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3330 {ConstantInt::get(IndexTy, 0),
3331 ConstantInt::get(IndexTy, En.index())});
3332 // elemptr = ((CopyType*)(elemptrptr)) + I
3333 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3334
3335 if (IsByRefElem) {
3336 InsertPointOrErrorTy GenRes =
3337 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3338
3339 if (!GenRes)
3340 return GenRes.takeError();
3341
3342 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3343 }
3344
3345 if (NumIters > 1)
3346 ElemPtr = Builder.CreateGEP(Builder.getInt32Ty(), ElemPtr, Cnt);
3347
3348 // Get pointer to location in transfer medium.
3349 // MediumPtr = &medium[warp_id]
3350 Value *MediumPtr = Builder.CreateInBoundsGEP(
3351 ArrayTy, TransferMedium, {Builder.getInt64(0), WarpID});
3352 // elem = *elemptr
3353 //*MediumPtr = elem
3354 Value *Elem = Builder.CreateLoad(CType, ElemPtr);
3355 // Store the source element value to the dest element address.
3356 Builder.CreateStore(Elem, MediumPtr,
3357 /*IsVolatile*/ true);
3358 Builder.CreateBr(MergeBB);
3359
3360 // else
3361 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3362 Builder.CreateBr(MergeBB);
3363
3364 // endif
3365 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3366 InsertPointOrErrorTy BarrierIP2 =
3367 createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
3368 omp::Directive::OMPD_unknown,
3369 /* ForceSimpleCall */ false,
3370 /* CheckCancelFlag */ true);
3371 if (!BarrierIP2)
3372 return BarrierIP2.takeError();
3373
3374 // Warp 0 copies reduce element from transfer medium
3375 BasicBlock *W0ThenBB = BasicBlock::Create(Ctx, "then");
3376 BasicBlock *W0ElseBB = BasicBlock::Create(Ctx, "else");
3377 BasicBlock *W0MergeBB = BasicBlock::Create(Ctx, "ifcont");
3378
3379 Value *NumWarpsVal =
3380 Builder.CreateLoad(Builder.getInt32Ty(), NumWarpsAddrCast);
3381 // Up to 32 threads in warp 0 are active.
3382 Value *IsActiveThread =
3383 Builder.CreateICmpULT(GPUThreadID, NumWarpsVal, "is_active_thread");
3384 Builder.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3385
3386 emitBlock(W0ThenBB, Builder.GetInsertBlock()->getParent());
3387
3388 // SecMediumPtr = &medium[tid]
3389 // SrcMediumVal = *SrcMediumPtr
3390 Value *SrcMediumPtrVal = Builder.CreateInBoundsGEP(
3391 ArrayTy, TransferMedium, {Builder.getInt64(0), GPUThreadID});
3392 // TargetElemPtr = (CopyType*)(SrcDataAddr[i]) + I
3393 Value *TargetElemPtrPtr =
3394 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3395 {ConstantInt::get(IndexTy, 0),
3396 ConstantInt::get(IndexTy, En.index())});
3397 Value *TargetElemPtrVal =
3398 Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtrPtr);
3399 Value *TargetElemPtr = TargetElemPtrVal;
3400
3401 if (IsByRefElem) {
3402 InsertPointOrErrorTy GenRes =
3403 RI.DataPtrPtrGen(Builder.saveIP(), TargetElemPtr, TargetElemPtr);
3404
3405 if (!GenRes)
3406 return GenRes.takeError();
3407
3408 TargetElemPtr = Builder.CreateLoad(Builder.getPtrTy(), TargetElemPtr);
3409 }
3410
3411 if (NumIters > 1)
3412 TargetElemPtr =
3413 Builder.CreateGEP(Builder.getInt32Ty(), TargetElemPtr, Cnt);
3414
3415 // *TargetElemPtr = SrcMediumVal;
3416 Value *SrcMediumValue =
3417 Builder.CreateLoad(CType, SrcMediumPtrVal, /*IsVolatile*/ true);
3418 Builder.CreateStore(SrcMediumValue, TargetElemPtr);
3419 Builder.CreateBr(W0MergeBB);
3420
3421 emitBlock(W0ElseBB, Builder.GetInsertBlock()->getParent());
3422 Builder.CreateBr(W0MergeBB);
3423
3424 emitBlock(W0MergeBB, Builder.GetInsertBlock()->getParent());
3425
3426 if (NumIters > 1) {
3427 Cnt = Builder.CreateNSWAdd(
3428 Cnt, ConstantInt::get(Builder.getInt32Ty(), /*V=*/1));
3429 Builder.CreateStore(Cnt, CntAddr, /*Volatile=*/false);
3430
3431 auto *CurFn = Builder.GetInsertBlock()->getParent();
3432 emitBranch(PrecondBB);
3433 emitBlock(ExitBB, CurFn);
3434 }
3435 RealTySize %= TySize;
3436 }
3437 }
3438
3439 Builder.CreateRetVoid();
3440 Builder.restoreIP(SavedIP);
3441
3442 return WcFunc;
3443}
3444
3445Expected<Function *> OpenMPIRBuilder::emitShuffleAndReduceFunction(
3446 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3447 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3448 LLVMContext &Ctx = M.getContext();
3449 FunctionType *FuncTy =
3450 FunctionType::get(Builder.getVoidTy(),
3451 {Builder.getPtrTy(), Builder.getInt16Ty(),
3452 Builder.getInt16Ty(), Builder.getInt16Ty()},
3453 /* IsVarArg */ false);
3454 Function *SarFunc =
3456 "_omp_reduction_shuffle_and_reduce_func", &M);
3457 SarFunc->setAttributes(FuncAttrs);
3458 SarFunc->addParamAttr(0, Attribute::NoUndef);
3459 SarFunc->addParamAttr(1, Attribute::NoUndef);
3460 SarFunc->addParamAttr(2, Attribute::NoUndef);
3461 SarFunc->addParamAttr(3, Attribute::NoUndef);
3462 SarFunc->addParamAttr(1, Attribute::SExt);
3463 SarFunc->addParamAttr(2, Attribute::SExt);
3464 SarFunc->addParamAttr(3, Attribute::SExt);
3465 BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", SarFunc);
3466 Builder.SetInsertPoint(EntryBB);
3467
3468 // Thread local Reduce list used to host the values of data to be reduced.
3469 Argument *ReduceListArg = SarFunc->getArg(0);
3470 // Current lane id; could be logical.
3471 Argument *LaneIDArg = SarFunc->getArg(1);
3472 // Offset of the remote source lane relative to the current lane.
3473 Argument *RemoteLaneOffsetArg = SarFunc->getArg(2);
3474 // Algorithm version. This is expected to be known at compile time.
3475 Argument *AlgoVerArg = SarFunc->getArg(3);
3476
3477 Type *ReduceListArgType = ReduceListArg->getType();
3478 Type *LaneIDArgType = LaneIDArg->getType();
3479 Type *LaneIDArgPtrType = Builder.getPtrTy(0);
3480 Value *ReduceListAlloca = Builder.CreateAlloca(
3481 ReduceListArgType, nullptr, ReduceListArg->getName() + ".addr");
3482 Value *LaneIdAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3483 LaneIDArg->getName() + ".addr");
3484 Value *RemoteLaneOffsetAlloca = Builder.CreateAlloca(
3485 LaneIDArgType, nullptr, RemoteLaneOffsetArg->getName() + ".addr");
3486 Value *AlgoVerAlloca = Builder.CreateAlloca(LaneIDArgType, nullptr,
3487 AlgoVerArg->getName() + ".addr");
3488 ArrayType *RedListArrayTy =
3489 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3490
3491 // Create a local thread-private variable to host the Reduce list
3492 // from a remote lane.
3493 Instruction *RemoteReductionListAlloca = Builder.CreateAlloca(
3494 RedListArrayTy, nullptr, ".omp.reduction.remote_reduce_list");
3495
3496 Value *ReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3497 ReduceListAlloca, ReduceListArgType,
3498 ReduceListAlloca->getName() + ".ascast");
3499 Value *LaneIdAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3500 LaneIdAlloca, LaneIDArgPtrType, LaneIdAlloca->getName() + ".ascast");
3501 Value *RemoteLaneOffsetAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3502 RemoteLaneOffsetAlloca, LaneIDArgPtrType,
3503 RemoteLaneOffsetAlloca->getName() + ".ascast");
3504 Value *AlgoVerAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3505 AlgoVerAlloca, LaneIDArgPtrType, AlgoVerAlloca->getName() + ".ascast");
3506 Value *RemoteListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3507 RemoteReductionListAlloca, Builder.getPtrTy(),
3508 RemoteReductionListAlloca->getName() + ".ascast");
3509
3510 Builder.CreateStore(ReduceListArg, ReduceListAddrCast);
3511 Builder.CreateStore(LaneIDArg, LaneIdAddrCast);
3512 Builder.CreateStore(RemoteLaneOffsetArg, RemoteLaneOffsetAddrCast);
3513 Builder.CreateStore(AlgoVerArg, AlgoVerAddrCast);
3514
3515 Value *ReduceList = Builder.CreateLoad(ReduceListArgType, ReduceListAddrCast);
3516 Value *LaneId = Builder.CreateLoad(LaneIDArgType, LaneIdAddrCast);
3517 Value *RemoteLaneOffset =
3518 Builder.CreateLoad(LaneIDArgType, RemoteLaneOffsetAddrCast);
3519 Value *AlgoVer = Builder.CreateLoad(LaneIDArgType, AlgoVerAddrCast);
3520
3521 InsertPointTy AllocaIP = getInsertPointAfterInstr(RemoteReductionListAlloca);
3522
3523 // This loop iterates through the list of reduce elements and copies,
3524 // element by element, from a remote lane in the warp to RemoteReduceList,
3525 // hosted on the thread's stack.
3526 Error EmitRedLsCpRes = emitReductionListCopy(
3527 AllocaIP, CopyAction::RemoteLaneToThread, RedListArrayTy, ReductionInfos,
3528 ReduceList, RemoteListAddrCast, IsByRef,
3529 {RemoteLaneOffset, nullptr, nullptr});
3530
3531 if (EmitRedLsCpRes)
3532 return EmitRedLsCpRes;
3533
3534 // The actions to be performed on the Remote Reduce list is dependent
3535 // on the algorithm version.
3536 //
3537 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3538 // LaneId % 2 == 0 && Offset > 0):
3539 // do the reduction value aggregation
3540 //
3541 // The thread local variable Reduce list is mutated in place to host the
3542 // reduced data, which is the aggregated value produced from local and
3543 // remote lanes.
3544 //
3545 // Note that AlgoVer is expected to be a constant integer known at compile
3546 // time.
3547 // When AlgoVer==0, the first conjunction evaluates to true, making
3548 // the entire predicate true during compile time.
3549 // When AlgoVer==1, the second conjunction has only the second part to be
3550 // evaluated during runtime. Other conjunctions evaluates to false
3551 // during compile time.
3552 // When AlgoVer==2, the third conjunction has only the second part to be
3553 // evaluated during runtime. Other conjunctions evaluates to false
3554 // during compile time.
3555 Value *CondAlgo0 = Builder.CreateIsNull(AlgoVer);
3556 Value *Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3557 Value *LaneComp = Builder.CreateICmpULT(LaneId, RemoteLaneOffset);
3558 Value *CondAlgo1 = Builder.CreateAnd(Algo1, LaneComp);
3559 Value *Algo2 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(2));
3560 Value *LaneIdAnd1 = Builder.CreateAnd(LaneId, Builder.getInt16(1));
3561 Value *LaneIdComp = Builder.CreateIsNull(LaneIdAnd1);
3562 Value *Algo2AndLaneIdComp = Builder.CreateAnd(Algo2, LaneIdComp);
3563 Value *RemoteOffsetComp =
3564 Builder.CreateICmpSGT(RemoteLaneOffset, Builder.getInt16(0));
3565 Value *CondAlgo2 = Builder.CreateAnd(Algo2AndLaneIdComp, RemoteOffsetComp);
3566 Value *CA0OrCA1 = Builder.CreateOr(CondAlgo0, CondAlgo1);
3567 Value *CondReduce = Builder.CreateOr(CA0OrCA1, CondAlgo2);
3568
3569 BasicBlock *ThenBB = BasicBlock::Create(Ctx, "then");
3570 BasicBlock *ElseBB = BasicBlock::Create(Ctx, "else");
3571 BasicBlock *MergeBB = BasicBlock::Create(Ctx, "ifcont");
3572
3573 Builder.CreateCondBr(CondReduce, ThenBB, ElseBB);
3574 emitBlock(ThenBB, Builder.GetInsertBlock()->getParent());
3575 Value *LocalReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3576 ReduceList, Builder.getPtrTy());
3577 Value *RemoteReduceListPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
3578 RemoteListAddrCast, Builder.getPtrTy());
3579 createRuntimeFunctionCall(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr})
3580 ->addFnAttr(Attribute::NoUnwind);
3581 Builder.CreateBr(MergeBB);
3582
3583 emitBlock(ElseBB, Builder.GetInsertBlock()->getParent());
3584 Builder.CreateBr(MergeBB);
3585
3586 emitBlock(MergeBB, Builder.GetInsertBlock()->getParent());
3587
3588 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3589 // Reduce list.
3590 Algo1 = Builder.CreateICmpEQ(AlgoVer, Builder.getInt16(1));
3591 Value *LaneIdGtOffset = Builder.CreateICmpUGE(LaneId, RemoteLaneOffset);
3592 Value *CondCopy = Builder.CreateAnd(Algo1, LaneIdGtOffset);
3593
3594 BasicBlock *CpyThenBB = BasicBlock::Create(Ctx, "then");
3595 BasicBlock *CpyElseBB = BasicBlock::Create(Ctx, "else");
3596 BasicBlock *CpyMergeBB = BasicBlock::Create(Ctx, "ifcont");
3597 Builder.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3598
3599 emitBlock(CpyThenBB, Builder.GetInsertBlock()->getParent());
3600
3601 EmitRedLsCpRes = emitReductionListCopy(
3602 AllocaIP, CopyAction::ThreadCopy, RedListArrayTy, ReductionInfos,
3603 RemoteListAddrCast, ReduceList, IsByRef);
3604
3605 if (EmitRedLsCpRes)
3606 return EmitRedLsCpRes;
3607
3608 Builder.CreateBr(CpyMergeBB);
3609
3610 emitBlock(CpyElseBB, Builder.GetInsertBlock()->getParent());
3611 Builder.CreateBr(CpyMergeBB);
3612
3613 emitBlock(CpyMergeBB, Builder.GetInsertBlock()->getParent());
3614
3615 Builder.CreateRetVoid();
3616
3617 return SarFunc;
3618}
3619
3621OpenMPIRBuilder::generateReductionDescriptor(
3622 Value *DescriptorAddr, Value *DataPtr, Value *SrcDescriptorAddr,
3623 Type *DescriptorType,
3624 function_ref<InsertPointOrErrorTy(InsertPointTy, Value *, Value *&)>
3625 DataPtrPtrGen) {
3626
3627 // Copy the source descriptor to preserve all metadata (rank, extents,
3628 // strides, etc.)
3629 Value *DescriptorSize =
3630 Builder.getInt64(M.getDataLayout().getTypeStoreSize(DescriptorType));
3631 Builder.CreateMemCpy(
3632 DescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3633 SrcDescriptorAddr, M.getDataLayout().getPrefTypeAlign(DescriptorType),
3634 DescriptorSize);
3635
3636 // Update the base pointer field to point to the local shuffled data
3637 Value *DataPtrField;
3638 InsertPointOrErrorTy GenResult =
3639 DataPtrPtrGen(Builder.saveIP(), DescriptorAddr, DataPtrField);
3640
3641 if (!GenResult)
3642 return GenResult.takeError();
3643
3644 Builder.CreateStore(Builder.CreatePointerBitCastOrAddrSpaceCast(
3645 DataPtr, Builder.getPtrTy(), ".ascast"),
3646 DataPtrField);
3647
3648 return Builder.saveIP();
3649}
3650
3651Expected<Function *> OpenMPIRBuilder::emitListToGlobalCopyFunction(
3652 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3653 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3654 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3655 LLVMContext &Ctx = M.getContext();
3656 FunctionType *FuncTy = FunctionType::get(
3657 Builder.getVoidTy(),
3658 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3659 /* IsVarArg */ false);
3660 Function *LtGCFunc =
3662 "_omp_reduction_list_to_global_copy_func", &M);
3663 LtGCFunc->setAttributes(FuncAttrs);
3664 LtGCFunc->addParamAttr(0, Attribute::NoUndef);
3665 LtGCFunc->addParamAttr(1, Attribute::NoUndef);
3666 LtGCFunc->addParamAttr(2, Attribute::NoUndef);
3667
3668 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGCFunc);
3669 Builder.SetInsertPoint(EntryBlock);
3670
3671 // Buffer: global reduction buffer.
3672 Argument *BufferArg = LtGCFunc->getArg(0);
3673 // Idx: index of the buffer.
3674 Argument *IdxArg = LtGCFunc->getArg(1);
3675 // ReduceList: thread local Reduce list.
3676 Argument *ReduceListArg = LtGCFunc->getArg(2);
3677
3678 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3679 BufferArg->getName() + ".addr");
3680 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3681 IdxArg->getName() + ".addr");
3682 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3683 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3684 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3685 BufferArgAlloca, Builder.getPtrTy(),
3686 BufferArgAlloca->getName() + ".ascast");
3687 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3688 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3689 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3690 ReduceListArgAlloca, Builder.getPtrTy(),
3691 ReduceListArgAlloca->getName() + ".ascast");
3692
3693 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3694 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3695 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3696
3697 Value *LocalReduceList =
3698 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3699 Value *BufferArgVal =
3700 Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3701 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3702 Type *IndexTy = Builder.getIndexTy(
3703 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3704 for (auto En : enumerate(ReductionInfos)) {
3705 const ReductionInfo &RI = En.value();
3706 auto *RedListArrayTy =
3707 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3708 // Reduce element = LocalReduceList[i]
3709 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3710 RedListArrayTy, LocalReduceList,
3711 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3712 // elemptr = ((CopyType*)(elemptrptr)) + I
3713 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3714
3715 // Global = Buffer.VD[Idx];
3716 Value *BufferVD =
3717 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferArgVal, Idxs);
3718 Value *GlobVal = Builder.CreateConstInBoundsGEP2_32(
3719 ReductionsBufferTy, BufferVD, 0, En.index());
3720
3721 switch (RI.EvaluationKind) {
3722 case EvalKind::Scalar: {
3723 Value *TargetElement;
3724
3725 if (IsByRef.empty() || !IsByRef[En.index()]) {
3726 TargetElement = Builder.CreateLoad(RI.ElementType, ElemPtr);
3727 } else {
3728 InsertPointOrErrorTy GenResult =
3729 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3730
3731 if (!GenResult)
3732 return GenResult.takeError();
3733
3734 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3735 TargetElement = Builder.CreateLoad(RI.ByRefElementType, ElemPtr);
3736 }
3737
3738 Builder.CreateStore(TargetElement, GlobVal);
3739 break;
3740 }
3741 case EvalKind::Complex: {
3742 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3743 RI.ElementType, ElemPtr, 0, 0, ".realp");
3744 Value *SrcReal = Builder.CreateLoad(
3745 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3746 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3747 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3748 Value *SrcImg = Builder.CreateLoad(
3749 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3750
3751 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3752 RI.ElementType, GlobVal, 0, 0, ".realp");
3753 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3754 RI.ElementType, GlobVal, 0, 1, ".imagp");
3755 Builder.CreateStore(SrcReal, DestRealPtr);
3756 Builder.CreateStore(SrcImg, DestImgPtr);
3757 break;
3758 }
3759 case EvalKind::Aggregate: {
3760 Value *SizeVal =
3761 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
3762 Builder.CreateMemCpy(
3763 GlobVal, M.getDataLayout().getPrefTypeAlign(RI.ElementType), ElemPtr,
3764 M.getDataLayout().getPrefTypeAlign(RI.ElementType), SizeVal, false);
3765 break;
3766 }
3767 }
3768 }
3769
3770 Builder.CreateRetVoid();
3771 Builder.restoreIP(OldIP);
3772 return LtGCFunc;
3773}
3774
3775Expected<Function *> OpenMPIRBuilder::emitListToGlobalReduceFunction(
3776 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
3777 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3778 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3779 LLVMContext &Ctx = M.getContext();
3780 FunctionType *FuncTy = FunctionType::get(
3781 Builder.getVoidTy(),
3782 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3783 /* IsVarArg */ false);
3784 Function *LtGRFunc =
3786 "_omp_reduction_list_to_global_reduce_func", &M);
3787 LtGRFunc->setAttributes(FuncAttrs);
3788 LtGRFunc->addParamAttr(0, Attribute::NoUndef);
3789 LtGRFunc->addParamAttr(1, Attribute::NoUndef);
3790 LtGRFunc->addParamAttr(2, Attribute::NoUndef);
3791
3792 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", LtGRFunc);
3793 Builder.SetInsertPoint(EntryBlock);
3794
3795 // Buffer: global reduction buffer.
3796 Argument *BufferArg = LtGRFunc->getArg(0);
3797 // Idx: index of the buffer.
3798 Argument *IdxArg = LtGRFunc->getArg(1);
3799 // ReduceList: thread local Reduce list.
3800 Argument *ReduceListArg = LtGRFunc->getArg(2);
3801
3802 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3803 BufferArg->getName() + ".addr");
3804 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3805 IdxArg->getName() + ".addr");
3806 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3807 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3808 auto *RedListArrayTy =
3809 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3810
3811 // 1. Build a list of reduction variables.
3812 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3813 Value *LocalReduceList =
3814 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
3815
3816 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
3817
3818 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3819 BufferArgAlloca, Builder.getPtrTy(),
3820 BufferArgAlloca->getName() + ".ascast");
3821 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3822 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3823 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3824 ReduceListArgAlloca, Builder.getPtrTy(),
3825 ReduceListArgAlloca->getName() + ".ascast");
3826 Value *LocalReduceListAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3827 LocalReduceList, Builder.getPtrTy(),
3828 LocalReduceList->getName() + ".ascast");
3829
3830 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3831 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3832 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3833
3834 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3835 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3836 Type *IndexTy = Builder.getIndexTy(
3837 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3838 for (auto En : enumerate(ReductionInfos)) {
3839 const ReductionInfo &RI = En.value();
3840 Value *ByRefAlloc;
3841
3842 if (!IsByRef.empty() && IsByRef[En.index()]) {
3843 InsertPointTy OldIP = Builder.saveIP();
3844 Builder.restoreIP(AllocaIP);
3845
3846 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
3847 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
3848 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
3849
3850 Builder.restoreIP(OldIP);
3851 }
3852
3853 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
3854 RedListArrayTy, LocalReduceListAddrCast,
3855 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3856 Value *BufferVD =
3857 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3858 // Global = Buffer.VD[Idx];
3859 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3860 ReductionsBufferTy, BufferVD, 0, En.index());
3861
3862 if (!IsByRef.empty() && IsByRef[En.index()]) {
3863 // Get source descriptor from the reduce list argument
3864 Value *ReduceList =
3865 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3866 Value *SrcElementPtrPtr =
3867 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceList,
3868 {ConstantInt::get(IndexTy, 0),
3869 ConstantInt::get(IndexTy, En.index())});
3870 Value *SrcDescriptorAddr =
3871 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
3872
3873 // Copy descriptor from source and update base_ptr to global buffer data
3874 InsertPointOrErrorTy GenResult =
3875 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
3876 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
3877
3878 if (!GenResult)
3879 return GenResult.takeError();
3880
3881 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
3882 } else {
3883 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
3884 }
3885 }
3886
3887 // Call reduce_function(GlobalReduceList, ReduceList)
3888 Value *ReduceList =
3889 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3890 createRuntimeFunctionCall(ReduceFn, {LocalReduceListAddrCast, ReduceList})
3891 ->addFnAttr(Attribute::NoUnwind);
3892 Builder.CreateRetVoid();
3893 Builder.restoreIP(OldIP);
3894 return LtGRFunc;
3895}
3896
3897Expected<Function *> OpenMPIRBuilder::emitGlobalToListCopyFunction(
3898 ArrayRef<ReductionInfo> ReductionInfos, Type *ReductionsBufferTy,
3899 AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
3900 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
3901 LLVMContext &Ctx = M.getContext();
3902 FunctionType *FuncTy = FunctionType::get(
3903 Builder.getVoidTy(),
3904 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
3905 /* IsVarArg */ false);
3906 Function *GtLCFunc =
3908 "_omp_reduction_global_to_list_copy_func", &M);
3909 GtLCFunc->setAttributes(FuncAttrs);
3910 GtLCFunc->addParamAttr(0, Attribute::NoUndef);
3911 GtLCFunc->addParamAttr(1, Attribute::NoUndef);
3912 GtLCFunc->addParamAttr(2, Attribute::NoUndef);
3913
3914 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLCFunc);
3915 Builder.SetInsertPoint(EntryBlock);
3916
3917 // Buffer: global reduction buffer.
3918 Argument *BufferArg = GtLCFunc->getArg(0);
3919 // Idx: index of the buffer.
3920 Argument *IdxArg = GtLCFunc->getArg(1);
3921 // ReduceList: thread local Reduce list.
3922 Argument *ReduceListArg = GtLCFunc->getArg(2);
3923
3924 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
3925 BufferArg->getName() + ".addr");
3926 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
3927 IdxArg->getName() + ".addr");
3928 Value *ReduceListArgAlloca = Builder.CreateAlloca(
3929 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
3930 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3931 BufferArgAlloca, Builder.getPtrTy(),
3932 BufferArgAlloca->getName() + ".ascast");
3933 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3934 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
3935 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
3936 ReduceListArgAlloca, Builder.getPtrTy(),
3937 ReduceListArgAlloca->getName() + ".ascast");
3938 Builder.CreateStore(BufferArg, BufferArgAddrCast);
3939 Builder.CreateStore(IdxArg, IdxArgAddrCast);
3940 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
3941
3942 Value *LocalReduceList =
3943 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
3944 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
3945 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
3946 Type *IndexTy = Builder.getIndexTy(
3947 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
3948 for (auto En : enumerate(ReductionInfos)) {
3949 const OpenMPIRBuilder::ReductionInfo &RI = En.value();
3950 auto *RedListArrayTy =
3951 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
3952 // Reduce element = LocalReduceList[i]
3953 Value *ElemPtrPtr = Builder.CreateInBoundsGEP(
3954 RedListArrayTy, LocalReduceList,
3955 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
3956 // elemptr = ((CopyType*)(elemptrptr)) + I
3957 Value *ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtrPtr);
3958 // Global = Buffer.VD[Idx];
3959 Value *BufferVD =
3960 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
3961 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
3962 ReductionsBufferTy, BufferVD, 0, En.index());
3963
3964 switch (RI.EvaluationKind) {
3965 case EvalKind::Scalar: {
3966 Type *ElemType = RI.ElementType;
3967
3968 if (!IsByRef.empty() && IsByRef[En.index()]) {
3969 ElemType = RI.ByRefElementType;
3970 InsertPointOrErrorTy GenResult =
3971 RI.DataPtrPtrGen(Builder.saveIP(), ElemPtr, ElemPtr);
3972
3973 if (!GenResult)
3974 return GenResult.takeError();
3975
3976 ElemPtr = Builder.CreateLoad(Builder.getPtrTy(), ElemPtr);
3977 }
3978
3979 Value *TargetElement = Builder.CreateLoad(ElemType, GlobValPtr);
3980 Builder.CreateStore(TargetElement, ElemPtr);
3981 break;
3982 }
3983 case EvalKind::Complex: {
3984 Value *SrcRealPtr = Builder.CreateConstInBoundsGEP2_32(
3985 RI.ElementType, GlobValPtr, 0, 0, ".realp");
3986 Value *SrcReal = Builder.CreateLoad(
3987 RI.ElementType->getStructElementType(0), SrcRealPtr, ".real");
3988 Value *SrcImgPtr = Builder.CreateConstInBoundsGEP2_32(
3989 RI.ElementType, GlobValPtr, 0, 1, ".imagp");
3990 Value *SrcImg = Builder.CreateLoad(
3991 RI.ElementType->getStructElementType(1), SrcImgPtr, ".imag");
3992
3993 Value *DestRealPtr = Builder.CreateConstInBoundsGEP2_32(
3994 RI.ElementType, ElemPtr, 0, 0, ".realp");
3995 Value *DestImgPtr = Builder.CreateConstInBoundsGEP2_32(
3996 RI.ElementType, ElemPtr, 0, 1, ".imagp");
3997 Builder.CreateStore(SrcReal, DestRealPtr);
3998 Builder.CreateStore(SrcImg, DestImgPtr);
3999 break;
4000 }
4001 case EvalKind::Aggregate: {
4002 Value *SizeVal =
4003 Builder.getInt64(M.getDataLayout().getTypeStoreSize(RI.ElementType));
4004 Builder.CreateMemCpy(
4005 ElemPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4006 GlobValPtr, M.getDataLayout().getPrefTypeAlign(RI.ElementType),
4007 SizeVal, false);
4008 break;
4009 }
4010 }
4011 }
4012
4013 Builder.CreateRetVoid();
4014 Builder.restoreIP(OldIP);
4015 return GtLCFunc;
4016}
4017
4018Expected<Function *> OpenMPIRBuilder::emitGlobalToListReduceFunction(
4019 ArrayRef<ReductionInfo> ReductionInfos, Function *ReduceFn,
4020 Type *ReductionsBufferTy, AttributeList FuncAttrs, ArrayRef<bool> IsByRef) {
4021 OpenMPIRBuilder::InsertPointTy OldIP = Builder.saveIP();
4022 LLVMContext &Ctx = M.getContext();
4023 auto *FuncTy = FunctionType::get(
4024 Builder.getVoidTy(),
4025 {Builder.getPtrTy(), Builder.getInt32Ty(), Builder.getPtrTy()},
4026 /* IsVarArg */ false);
4027 Function *GtLRFunc =
4029 "_omp_reduction_global_to_list_reduce_func", &M);
4030 GtLRFunc->setAttributes(FuncAttrs);
4031 GtLRFunc->addParamAttr(0, Attribute::NoUndef);
4032 GtLRFunc->addParamAttr(1, Attribute::NoUndef);
4033 GtLRFunc->addParamAttr(2, Attribute::NoUndef);
4034
4035 BasicBlock *EntryBlock = BasicBlock::Create(Ctx, "entry", GtLRFunc);
4036 Builder.SetInsertPoint(EntryBlock);
4037
4038 // Buffer: global reduction buffer.
4039 Argument *BufferArg = GtLRFunc->getArg(0);
4040 // Idx: index of the buffer.
4041 Argument *IdxArg = GtLRFunc->getArg(1);
4042 // ReduceList: thread local Reduce list.
4043 Argument *ReduceListArg = GtLRFunc->getArg(2);
4044
4045 Value *BufferArgAlloca = Builder.CreateAlloca(Builder.getPtrTy(), nullptr,
4046 BufferArg->getName() + ".addr");
4047 Value *IdxArgAlloca = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
4048 IdxArg->getName() + ".addr");
4049 Value *ReduceListArgAlloca = Builder.CreateAlloca(
4050 Builder.getPtrTy(), nullptr, ReduceListArg->getName() + ".addr");
4051 ArrayType *RedListArrayTy =
4052 ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
4053
4054 // 1. Build a list of reduction variables.
4055 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
4056 Value *LocalReduceList =
4057 Builder.CreateAlloca(RedListArrayTy, nullptr, ".omp.reduction.red_list");
4058
4059 InsertPointTy AllocaIP{EntryBlock, EntryBlock->begin()};
4060
4061 Value *BufferArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4062 BufferArgAlloca, Builder.getPtrTy(),
4063 BufferArgAlloca->getName() + ".ascast");
4064 Value *IdxArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4065 IdxArgAlloca, Builder.getPtrTy(), IdxArgAlloca->getName() + ".ascast");
4066 Value *ReduceListArgAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
4067 ReduceListArgAlloca, Builder.getPtrTy(),
4068 ReduceListArgAlloca->getName() + ".ascast");
4069 Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
4070 LocalReduceList, Builder.getPtrTy(),
4071 LocalReduceList->getName() + ".ascast");
4072
4073 Builder.CreateStore(BufferArg, BufferArgAddrCast);
4074 Builder.CreateStore(IdxArg, IdxArgAddrCast);
4075 Builder.CreateStore(ReduceListArg, ReduceListArgAddrCast);
4076
4077 Value *BufferVal = Builder.CreateLoad(Builder.getPtrTy(), BufferArgAddrCast);
4078 Value *Idxs[] = {Builder.CreateLoad(Builder.getInt32Ty(), IdxArgAddrCast)};
4079 Type *IndexTy = Builder.getIndexTy(
4080 M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
4081 for (auto En : enumerate(ReductionInfos)) {
4082 const ReductionInfo &RI = En.value();
4083 Value *ByRefAlloc;
4084
4085 if (!IsByRef.empty() && IsByRef[En.index()]) {
4086 InsertPointTy OldIP = Builder.saveIP();
4087 Builder.restoreIP(AllocaIP);
4088
4089 ByRefAlloc = Builder.CreateAlloca(RI.ByRefAllocatedType);
4090 ByRefAlloc = Builder.CreatePointerBitCastOrAddrSpaceCast(
4091 ByRefAlloc, Builder.getPtrTy(), ByRefAlloc->getName() + ".ascast");
4092
4093 Builder.restoreIP(OldIP);
4094 }
4095
4096 Value *TargetElementPtrPtr = Builder.CreateInBoundsGEP(
4097 RedListArrayTy, ReductionList,
4098 {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
4099 // Global = Buffer.VD[Idx];
4100 Value *BufferVD =
4101 Builder.CreateInBoundsGEP(ReductionsBufferTy, BufferVal, Idxs);
4102 Value *GlobValPtr = Builder.CreateConstInBoundsGEP2_32(
4103 ReductionsBufferTy, BufferVD, 0, En.index());
4104
4105 if (!IsByRef.empty() && IsByRef[En.index()]) {
4106 // Get source descriptor from the reduce list
4107 Value *ReduceListVal =
4108 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4109 Value *SrcElementPtrPtr =
4110 Builder.CreateInBoundsGEP(RedListArrayTy, ReduceListVal,
4111 {ConstantInt::get(IndexTy, 0),
4112 ConstantInt::get(IndexTy, En.index())});
4113 Value *SrcDescriptorAddr =
4114 Builder.CreateLoad(Builder.getPtrTy(), SrcElementPtrPtr);
4115
4116 // Copy descriptor from source and update base_ptr to global buffer data
4117 InsertPointOrErrorTy GenResult =
4118 generateReductionDescriptor(ByRefAlloc, GlobValPtr, SrcDescriptorAddr,
4119 RI.ByRefAllocatedType, RI.DataPtrPtrGen);
4120 if (!GenResult)
4121 return GenResult.takeError();
4122
4123 Builder.CreateStore(ByRefAlloc, TargetElementPtrPtr);
4124 } else {
4125 Builder.CreateStore(GlobValPtr, TargetElementPtrPtr);
4126 }
4127 }
4128
4129 // Call reduce_function(ReduceList, GlobalReduceList)
4130 Value *ReduceList =
4131 Builder.CreateLoad(Builder.getPtrTy(), ReduceListArgAddrCast);
4132 createRuntimeFunctionCall(ReduceFn, {ReduceList, ReductionList})
4133 ->addFnAttr(Attribute::NoUnwind);
4134 Builder.CreateRetVoid();
4135 Builder.restoreIP(OldIP);
4136 return GtLRFunc;
4137}
4138
4139std::string OpenMPIRBuilder::getReductionFuncName(StringRef Name) const {
4140 std::string Suffix =
4141 createPlatformSpecificName({"omp", "reduction", "reduction_func"});
4142 return (Name + Suffix).str();
4143}
4144
/// Create the outlined reduction function used by the device reduction
/// lowering. The function has signature void(ptr, ptr): both arguments are
/// type-erased arrays of pointers to the per-variable reduction data (LHS
/// and RHS), which are combined element-wise via each ReductionInfo's
/// generator callback.
///
/// \param ReducerName     Base name; the platform-specific reduction suffix
///                        is appended (see getReductionFuncName).
/// \param ReductionInfos  One entry per reduction variable.
/// \param FuncAttrs       Attributes applied to the created function.
///
/// \returns the created function, or an error propagated from a reduction
///          generator callback.
Expected<Function *> OpenMPIRBuilder::createReductionFunction(
    StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
    // NOTE(review): a signature line appears to be missing from this copy
    // (IsByRef / ReductionGenCBKind are used below but not declared here) --
    // verify against upstream.
    AttributeList FuncAttrs) {
  // void(ptr, ptr): two type-erased pointer-array parameters.
  auto *FuncTy = FunctionType::get(Builder.getVoidTy(),
                                   {Builder.getPtrTy(), Builder.getPtrTy()},
                                   /* IsVarArg */ false);
  std::string Name = getReductionFuncName(ReducerName);
  Function *ReductionFunc =
  // NOTE(review): the Function::Create(...) line appears to be missing from
  // this copy -- verify against upstream.
  ReductionFunc->setAttributes(FuncAttrs);
  // The runtime always passes valid pointers for both parameters.
  ReductionFunc->addParamAttr(0, Attribute::NoUndef);
  ReductionFunc->addParamAttr(1, Attribute::NoUndef);
  BasicBlock *EntryBB =
      BasicBlock::Create(M.getContext(), "entry", ReductionFunc);
  Builder.SetInsertPoint(EntryBB);

  // Need to alloca memory here and deal with the pointers before getting
  // LHS/RHS pointers out
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;
  Argument *Arg0 = ReductionFunc->getArg(0);
  Argument *Arg1 = ReductionFunc->getArg(1);
  Type *Arg0Type = Arg0->getType();
  Type *Arg1Type = Arg1->getType();

  // Spill the incoming arguments into stack slots and reload them; the
  // address-space casts keep the slot addresses usable from the argument's
  // pointer type.
  Value *LHSAlloca =
      Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
  Value *RHSAlloca =
      Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
  Value *LHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      LHSAlloca, Arg0Type, LHSAlloca->getName() + ".ascast");
  Value *RHSAddrCast = Builder.CreatePointerBitCastOrAddrSpaceCast(
      RHSAlloca, Arg1Type, RHSAlloca->getName() + ".ascast");
  Builder.CreateStore(Arg0, LHSAddrCast);
  Builder.CreateStore(Arg1, RHSAddrCast);
  LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
  RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);

  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), ReductionInfos.size());
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  SmallVector<Value *> LHSPtrs, RHSPtrs;
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // Load the i-th type-erased element pointer out of each array and cast
    // it back to the reduction variable's pointer type.
    Value *RHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, RHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType(),
        RHSI8Ptr->getName() + ".ascast");

    Value *LHSI8PtrPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, LHSArrayPtr,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType(), LHSI8Ptr->getName() + ".ascast");

    // NOTE(review): the condition opening this branch (presumably a
    // generator-kind check) appears to be missing from this copy.
      // Remember the recovered pointers for the fixup loop below.
      LHSPtrs.emplace_back(LHSPtr);
      RHSPtrs.emplace_back(RHSPtr);
    } else {
      Value *LHS = LHSPtr;
      Value *RHS = RHSPtr;

      // For by-value reductions, operate on the loaded element values
      // rather than the element addresses.
      if (!IsByRef.empty() && !IsByRef[En.index()]) {
        LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
        RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
      }

      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      // The generator may clear the insert point; bail out early then.
      if (!Builder.GetInsertBlock())
        return ReductionFunc;

      Builder.restoreIP(*AfterIP);

      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, LHSPtr);
    }
  }

  // NOTE(review): a guard before this fixup loop appears to be missing from
  // this copy -- verify against upstream.
  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *LHSFixupPtr, *RHSFixupPtr;
    Builder.restoreIP(RI.ReductionGenClang(
        Builder.saveIP(), Index, &LHSFixupPtr, &RHSFixupPtr, ReductionFunc));

    // Fix the callback code generated to use the correct Values for the LHS
    // and RHS; only uses inside ReductionFunc are rewritten.
    LHSFixupPtr->replaceUsesWithIf(
        LHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
    RHSFixupPtr->replaceUsesWithIf(
        RHSPtrs[Index], [ReductionFunc](const Use &U) {
          return cast<Instruction>(U.getUser())->getParent()->getParent() ==
                 ReductionFunc;
        });
  }

  Builder.CreateRetVoid();
  // Compiling with `-O0`, `alloca`s emitted in non-entry blocks are not hoisted
  // to the entry block (this is done for higher opt levels by later passes in
  // the pipeline). This has caused issues because non-entry `alloca`s force the
  // function to use dynamic stack allocations and we might run out of scratch
  // memory.
  hoistNonEntryAllocasToEntryBlock(ReductionFunc);

  return ReductionFunc;
}
4264
/// Assert-only sanity checks that every ReductionInfo entry carries the
/// fields the reduction lowering relies on (non-null variables and at least
/// one generator callback; on the host, matching variable types).
// NOTE(review): the line naming this function and declaring its
// ReductionInfos parameter appears to be missing from this copy -- verify
// against upstream.
static void
                         bool IsGPU) {
  for (const OpenMPIRBuilder::ReductionInfo &RI : ReductionInfos) {
    (void)RI; // Silence unused-variable warnings in NDEBUG builds.
    assert(RI.Variable && "expected non-null variable");
    assert(RI.PrivateVariable && "expected non-null private variable");
    assert((RI.ReductionGen || RI.ReductionGenClang) &&
           "expected non-null reduction generator callback");
    // On the GPU the original and private copies may legitimately live in
    // different address spaces, so the type equality check is host-only.
    if (!IsGPU) {
      assert(
          RI.Variable->getType() == RI.PrivateVariable->getType() &&
          "expected variables and their private equivalents to have the same "
          "type");
    }
    assert(RI.Variable->getType()->isPointerTy() &&
           "expected variables to be pointers");
  }
}
4284
/// Lower reductions for GPU targets: build the type-erased list of private
/// reduction values, create the shuffle-and-reduce and inter-warp copy
/// helpers (plus the list<->global-buffer helpers for teams reductions),
/// call the matching __kmpc_nvptx_*_reduce_nowait_v2 runtime entry point,
/// and emit the final copy-back on the winning thread.
// NOTE(review): the first line of this definition's signature appears to be
// missing from this copy -- verify against upstream.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
    ArrayRef<bool> IsByRef, bool IsNoWait, bool IsTeamsReduction,
    ReductionGenCBKind ReductionGenCBKind, std::optional<omp::GV> GridValue,
    unsigned ReductionBufNum, Value *SrcLocInfo) {
  if (!updateToLocation(Loc))
    return InsertPointTy();
  Builder.restoreIP(CodeGenIP);
  checkReductionInfos(ReductionInfos, /*IsGPU*/ true);
  LLVMContext &Ctx = M.getContext();

  // Source location for the ident struct
  if (!SrcLocInfo) {
    uint32_t SrcLocStrSize;
    Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
    SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  }

  // Nothing to reduce.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  BasicBlock *ContinuationBlock = nullptr;
  // NOTE(review): the condition guarding this block-splitting code appears
  // to be missing from this copy -- verify against upstream.
    // Copied code from createReductions
    BasicBlock *InsertBlock = Loc.IP.getBlock();
    ContinuationBlock =
        InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
    InsertBlock->getTerminator()->eraseFromParent();
    Builder.SetInsertPoint(InsertBlock, InsertBlock->end());
  }

  // Clone the current function's attributes for the helper functions, but
  // drop OptimizeNone so they remain optimizable.
  Function *CurFunc = Builder.GetInsertBlock()->getParent();
  AttributeList FuncAttrs;
  AttrBuilder AttrBldr(Ctx);
  for (auto Attr : CurFunc->getAttributes().getFnAttrs())
    AttrBldr.addAttribute(Attr);
  AttrBldr.removeAttribute(Attribute::OptimizeNone);
  FuncAttrs = FuncAttrs.addFnAttributes(Ctx, AttrBldr);

  CodeGenIP = Builder.saveIP();
  Expected<Function *> ReductionResult = createReductionFunction(
      Builder.GetInsertBlock()->getParent()->getName(), ReductionInfos, IsByRef,
      ReductionGenCBKind, FuncAttrs);
  if (!ReductionResult)
    return ReductionResult.takeError();
  Function *ReductionFunc = *ReductionResult;
  Builder.restoreIP(CodeGenIP);

  // Set the grid value in the config needed for lowering later on
  if (GridValue.has_value())
    Config.setGridValue(GridValue.value());
  else
    Config.setGridValue(getGridValue(T, ReductionFunc));

  // Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  // or
  // Build res = __kmpc_reduce_teams_nowait_simple(<loc>, <gtid>, <lck>);
  Value *Res;

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = ReductionInfos.size();
  Type *PtrTy = PointerType::get(Ctx, Config.getDefaultTargetAS());
  Type *FuncPtrTy =
      Builder.getPtrTy(M.getDataLayout().getProgramAddressSpace());
  Type *RedArrayTy = ArrayType::get(PtrTy, Size);
  // The red_list array itself must live at the alloca insertion point.
  CodeGenIP = Builder.saveIP();
  Builder.restoreIP(AllocaIP);
  Value *ReductionListAlloca =
      Builder.CreateAlloca(RedArrayTy, nullptr, ".omp.reduction.red_list");
  Value *ReductionList = Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionListAlloca, PtrTy, ReductionListAlloca->getName() + ".ascast");
  Builder.restoreIP(CodeGenIP);
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    Value *ElemPtr = Builder.CreateInBoundsGEP(
        RedArrayTy, ReductionList,
        {ConstantInt::get(IndexTy, 0), ConstantInt::get(IndexTy, En.index())});

    Value *PrivateVar = RI.PrivateVariable;
    bool IsByRefElem = !IsByRef.empty() && IsByRef[En.index()];
    // For by-ref elements the private variable holds the address of the
    // data; load it before storing into the list.
    if (IsByRefElem)
      PrivateVar = Builder.CreateLoad(RI.ElementType, PrivateVar);

    Value *CastElem =
        Builder.CreatePointerBitCastOrAddrSpaceCast(PrivateVar, PtrTy);
    Builder.CreateStore(CastElem, ElemPtr);
  }
  CodeGenIP = Builder.saveIP();
  Expected<Function *> SarFunc = emitShuffleAndReduceFunction(
      ReductionInfos, ReductionFunc, FuncAttrs, IsByRef);

  if (!SarFunc)
    return SarFunc.takeError();

  Expected<Function *> CopyResult =
      emitInterWarpCopyFunction(Loc, ReductionInfos, FuncAttrs, IsByRef);
  if (!CopyResult)
    return CopyResult.takeError();
  Function *WcFunc = *CopyResult;
  Builder.restoreIP(CodeGenIP);

  Value *RL = Builder.CreatePointerBitCastOrAddrSpaceCast(ReductionList, PtrTy);

  // Compute the largest element store size and collect the per-variable
  // element types (by-ref entries use their ByRefElementType).
  unsigned MaxDataSize = 0;
  SmallVector<Type *> ReductionTypeArgs;
  for (auto En : enumerate(ReductionInfos)) {
    auto Size = M.getDataLayout().getTypeStoreSize(En.value().ElementType);
    if (Size > MaxDataSize)
      MaxDataSize = Size;
    Type *RedTypeArg = (!IsByRef.empty() && IsByRef[En.index()])
                           ? En.value().ByRefElementType
                           : En.value().ElementType;
    ReductionTypeArgs.emplace_back(RedTypeArg);
  }
  Value *ReductionDataSize =
      Builder.getInt64(MaxDataSize * ReductionInfos.size());
  if (!IsTeamsReduction) {
    // Parallel reduction path.
    Value *SarFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(*SarFunc, FuncPtrTy);
    Value *WcFuncCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(WcFunc, FuncPtrTy);
    Value *Args[] = {SrcLocInfo, ReductionDataSize, RL, SarFuncCast,
                     WcFuncCast};
    // NOTE(review): the getOrCreateRuntimeFunctionPtr(...) line defining
    // Pv2Ptr appears to be missing from this copy.
        RuntimeFunction::OMPRTL___kmpc_nvptx_parallel_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(Pv2Ptr, Args);
  } else {
    // Teams reduction path: stage partial results through a fixed global
    // buffer using the four list<->global helper functions.
    CodeGenIP = Builder.saveIP();
    StructType *ReductionsBufferTy = StructType::create(
        Ctx, ReductionTypeArgs, "struct._globalized_locals_ty");
    Function *RedFixedBufferFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_reduction_get_fixed_buffer);

    Expected<Function *> LtGCFunc = emitListToGlobalCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!LtGCFunc)
      return LtGCFunc.takeError();

    Expected<Function *> LtGRFunc = emitListToGlobalReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!LtGRFunc)
      return LtGRFunc.takeError();

    Expected<Function *> GtLCFunc = emitGlobalToListCopyFunction(
        ReductionInfos, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!GtLCFunc)
      return GtLCFunc.takeError();

    Expected<Function *> GtLRFunc = emitGlobalToListReduceFunction(
        ReductionInfos, ReductionFunc, ReductionsBufferTy, FuncAttrs, IsByRef);
    if (!GtLRFunc)
      return GtLRFunc.takeError();

    Builder.restoreIP(CodeGenIP);

    Value *KernelTeamsReductionPtr = createRuntimeFunctionCall(
        RedFixedBufferFn, {}, "_openmp_teams_reductions_buffer_$_$ptr");

    Value *Args3[] = {SrcLocInfo,
                      KernelTeamsReductionPtr,
                      Builder.getInt32(ReductionBufNum),
                      ReductionDataSize,
                      RL,
                      *SarFunc,
                      WcFunc,
                      *LtGCFunc,
                      *LtGRFunc,
                      *GtLCFunc,
                      *GtLRFunc};

    Function *TeamsReduceFn = getOrCreateRuntimeFunctionPtr(
        RuntimeFunction::OMPRTL___kmpc_nvptx_teams_reduce_nowait_v2);
    Res = createRuntimeFunctionCall(TeamsReduceFn, Args3);
  }

  // 5. Build if (res == 1)
  BasicBlock *ExitBB = BasicBlock::Create(Ctx, ".omp.reduction.done");
  BasicBlock *ThenBB = BasicBlock::Create(Ctx, ".omp.reduction.then");
  Value *Cond = Builder.CreateICmpEQ(Res, Builder.getInt32(1));
  Builder.CreateCondBr(Cond, ThenBB, ExitBB);

  // 6. Build then branch: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  emitBlock(ThenBB, CurFunc);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // NOTE(review): a line appears to be missing here in this copy
    // (ValueType is used below but not defined) -- verify against upstream.
    Value *RedValue = RI.Variable;
    Value *RHS =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RI.PrivateVariable, PtrTy);

    // NOTE(review): the condition opening this branch (presumably a
    // generator-kind check) appears to be missing from this copy.
      Value *LHSPtr, *RHSPtr;
      Builder.restoreIP(RI.ReductionGenClang(Builder.saveIP(), En.index(),
                                             &LHSPtr, &RHSPtr, CurFunc));

      // Fix the callback code generated to use the correct Values for the
      // LHS and RHS; only uses inside ReductionFunc are rewritten.
      LHSPtr->replaceUsesWithIf(RedValue, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
      RHSPtr->replaceUsesWithIf(RHS, [ReductionFunc](const Use &U) {
        return cast<Instruction>(U.getUser())->getParent()->getParent() ==
               ReductionFunc;
      });
    } else {
      if (IsByRef.empty() || !IsByRef[En.index()]) {
        RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                      "red.value." + Twine(En.index()));
      }
      Value *PrivateRedValue = Builder.CreateLoad(
          ValueType, RHS, "red.private.value" + Twine(En.index()));
      Value *Reduced;
      InsertPointOrErrorTy AfterIP =
          RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);

      if (!IsByRef.empty() && !IsByRef[En.index()])
        Builder.CreateStore(Reduced, RI.Variable);
    }
  }
  emitBlock(ExitBB, CurFunc);
  if (ContinuationBlock) {
    Builder.CreateBr(ContinuationBlock);
    Builder.SetInsertPoint(ContinuationBlock);
  }
  Config.setEmitLLVMUsed();

  return Builder.saveIP();
}
4527
// Create a fresh internal void(ptr, ptr) function named
// ".omp.reduction.func" in the module; its body is filled in later by
// populateReductionFunction.
// NOTE(review): the line with this function's signature appears to be
// missing from this copy -- verify against upstream.
  Type *VoidTy = Type::getVoidTy(M.getContext());
  Type *Int8PtrTy = PointerType::getUnqual(M.getContext());
  auto *FuncTy =
      FunctionType::get(VoidTy, {Int8PtrTy, Int8PtrTy}, /* IsVarArg */ false);
  // NOTE(review): the Function::Create(...) call head appears to be missing
  // from this copy.
                      ".omp.reduction.func", &M);
}
4536
// Fill in the body of an outlined reduction function: extract each pair of
// partial values from the two type-erased pointer arrays and combine them
// with the per-variable reduction generator, storing the result through the
// LHS pointer (unless the element is by-ref, where the store happens inside
// the reduction region itself).
// NOTE(review): the opening lines of this signature (function name and the
// ReductionInfos parameter) appear to be missing from this copy -- verify
// against upstream.
                                       Function *ReductionFunc,
    IRBuilder<> &Builder, ArrayRef<bool> IsByRef, bool IsGPU) {
  Module *Module = ReductionFunc->getParent();
  BasicBlock *ReductionFuncBlock =
      BasicBlock::Create(Module->getContext(), "", ReductionFunc);
  Builder.SetInsertPoint(ReductionFuncBlock);
  Value *LHSArrayPtr = nullptr;
  Value *RHSArrayPtr = nullptr;
  if (IsGPU) {
    // Need to alloca memory here and deal with the pointers before getting
    // LHS/RHS pointers out
    //
    Argument *Arg0 = ReductionFunc->getArg(0);
    Argument *Arg1 = ReductionFunc->getArg(1);
    Type *Arg0Type = Arg0->getType();
    Type *Arg1Type = Arg1->getType();

    Value *LHSAlloca =
        Builder.CreateAlloca(Arg0Type, nullptr, Arg0->getName() + ".addr");
    Value *RHSAlloca =
        Builder.CreateAlloca(Arg1Type, nullptr, Arg1->getName() + ".addr");
    Value *LHSAddrCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(LHSAlloca, Arg0Type);
    Value *RHSAddrCast =
        Builder.CreatePointerBitCastOrAddrSpaceCast(RHSAlloca, Arg1Type);
    Builder.CreateStore(Arg0, LHSAddrCast);
    Builder.CreateStore(Arg1, RHSAddrCast);
    LHSArrayPtr = Builder.CreateLoad(Arg0Type, LHSAddrCast);
    RHSArrayPtr = Builder.CreateLoad(Arg1Type, RHSAddrCast);
  } else {
    // On the host the arguments are usable directly.
    LHSArrayPtr = ReductionFunc->getArg(0);
    RHSArrayPtr = ReductionFunc->getArg(1);
  }

  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);

  for (auto En : enumerate(ReductionInfos)) {
    const OpenMPIRBuilder::ReductionInfo &RI = En.value();
    // Recover typed LHS/RHS element pointers from the i-th slot of each
    // type-erased array, then load the values to reduce.
    Value *LHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, LHSArrayPtr, 0, En.index());
    Value *LHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), LHSI8PtrPtr);
    Value *LHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        LHSI8Ptr, RI.Variable->getType());
    Value *LHS = Builder.CreateLoad(RI.ElementType, LHSPtr);
    Value *RHSI8PtrPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RHSArrayPtr, 0, En.index());
    Value *RHSI8Ptr = Builder.CreateLoad(Builder.getPtrTy(), RHSI8PtrPtr);
    Value *RHSPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
        RHSI8Ptr, RI.PrivateVariable->getType());
    Value *RHS = Builder.CreateLoad(RI.ElementType, RHSPtr);
    Value *Reduced;
    // NOTE(review): the declaration of AfterIP appears to be missing from
    // this copy -- verify against upstream.
        RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();

    Builder.restoreIP(*AfterIP);
    // TODO: Consider flagging an error.
    if (!Builder.GetInsertBlock())
      return Error::success();

    // store is inside of the reduction region when using by-ref
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, LHSPtr);
  }
  Builder.CreateRetVoid();
  return Error::success();
}
4608
/// Lower reductions on the host: store the private values into a
/// type-erased array, call __kmpc_reduce{_nowait}, and dispatch on its
/// result to either the non-atomic element-wise reduction (case 1) or the
/// atomic reduction (case 2), rejoining at a continuation block. GPU
/// configurations are forwarded to createReductionsGPU.
// NOTE(review): the first line of this definition's signature appears to be
// missing from this copy -- verify against upstream.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
    bool IsNoWait, bool IsTeamsReduction) {
  assert(ReductionInfos.size() == IsByRef.size());
  if (Config.isGPU())
    return createReductionsGPU(Loc, AllocaIP, Builder.saveIP(), ReductionInfos,
                               IsByRef, IsNoWait, IsTeamsReduction);

  checkReductionInfos(ReductionInfos, /*IsGPU*/ false);

  if (!updateToLocation(Loc))
    return InsertPointTy();

  // Nothing to reduce.
  if (ReductionInfos.size() == 0)
    return Builder.saveIP();

  // Split off the code following the insert point; both reduction branches
  // rejoin at this continuation block.
  BasicBlock *InsertBlock = Loc.IP.getBlock();
  BasicBlock *ContinuationBlock =
      InsertBlock->splitBasicBlock(Loc.IP.getPoint(), "reduce.finalize");
  InsertBlock->getTerminator()->eraseFromParent();

  // Create and populate array of type-erased pointers to private reduction
  // values.
  unsigned NumReductions = ReductionInfos.size();
  Type *RedArrayTy = ArrayType::get(Builder.getPtrTy(), NumReductions);
  Builder.SetInsertPoint(AllocaIP.getBlock()->getTerminator());
  Value *RedArray = Builder.CreateAlloca(RedArrayTy, nullptr, "red.array");

  Builder.SetInsertPoint(InsertBlock, InsertBlock->end());

  for (auto En : enumerate(ReductionInfos)) {
    unsigned Index = En.index();
    const ReductionInfo &RI = En.value();
    Value *RedArrayElemPtr = Builder.CreateConstInBoundsGEP2_64(
        RedArrayTy, RedArray, 0, Index, "red.array.elem." + Twine(Index));
    Builder.CreateStore(RI.PrivateVariable, RedArrayElemPtr);
  }

  // Emit a call to the runtime function that orchestrates the reduction.
  // Declare the reduction function in the process.
  Type *IndexTy = Builder.getIndexTy(
      M.getDataLayout(), M.getDataLayout().getDefaultGlobalsAddressSpace());
  Function *Func = Builder.GetInsertBlock()->getParent();
  Module *Module = Func->getParent();
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  // Atomic dispatch is only offered if every reduction has an atomic
  // generator.
  bool CanGenerateAtomic = all_of(ReductionInfos, [](const ReductionInfo &RI) {
    return RI.AtomicReductionGen;
  });
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize,
                                  CanGenerateAtomic
                                      ? IdentFlag::OMP_IDENT_FLAG_ATOMIC_REDUCE
                                      : IdentFlag(0));
  Value *ThreadId = getOrCreateThreadID(Ident);
  Constant *NumVariables = Builder.getInt32(NumReductions);
  const DataLayout &DL = Module->getDataLayout();
  unsigned RedArrayByteSize = DL.getTypeStoreSize(RedArrayTy);
  Constant *RedArraySize = ConstantInt::get(IndexTy, RedArrayByteSize);
  Function *ReductionFunc = getFreshReductionFunc(*Module);
  Value *Lock = getOMPCriticalRegionLock(".reduction");
  // NOTE(review): the getOrCreateRuntimeFunctionPtr(...) line defining
  // ReduceFunc appears to be missing from this copy.
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_reduce);
  CallInst *ReduceCall =
      createRuntimeFunctionCall(ReduceFunc,
                                {Ident, ThreadId, NumVariables, RedArraySize,
                                 RedArray, ReductionFunc, Lock},
                                "reduce");

  // Create final reduction entry blocks for the atomic and non-atomic case.
  // Emit IR that dispatches control flow to one of the blocks based on the
  // reduction supporting the atomic mode.
  BasicBlock *NonAtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.nonatomic", Func);
  BasicBlock *AtomicRedBlock =
      BasicBlock::Create(Module->getContext(), "reduce.switch.atomic", Func);
  SwitchInst *Switch =
      Builder.CreateSwitch(ReduceCall, ContinuationBlock, /* NumCases */ 2);
  Switch->addCase(Builder.getInt32(1), NonAtomicRedBlock);
  Switch->addCase(Builder.getInt32(2), AtomicRedBlock);

  // Populate the non-atomic reduction using the elementwise reduction function.
  // This loads the elements from the global and private variables and reduces
  // them before storing back the result to the global variable.
  Builder.SetInsertPoint(NonAtomicRedBlock);
  for (auto En : enumerate(ReductionInfos)) {
    const ReductionInfo &RI = En.value();
    // NOTE(review): a line appears to be missing here in this copy
    // (ValueType is used below but not defined) -- verify against upstream.
    // We have one less load for by-ref case because that load is now inside of
    // the reduction region
    Value *RedValue = RI.Variable;
    if (!IsByRef[En.index()]) {
      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
                                    "red.value." + Twine(En.index()));
    }
    Value *PrivateRedValue =
        Builder.CreateLoad(ValueType, RI.PrivateVariable,
                           "red.private.value." + Twine(En.index()));
    Value *Reduced;
    InsertPointOrErrorTy AfterIP =
        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced);
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);

    // The generator may clear the insert point to signal early exit.
    if (!Builder.GetInsertBlock())
      return InsertPointTy();
    // for by-ref case, the load is inside of the reduction region
    if (!IsByRef[En.index()])
      Builder.CreateStore(Reduced, RI.Variable);
  }
  Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
      IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
               : RuntimeFunction::OMPRTL___kmpc_end_reduce);
  createRuntimeFunctionCall(EndReduceFunc, {Ident, ThreadId, Lock});
  Builder.CreateBr(ContinuationBlock);

  // Populate the atomic reduction using the atomic elementwise reduction
  // function. There are no loads/stores here because they will be happening
  // inside the atomic elementwise reduction.
  Builder.SetInsertPoint(AtomicRedBlock);
  if (CanGenerateAtomic && llvm::none_of(IsByRef, [](bool P) { return P; })) {
    for (const ReductionInfo &RI : ReductionInfos) {
      // NOTE(review): the AtomicReductionGen call head (and AfterIP
      // declaration) appears to be missing from this copy.
          Builder.saveIP(), RI.ElementType, RI.Variable, RI.PrivateVariable);
      if (!AfterIP)
        return AfterIP.takeError();
      Builder.restoreIP(*AfterIP);
      if (!Builder.GetInsertBlock())
        return InsertPointTy();
    }
    Builder.CreateBr(ContinuationBlock);
  } else {
    // No atomic lowering is possible; the runtime should never select this
    // case then.
    Builder.CreateUnreachable();
  }

  // Populate the outlined reduction function using the elementwise reduction
  // function. Partial values are extracted from the type-erased array of
  // pointers to private variables.
  Error Err = populateReductionFunction(ReductionFunc, ReductionInfos, Builder,
                                        IsByRef, /*isGPU=*/false);
  if (Err)
    return Err;

  if (!Builder.GetInsertBlock())
    return InsertPointTy();

  Builder.SetInsertPoint(ContinuationBlock);
  return Builder.saveIP();
}
4760
// Emit an OpenMP `master` region: body runs only on the thread for which
// __kmpc_master returns non-zero, closed by __kmpc_end_master.
// NOTE(review): the first line of this definition's signature appears to be
// missing from this copy -- verify against upstream.
                                      BodyGenCallbackTy BodyGenCB,
                                      FinalizeCallbackTy FiniCB) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_master;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  Value *Args[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);

  // Conditional: only the master thread enters the body.
  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
4784
// Emit an OpenMP `masked` region: body runs only on the thread selected by
// `Filter` (via __kmpc_masked), closed by __kmpc_end_masked.
// NOTE(review): the first line of this definition's signature appears to be
// missing from this copy -- verify against upstream.
                                      BodyGenCallbackTy BodyGenCB,
                                      FinalizeCallbackTy FiniCB, Value *Filter) {
  if (!updateToLocation(Loc))
    return Loc.IP;

  Directive OMPD = Directive::OMPD_masked;
  uint32_t SrcLocStrSize;
  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
  Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
  Value *ThreadId = getOrCreateThreadID(Ident);
  // The entry call takes the filter thread id; the exit call does not.
  Value *Args[] = {Ident, ThreadId, Filter};
  Value *ArgsEnd[] = {Ident, ThreadId};

  Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_masked);
  Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);

  Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_masked);
  Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, ArgsEnd);

  // Conditional: only the selected thread enters the body.
  return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
                              /*Conditional*/ true, /*hasFinalize*/ true);
}
4809
// Emit a call to `Callee` and mark the call nounwind.
// NOTE(review): the opening lines of this signature (function name and the
// Args parameter) appear to be missing from this copy -- verify against
// upstream.
                                            llvm::FunctionCallee Callee,
                                            const llvm::Twine &Name) {
  llvm::CallInst *Call = Builder.CreateCall(
      Callee, Args, SmallVector<llvm::OperandBundleDef, 1>(), Name);
  // The callees used here are runtime/intrinsic routines that do not throw.
  Call->setDoesNotThrow();
  return Call;
}
4819
// Expects the input basic block to be dominated by BeforeScanBB.
// Once a scan directive is encountered, the code after it should be
// dominated by AfterScanBB. The scan directive splits the code sequence
// into an input phase and a scan phase. Based on whether the inclusive or
// exclusive clause is used on the scan directive, and on whether the input
// loop or the scan loop is being lowered, jumps to the input and scan
// phases are added. The first scan loop is the input loop and the second
// is the scan loop. The generated code currently handles only inclusive
// scans.
// NOTE(review): the first line of this definition's signature appears to be
// missing from this copy -- verify against upstream.
    const LocationDescription &Loc, InsertPointTy AllocaIP,
    ArrayRef<llvm::Value *> ScanVars, ArrayRef<llvm::Type *> ScanVarsType,
    bool IsInclusive, ScanInfo *ScanRedInfo) {
  // On the first (input) scan loop, allocate the shared scan buffers once.
  if (ScanRedInfo->OMPFirstScanLoop) {
    llvm::Error Err = emitScanBasedDirectiveDeclsIR(AllocaIP, ScanVars,
                                                    ScanVarsType, ScanRedInfo);
    if (Err)
      return Err;
  }
  if (!updateToLocation(Loc))
    return Loc.IP;

  llvm::Value *IV = ScanRedInfo->IV;

  if (ScanRedInfo->OMPFirstScanLoop) {
    // Emit buffer[i] = red; at the end of the input phase.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *Val = Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, ScanVars[i]);

      Builder.CreateStore(Src, Val);
    }
  }
  Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
  emitBlock(ScanRedInfo->OMPScanDispatch,
            Builder.GetInsertBlock()->getParent());

  if (!ScanRedInfo->OMPFirstScanLoop) {
    IV = ScanRedInfo->IV;
    // Emit red = buffer[i]; at the entrance to the scan phase.
    // TODO: if exclusive scan, the red = buffer[i-1] needs to be updated.
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
      Type *DestTy = ScanVarsType[i];
      Value *SrcPtr =
          Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
      Value *Src = Builder.CreateLoad(DestTy, SrcPtr);
      Builder.CreateStore(Src, ScanVars[i]);
    }
  }

  // TODO: Update it to CreateBr and remove dead blocks
  // The branch condition is constant true; which successor comes first
  // depends on the loop being lowered and the inclusive/exclusive clause.
  llvm::Value *CmpI = Builder.getInt1(true);
  if (ScanRedInfo->OMPFirstScanLoop == IsInclusive) {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPBeforeScanBlock,
                         ScanRedInfo->OMPAfterScanBlock);
  } else {
    Builder.CreateCondBr(CmpI, ScanRedInfo->OMPAfterScanBlock,
                         ScanRedInfo->OMPBeforeScanBlock);
  }
  emitBlock(ScanRedInfo->OMPAfterScanBlock,
            Builder.GetInsertBlock()->getParent());
  Builder.SetInsertPoint(ScanRedInfo->OMPAfterScanBlock);
  return Builder.saveIP();
}
4888
/// Allocate the scan buffers: one alloca'd pointer slot per scan variable
/// (at AllocaIP), then -- on the filtered thread only, inside a masked
/// region followed by a barrier -- a malloc'd buffer of Span+1 elements
/// whose address is stored into that slot.
Error OpenMPIRBuilder::emitScanBasedDirectiveDeclsIR(
    InsertPointTy AllocaIP, ArrayRef<Value *> ScanVars,
    ArrayRef<Type *> ScanVarsType, ScanInfo *ScanRedInfo) {

  Builder.restoreIP(AllocaIP);
  // Create the shared pointer at alloca IP.
  for (size_t i = 0; i < ScanVars.size(); i++) {
    llvm::Value *BuffPtr =
        Builder.CreateAlloca(Builder.getPtrTy(), nullptr, "vla");
    (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]] = BuffPtr;
  }

  // Allocate temporary buffer by master thread
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    // Each buffer holds Span + 1 elements.
    Value *AllocSpan =
        Builder.CreateAdd(ScanRedInfo->Span, Builder.getInt32(1));
    for (size_t i = 0; i < ScanVars.size(); i++) {
      Type *IntPtrTy = Builder.getInt32Ty();
      Constant *Allocsize = ConstantExpr::getSizeOf(ScanVarsType[i]);
      Allocsize = ConstantExpr::getTruncOrBitCast(Allocsize, IntPtrTy);
      Value *Buff = Builder.CreateMalloc(IntPtrTy, ScanVarsType[i], Allocsize,
                                         AllocSpan, nullptr, "arr");
      Builder.CreateStore(Buff, (*(ScanRedInfo->ScanBuffPtrs))[ScanVars[i]]);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  Builder.SetInsertPoint(ScanRedInfo->OMPScanInit->getTerminator());
  // Filter value 0: only that thread runs the allocation body.
  llvm::Value *FilterVal = Builder.getInt32(0);
  // NOTE(review): the declaration of AfterIP appears to be missing from this
  // copy -- verify against upstream.
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  // All threads synchronize after the buffers are published.
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);

  return Error::success();
}
4939
/// Emit the scan finalization: inside a masked region, copy buffer[Span]
/// of each scan buffer back into the original reduction variable and free
/// the buffer, then emit a barrier.
Error OpenMPIRBuilder::emitScanBasedDirectiveFinalsIR(
    ArrayRef<ReductionInfo> ReductionInfos, ScanInfo *ScanRedInfo) {
  auto BodyGenCB = [&](InsertPointTy AllocaIP,
                       InsertPointTy CodeGenIP) -> Error {
    Builder.restoreIP(CodeGenIP);
    for (ReductionInfo RedInfo : ReductionInfos) {
      Value *PrivateVar = RedInfo.PrivateVariable;
      Value *OrigVar = RedInfo.Variable;
      Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[PrivateVar];
      Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);

      Type *SrcTy = RedInfo.ElementType;
      // Copy the element at index Span back to the original variable,
      // then release the temporary buffer.
      Value *Val = Builder.CreateInBoundsGEP(SrcTy, Buff, ScanRedInfo->Span,
                                             "arrayOffset");
      Value *Src = Builder.CreateLoad(SrcTy, Val);

      Builder.CreateStore(Src, OrigVar);
      Builder.CreateFree(Buff);
    }
    return Error::success();
  };
  // TODO: Perform finalization actions for variables. This has to be
  // called for variables which have destructors/finalizers.
  auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };

  // Insert before the terminator of OMPScanFinish if it has one.
  if (Instruction *TI = ScanRedInfo->OMPScanFinish->getTerminatorOrNull())
    Builder.SetInsertPoint(TI);
  else
    Builder.SetInsertPoint(ScanRedInfo->OMPScanFinish);

  // Filter value 0: only that thread runs the copy-out body.
  llvm::Value *FilterVal = Builder.getInt32(0);
  // NOTE(review): the declaration of AfterIP appears to be missing from this
  // copy -- verify against upstream.
      createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);

  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  BasicBlock *InputBB = Builder.GetInsertBlock();
  if (InputBB->hasTerminator())
    Builder.SetInsertPoint(Builder.GetInsertBlock()->getTerminator());
  AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
  if (!AfterIP)
    return AfterIP.takeError();
  Builder.restoreIP(*AfterIP);
  return Error::success();
}
4986
// NOTE(review): the HTML/doxygen scrape dropped this definition's first lines
// (original 4987/4989) — the return type, function name, and the
// ArrayRef<ReductionInfo> ReductionInfos parameter (used below). Presumably
// this is OpenMPIRBuilder::emitScanReduction; confirm against the repository.
//
// Emits, inside a masked region executed by a single thread (filter value 0),
// an in-place inclusive prefix scan over the per-reduction scan buffers, using
// ceil(log2(Span)) passes; pass k combines tmp[i] op= tmp[i - 2^k]. A barrier
// follows the masked region, then the buffered finals are copied back.
4988 const LocationDescription &Loc,
4990 ScanInfo *ScanRedInfo) {
4991
4992 if (!updateToLocation(Loc))
4993 return Loc.IP;
4994 auto BodyGenCB = [&](InsertPointTy AllocaIP,
4995 InsertPointTy CodeGenIP) -> Error {
4996 Builder.restoreIP(CodeGenIP);
4997 Function *CurFn = Builder.GetInsertBlock()->getParent();
4998 // for (int k = 0; k <= ceil(log2(n)); ++k)
4999 llvm::BasicBlock *LoopBB =
5000 BasicBlock::Create(CurFn->getContext(), "omp.outer.log.scan.body");
5001 llvm::BasicBlock *ExitBB =
5002 splitBB(Builder, false, "omp.outer.log.scan.exit");
// NOTE(review): a line is missing here (original 5003) — presumably
// `Function *F = Intrinsic::getOrInsertDeclaration(` for llvm.log2 below.
5004 Builder.GetInsertBlock()->getModule(),
5005 (llvm::Intrinsic::ID)llvm::Intrinsic::log2, Builder.getDoubleTy());
5006 llvm::BasicBlock *InputBB = Builder.GetInsertBlock();
// Number of passes = ceil(log2(Span)), computed in double precision and
// truncated back to i32.
5007 llvm::Value *Arg =
5008 Builder.CreateUIToFP(ScanRedInfo->Span, Builder.getDoubleTy());
5009 llvm::Value *LogVal = emitNoUnwindRuntimeCall(Builder, F, Arg, "");
// NOTE(review): a line is missing here (original 5010) — presumably the
// matching `F = Intrinsic::getOrInsertDeclaration(` for llvm.ceil below.
5011 Builder.GetInsertBlock()->getModule(),
5012 (llvm::Intrinsic::ID)llvm::Intrinsic::ceil, Builder.getDoubleTy());
5013 LogVal = emitNoUnwindRuntimeCall(Builder, F, LogVal, "");
5014 LogVal = Builder.CreateFPToUI(LogVal, Builder.getInt32Ty());
5015 llvm::Value *NMin1 = Builder.CreateNUWSub(
5016 ScanRedInfo->Span,
5017 llvm::ConstantInt::get(ScanRedInfo->Span->getType(), 1));
5018 Builder.SetInsertPoint(InputBB);
5019 Builder.CreateBr(LoopBB);
5020 emitBlock(LoopBB, CurFn);
5021 Builder.SetInsertPoint(LoopBB);
5022
// Outer-loop PHIs: Counter is the pass index k, Pow2K carries 2^k.
5023 PHINode *Counter = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5024 // size pow2k = 1;
5025 PHINode *Pow2K = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5026 Counter->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 0),
5027 InputBB);
5028 Pow2K->addIncoming(llvm::ConstantInt::get(Builder.getInt32Ty(), 1),
5029 InputBB);
5030 // for (size i = n - 1; i >= 2 ^ k; --i)
5031 // tmp[i] op= tmp[i-pow2k];
5032 llvm::BasicBlock *InnerLoopBB =
5033 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.body");
5034 llvm::BasicBlock *InnerExitBB =
5035 BasicBlock::Create(CurFn->getContext(), "omp.inner.log.scan.exit");
5036 llvm::Value *CmpI = Builder.CreateICmpUGE(NMin1, Pow2K);
5037 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5038 emitBlock(InnerLoopBB, CurFn);
5039 Builder.SetInsertPoint(InnerLoopBB);
5040 PHINode *IVal = Builder.CreatePHI(Builder.getInt32Ty(), 2);
5041 IVal->addIncoming(NMin1, LoopBB);
// Inner-loop body: combine buffer element i with element i - 2^k for every
// reduction, writing the result back to element i.
5042 for (ReductionInfo RedInfo : ReductionInfos) {
5043 Value *ReductionVal = RedInfo.PrivateVariable;
5044 Value *BuffPtr = (*(ScanRedInfo->ScanBuffPtrs))[ReductionVal];
5045 Value *Buff = Builder.CreateLoad(Builder.getPtrTy(), BuffPtr);
5046 Type *DestTy = RedInfo.ElementType;
// Indices into the buffer are shifted by one — element 0 appears reserved;
// TODO(review): confirm against the buffer allocation code.
5047 Value *IV = Builder.CreateAdd(IVal, Builder.getInt32(1));
5048 Value *LHSPtr =
5049 Builder.CreateInBoundsGEP(DestTy, Buff, IV, "arrayOffset");
5050 Value *OffsetIval = Builder.CreateNUWSub(IV, Pow2K);
5051 Value *RHSPtr =
5052 Builder.CreateInBoundsGEP(DestTy, Buff, OffsetIval, "arrayOffset");
5053 Value *LHS = Builder.CreateLoad(DestTy, LHSPtr);
5054 Value *RHS = Builder.CreateLoad(DestTy, RHSPtr);
5055 llvm::Value *Result;
5056 InsertPointOrErrorTy AfterIP =
5057 RedInfo.ReductionGen(Builder.saveIP(), LHS, RHS, Result);
5058 if (!AfterIP)
5059 return AfterIP.takeError();
5060 Builder.CreateStore(Result, LHSPtr);
5061 }
5062 llvm::Value *NextIVal = Builder.CreateNUWSub(
5063 IVal, llvm::ConstantInt::get(Builder.getInt32Ty(), 1));
5064 IVal->addIncoming(NextIVal, Builder.GetInsertBlock());
5065 CmpI = Builder.CreateICmpUGE(NextIVal, Pow2K);
5066 Builder.CreateCondBr(CmpI, InnerLoopBB, InnerExitBB);
5067 emitBlock(InnerExitBB, CurFn);
5068 llvm::Value *Next = Builder.CreateNUWAdd(
5069 Counter, llvm::ConstantInt::get(Counter->getType(), 1));
5070 Counter->addIncoming(Next, Builder.GetInsertBlock());
5071 // pow2k <<= 1;
5072 llvm::Value *NextPow2K = Builder.CreateShl(Pow2K, 1, "", /*HasNUW=*/true);
5073 Pow2K->addIncoming(NextPow2K, Builder.GetInsertBlock());
5074 llvm::Value *Cmp = Builder.CreateICmpNE(Next, LogVal);
5075 Builder.CreateCondBr(Cmp, LoopBB, ExitBB);
5076 Builder.SetInsertPoint(ExitBB->getFirstInsertionPt());
5077 return Error::success();
5078 };
5079
5080 // TODO: Perform finalization actions for variables. This has to be
5081 // called for variables which have destructors/finalizers.
5082 auto FiniCB = [&](InsertPointTy CodeGenIP) { return llvm::Error::success(); };
5083
// Filter value 0 restricts the masked region to thread 0 of the team.
5084 llvm::Value *FilterVal = Builder.getInt32(0);
// NOTE(review): a line is missing here (original 5085) — presumably
// `InsertPointOrErrorTy AfterIP =` receiving the createMasked result.
5086 createMasked(Builder.saveIP(), BodyGenCB, FiniCB, FilterVal);
5087
5088 if (!AfterIP)
5089 return AfterIP.takeError();
5090 Builder.restoreIP(*AfterIP);
// All threads wait until the single-threaded scan has completed.
5091 AfterIP = createBarrier(Builder.saveIP(), llvm::omp::OMPD_barrier);
5092
5093 if (!AfterIP)
5094 return AfterIP.takeError();
5095 Builder.restoreIP(*AfterIP);
5096 Error Err = emitScanBasedDirectiveFinalsIR(ReductionInfos, ScanRedInfo);
5097 if (Err)
5098 return Err;
5099
5100 return AfterIP;
5101}
5102
5103Error OpenMPIRBuilder::emitScanBasedDirectiveIR(
5104 llvm::function_ref<Error()> InputLoopGen,
5105 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
5106 ScanInfo *ScanRedInfo) {
5107
5108 {
5109 // Emit loop with input phase:
5110 // for (i: 0..<num_iters>) {
5111 // <input phase>;
5112 // buffer[i] = red;
5113 // }
5114 ScanRedInfo->OMPFirstScanLoop = true;
5115 Error Err = InputLoopGen();
5116 if (Err)
5117 return Err;
5118 }
5119 {
5120 // Emit loop with scan phase:
5121 // for (i: 0..<num_iters>) {
5122 // red = buffer[i];
5123 // <scan phase>;
5124 // }
5125 ScanRedInfo->OMPFirstScanLoop = false;
5126 Error Err = ScanLoopGen(Builder.saveIP());
5127 if (Err)
5128 return Err;
5129 }
5130 return Error::success();
5131}
5132
5133void OpenMPIRBuilder::createScanBBs(ScanInfo *ScanRedInfo) {
5134 Function *Fun = Builder.GetInsertBlock()->getParent();
5135 ScanRedInfo->OMPScanDispatch =
5136 BasicBlock::Create(Fun->getContext(), "omp.inscan.dispatch");
5137 ScanRedInfo->OMPAfterScanBlock =
5138 BasicBlock::Create(Fun->getContext(), "omp.after.scan.bb");
5139 ScanRedInfo->OMPBeforeScanBlock =
5140 BasicBlock::Create(Fun->getContext(), "omp.before.scan.bb");
5141 ScanRedInfo->OMPScanLoopExit =
5142 BasicBlock::Create(Fun->getContext(), "omp.scan.loop.exit");
5143}
5145 DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore,
5146 BasicBlock *PostInsertBefore, const Twine &Name) {
5147 Module *M = F->getParent();
5148 LLVMContext &Ctx = M->getContext();
5149 Type *IndVarTy = TripCount->getType();
5150
5151 // Create the basic block structure.
5152 BasicBlock *Preheader =
5153 BasicBlock::Create(Ctx, "omp_" + Name + ".preheader", F, PreInsertBefore);
5154 BasicBlock *Header =
5155 BasicBlock::Create(Ctx, "omp_" + Name + ".header", F, PreInsertBefore);
5156 BasicBlock *Cond =
5157 BasicBlock::Create(Ctx, "omp_" + Name + ".cond", F, PreInsertBefore);
5158 BasicBlock *Body =
5159 BasicBlock::Create(Ctx, "omp_" + Name + ".body", F, PreInsertBefore);
5160 BasicBlock *Latch =
5161 BasicBlock::Create(Ctx, "omp_" + Name + ".inc", F, PostInsertBefore);
5162 BasicBlock *Exit =
5163 BasicBlock::Create(Ctx, "omp_" + Name + ".exit", F, PostInsertBefore);
5164 BasicBlock *After =
5165 BasicBlock::Create(Ctx, "omp_" + Name + ".after", F, PostInsertBefore);
5166
5167 // Use specified DebugLoc for new instructions.
5168 Builder.SetCurrentDebugLocation(DL);
5169
5170 Builder.SetInsertPoint(Preheader);
5171 Builder.CreateBr(Header);
5172
5173 Builder.SetInsertPoint(Header);
5174 PHINode *IndVarPHI = Builder.CreatePHI(IndVarTy, 2, "omp_" + Name + ".iv");
5175 IndVarPHI->addIncoming(ConstantInt::get(IndVarTy, 0), Preheader);
5176 Builder.CreateBr(Cond);
5177
5178 Builder.SetInsertPoint(Cond);
5179 Value *Cmp =
5180 Builder.CreateICmpULT(IndVarPHI, TripCount, "omp_" + Name + ".cmp");
5181 Builder.CreateCondBr(Cmp, Body, Exit);
5182
5183 Builder.SetInsertPoint(Body);
5184 Builder.CreateBr(Latch);
5185
5186 Builder.SetInsertPoint(Latch);
5187 Value *Next = Builder.CreateAdd(IndVarPHI, ConstantInt::get(IndVarTy, 1),
5188 "omp_" + Name + ".next", /*HasNUW=*/true);
5189 Builder.CreateBr(Header);
5190 IndVarPHI->addIncoming(Next, Latch);
5191
5192 Builder.SetInsertPoint(Exit);
5193 Builder.CreateBr(After);
5194
5195 // Remember and return the canonical control flow.
5196 LoopInfos.emplace_front();
5197 CanonicalLoopInfo *CL = &LoopInfos.front();
5198
5199 CL->Header = Header;
5200 CL->Cond = Cond;
5201 CL->Latch = Latch;
5202 CL->Exit = Exit;
5203
5204#ifndef NDEBUG
5205 CL->assertOK();
5206#endif
5207 return CL;
5208}
5209
5212 LoopBodyGenCallbackTy BodyGenCB,
5213 Value *TripCount, const Twine &Name) {
5214 BasicBlock *BB = Loc.IP.getBlock();
5215 BasicBlock *NextBB = BB->getNextNode();
5216
5217 CanonicalLoopInfo *CL = createLoopSkeleton(Loc.DL, TripCount, BB->getParent(),
5218 NextBB, NextBB, Name);
5219 BasicBlock *After = CL->getAfter();
5220
5221 // If location is not set, don't connect the loop.
5222 if (updateToLocation(Loc)) {
5223 // Split the loop at the insertion point: Branch to the preheader and move
5224 // every following instruction to after the loop (the After BB). Also, the
5225 // new successor is the loop's after block.
5226 spliceBB(Builder, After, /*CreateBranch=*/false);
5227 Builder.CreateBr(CL->getPreheader());
5228 }
5229
5230 // Emit the body content. We do it after connecting the loop to the CFG to
5231 // avoid that the callback encounters degenerate BBs.
5232 if (Error Err = BodyGenCB(CL->getBodyIP(), CL->getIndVar()))
5233 return Err;
5234
5235#ifndef NDEBUG
5236 CL->assertOK();
5237#endif
5238 return CL;
5239}
5240
// NOTE(review): the scrape dropped this helper's signature (original line
// 5241); the body allocates a fresh ScanInfo node owned by the builder and
// returns its address. Presumably a node-based container, so the pointer
// stays stable — verify against the ScanInfos member declaration.
5242 ScanInfos.emplace_front();
5243 ScanInfo *Result = &ScanInfos.front();
5244 return Result;
5245}
5246
// NOTE(review): the scrape dropped this definition's first lines (original
// 5247-5249) — the return type (a container of CanonicalLoopInfo*, filled as
// `Result` below), the function name, and the Loc/BodyGenCB parameters.
// Confirm against the repository before relying on the exact signature.
5250 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5251 InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo) {
5252 LocationDescription ComputeLoc =
5253 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5254 updateToLocation(ComputeLoc);
5255
// NOTE(review): lines are missing here (original 5256/5258) — presumably the
// declaration of `Result` and `Value *TripCount =
// calculateCanonicalLoopTripCount(` continuing onto the next line.
5257
5259 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
// Record the loop span and split off an init block where the scan buffers
// can be set up before either loop runs.
5260 ScanRedInfo->Span = TripCount;
5261 ScanRedInfo->OMPScanInit = splitBB(Builder, true, "scan.init");
5262 Builder.SetInsertPoint(ScanRedInfo->OMPScanInit);
5263
// Shared body callback for both generated loops: reroutes each iteration
// through freshly created scan dispatch/before/exit blocks, then emits the
// user body starting in the before-scan block.
5264 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5265 Builder.restoreIP(CodeGenIP);
5266 ScanRedInfo->IV = IV;
5267 createScanBBs(ScanRedInfo);
5268 BasicBlock *InputBlock = Builder.GetInsertBlock();
5269 Instruction *Terminator = InputBlock->getTerminator();
5270 assert(Terminator->getNumSuccessors() == 1);
5271 BasicBlock *ContinueBlock = Terminator->getSuccessor(0);
5272 Terminator->setSuccessor(0, ScanRedInfo->OMPScanDispatch);
5273 emitBlock(ScanRedInfo->OMPBeforeScanBlock,
5274 Builder.GetInsertBlock()->getParent());
5275 Builder.CreateBr(ScanRedInfo->OMPScanLoopExit);
5276 emitBlock(ScanRedInfo->OMPScanLoopExit,
5277 Builder.GetInsertBlock()->getParent());
5278 Builder.CreateBr(ContinueBlock);
5279 Builder.SetInsertPoint(
5280 ScanRedInfo->OMPBeforeScanBlock->getFirstInsertionPt());
5281 return BodyGenCB(Builder.saveIP(), IV);
5282 };
5283
// Generators for the two loops of the scan lowering (input phase and scan
// phase); each appends its CanonicalLoopInfo to Result.
5284 const auto &&InputLoopGen = [&]() -> Error {
// NOTE(review): a line is missing here (original 5285) — presumably
// `Expected<CanonicalLoopInfo *> LoopInfo = createCanonicalLoop(`.
5286 Builder.saveIP(), BodyGen, Start, Stop, Step, IsSigned, InclusiveStop,
5287 ComputeIP, Name, true, ScanRedInfo);
5288 if (!LoopInfo)
5289 return LoopInfo.takeError();
5290 Result.push_back(*LoopInfo);
5291 Builder.restoreIP((*LoopInfo)->getAfterIP());
5292 return Error::success();
5293 };
5294 const auto &&ScanLoopGen = [&](LocationDescription Loc) -> Error {
// NOTE(review): a line is missing here (original 5295) — presumably
// `Expected<CanonicalLoopInfo *> LoopInfo =` for the call below.
5296 createCanonicalLoop(Loc, BodyGen, Start, Stop, Step, IsSigned,
5297 InclusiveStop, ComputeIP, Name, true, ScanRedInfo);
5298 if (!LoopInfo)
5299 return LoopInfo.takeError();
5300 Result.push_back(*LoopInfo);
5301 Builder.restoreIP((*LoopInfo)->getAfterIP());
// Remember where control resumes after the scan loop; the scan reduction
// finalization is emitted there later.
5302 ScanRedInfo->OMPScanFinish = Builder.GetInsertBlock();
5303 return Error::success();
5304 };
5305 Error Err = emitScanBasedDirectiveIR(InputLoopGen, ScanLoopGen, ScanRedInfo);
5306 if (Err)
5307 return Err;
5308 return Result;
5309}
5310
5312 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
5313 bool IsSigned, bool InclusiveStop, const Twine &Name) {
5314
5315 // Consider the following difficulties (assuming 8-bit signed integers):
5316 // * Adding \p Step to the loop counter which passes \p Stop may overflow:
5317 // DO I = 1, 100, 50
5318 /// * A \p Step of INT_MIN cannot not be normalized to a positive direction:
5319 // DO I = 100, 0, -128
5320
5321 // Start, Stop and Step must be of the same integer type.
5322 auto *IndVarTy = cast<IntegerType>(Start->getType());
5323 assert(IndVarTy == Stop->getType() && "Stop type mismatch");
5324 assert(IndVarTy == Step->getType() && "Step type mismatch");
5325
5327
5328 ConstantInt *Zero = ConstantInt::get(IndVarTy, 0);
5329 ConstantInt *One = ConstantInt::get(IndVarTy, 1);
5330
5331 // Like Step, but always positive.
5332 Value *Incr = Step;
5333
5334 // Distance between Start and Stop; always positive.
5335 Value *Span;
5336
5337 // Condition whether there are no iterations are executed at all, e.g. because
5338 // UB < LB.
5339 Value *ZeroCmp;
5340
5341 if (IsSigned) {
5342 // Ensure that increment is positive. If not, negate and invert LB and UB.
5343 Value *IsNeg = Builder.CreateICmpSLT(Step, Zero);
5344 Incr = Builder.CreateSelect(IsNeg, Builder.CreateNeg(Step), Step);
5345 Value *LB = Builder.CreateSelect(IsNeg, Stop, Start);
5346 Value *UB = Builder.CreateSelect(IsNeg, Start, Stop);
5347 Span = Builder.CreateSub(UB, LB, "", false, true);
5348 ZeroCmp = Builder.CreateICmp(
5349 InclusiveStop ? CmpInst::ICMP_SLT : CmpInst::ICMP_SLE, UB, LB);
5350 } else {
5351 Span = Builder.CreateSub(Stop, Start, "", true);
5352 ZeroCmp = Builder.CreateICmp(
5353 InclusiveStop ? CmpInst::ICMP_ULT : CmpInst::ICMP_ULE, Stop, Start);
5354 }
5355
5356 Value *CountIfLooping;
5357 if (InclusiveStop) {
5358 CountIfLooping = Builder.CreateAdd(Builder.CreateUDiv(Span, Incr), One);
5359 } else {
5360 // Avoid incrementing past stop since it could overflow.
5361 Value *CountIfTwo = Builder.CreateAdd(
5362 Builder.CreateUDiv(Builder.CreateSub(Span, One), Incr), One);
5363 Value *OneCmp = Builder.CreateICmp(CmpInst::ICMP_ULE, Span, Incr);
5364 CountIfLooping = Builder.CreateSelect(OneCmp, One, CountIfTwo);
5365 }
5366
5367 return Builder.CreateSelect(ZeroCmp, Zero, CountIfLooping,
5368 "omp_" + Name + ".tripcount");
5369}
5370
5373 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
5374 InsertPointTy ComputeIP, const Twine &Name, bool InScan,
5375 ScanInfo *ScanRedInfo) {
5376 LocationDescription ComputeLoc =
5377 ComputeIP.isSet() ? LocationDescription(ComputeIP, Loc.DL) : Loc;
5378
5380 ComputeLoc, Start, Stop, Step, IsSigned, InclusiveStop, Name);
5381
5382 auto BodyGen = [=](InsertPointTy CodeGenIP, Value *IV) {
5383 Builder.restoreIP(CodeGenIP);
5384 Value *Span = Builder.CreateMul(IV, Step);
5385 Value *IndVar = Builder.CreateAdd(Span, Start);
5386 if (InScan)
5387 ScanRedInfo->IV = IndVar;
5388 return BodyGenCB(Builder.saveIP(), IndVar);
5389 };
5390 LocationDescription LoopLoc =
5391 ComputeIP.isSet()
5392 ? Loc
5393 : LocationDescription(Builder.saveIP(),
5394 Builder.getCurrentDebugLocation());
5395 return createCanonicalLoop(LoopLoc, BodyGen, TripCount, Name);
5396}
5397
5398// Returns an LLVM function to call for initializing loop bounds using OpenMP
5399// static scheduling for composite `distribute parallel for` depending on
5400// `type`. Only i32 and i64 are supported by the runtime. Always interpret
5401// integers as unsigned similarly to CanonicalLoopInfo.
5402static FunctionCallee
5404 OpenMPIRBuilder &OMPBuilder) {
5405 unsigned Bitwidth = Ty->getIntegerBitWidth();
5406 if (Bitwidth == 32)
5407 return OMPBuilder.getOrCreateRuntimeFunction(
5408 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_4u);
5409 if (Bitwidth == 64)
5410 return OMPBuilder.getOrCreateRuntimeFunction(
5411 M, omp::RuntimeFunction::OMPRTL___kmpc_dist_for_static_init_8u);
5412 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5413}
5414
5415// Returns an LLVM function to call for initializing loop bounds using OpenMP
5416// static scheduling depending on `type`. Only i32 and i64 are supported by the
5417// runtime. Always interpret integers as unsigned similarly to
5418// CanonicalLoopInfo.
5420 OpenMPIRBuilder &OMPBuilder) {
5421 unsigned Bitwidth = Ty->getIntegerBitWidth();
5422 if (Bitwidth == 32)
5423 return OMPBuilder.getOrCreateRuntimeFunction(
5424 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
5425 if (Bitwidth == 64)
5426 return OMPBuilder.getOrCreateRuntimeFunction(
5427 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
5428 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
5429}
5430
// Lowers a canonical loop to a statically-scheduled OpenMP workshare loop by
// bracketing it with __kmpc_(dist_)for_static_init / __kmpc_for_static_fini
// calls and rebasing the induction variable on the runtime-computed bounds.
5431OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::applyStaticWorkshareLoop(
5432 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5433 WorksharingLoopType LoopType, bool NeedsBarrier, bool HasDistSchedule,
5434 OMPScheduleType DistScheduleSchedType) {
5435 assert(CLI->isValid() && "Requires a valid canonical loop");
5436 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
5437 "Require dedicated allocate IP");
5438
5439 // Set up the source location value for OpenMP runtime.
5440 Builder.restoreIP(CLI->getPreheaderIP());
5441 Builder.SetCurrentDebugLocation(DL);
5442
5443 uint32_t SrcLocStrSize;
5444 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5445 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5446
5447 // Declare useful OpenMP runtime functions.
5448 Value *IV = CLI->getIndVar();
5449 Type *IVTy = IV->getType();
5450 FunctionCallee StaticInit =
5451 LoopType == WorksharingLoopType::DistributeForStaticLoop
5452 ? getKmpcDistForStaticInitForType(IVTy, M, *this)
5453 : getKmpcForStaticInitForType(IVTy, M, *this);
5454 FunctionCallee StaticFini =
5455 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5456
5457 // Allocate space for computed loop bounds as expected by the "init" function.
5458 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
5459
5460 Type *I32Type = Type::getInt32Ty(M.getContext());
5461 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5462 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
5463 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
5464 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
5465 CLI->setLastIter(PLastIter);
5466
5467 // At the end of the preheader, prepare for calling the "init" function by
5468 // storing the current loop bounds into the allocated space. A canonical loop
5469 // always iterates from 0 to trip-count with step 1. Note that "init" expects
5470 // and produces an inclusive upper bound.
5471 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5472 Constant *Zero = ConstantInt::get(IVTy, 0);
5473 Constant *One = ConstantInt::get(IVTy, 1);
5474 Builder.CreateStore(Zero, PLowerBound);
5475 Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
5476 Builder.CreateStore(UpperBound, PUpperBound);
5477 Builder.CreateStore(One, PStride);
5478
5479 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5480
5481 OMPScheduleType SchedType =
5482 (LoopType == WorksharingLoopType::DistributeStaticLoop)
5483 ? OMPScheduleType::OrderedDistribute
// NOTE(review): a line is missing here (original 5484) — the ':' alternative
// of the ternary, presumably an unordered/static schedule kind ending in ';'.
// Restore from upstream.
5485 Constant *SchedulingType =
5486 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5487
5488 // Call the "init" function and update the trip count of the loop with the
5489 // value it produced.
5490 auto BuildInitCall = [LoopType, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5491 PUpperBound, IVTy, PStride, One, Zero, StaticInit,
5492 this](Value *SchedulingType, auto &Builder) {
5493 SmallVector<Value *, 10> Args({SrcLoc, ThreadNum, SchedulingType, PLastIter,
5494 PLowerBound, PUpperBound});
// The dist variant of the init function takes an extra upper-bound slot.
5495 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5496 Value *PDistUpperBound =
5497 Builder.CreateAlloca(IVTy, nullptr, "p.distupperbound");
5498 Args.push_back(PDistUpperBound);
5499 }
5500 Args.append({PStride, One, Zero});
5501 createRuntimeFunctionCall(StaticInit, Args);
5502 };
5503 BuildInitCall(SchedulingType, Builder);
5504 if (HasDistSchedule &&
5505 LoopType != WorksharingLoopType::DistributeStaticLoop) {
// NOTE(review): the DistScheduleSchedType parameter appears unused in the
// visible body — the second init call below hard-codes OrderedDistribute.
// Verify against upstream whether that is intentional.
5506 Constant *DistScheduleSchedType = ConstantInt::get(
5507 I32Type, static_cast<int>(omp::OMPScheduleType::OrderedDistribute));
5508 // We want to emit a second init function call for the dist_schedule clause
5509 // to the Distribute construct. This should only be done however if a
5510 // Workshare Loop is nested within a Distribute Construct
5511 BuildInitCall(DistScheduleSchedType, Builder);
5512 }
// Rebuild the trip count from the (inclusive) bounds the runtime produced.
5513 Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
5514 Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
5515 Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
5516 Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
5517 CLI->setTripCount(TripCount);
5518
5519 // Update all uses of the induction variable except the one in the condition
5520 // block that compares it with the actual upper bound, and the increment in
5521 // the latch block.
5522
5523 CLI->mapIndVar([&](Instruction *OldIV) -> Value * {
5524 Builder.SetInsertPoint(CLI->getBody(),
5525 CLI->getBody()->getFirstInsertionPt());
5526 Builder.SetCurrentDebugLocation(DL);
5527 return Builder.CreateAdd(OldIV, LowerBound);
5528 });
5529
5530 // In the "exit" block, call the "fini" function.
5531 Builder.SetInsertPoint(CLI->getExit(),
5532 CLI->getExit()->getTerminator()->getIterator());
5533 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5534
5535 // Add the barrier if requested.
5536 if (NeedsBarrier) {
5537 InsertPointOrErrorTy BarrierIP =
// NOTE(review): a line is missing here (original 5538) — presumably
// `createBarrier(LocationDescription(Builder.saveIP(), DL),` matching the
// argument list that follows.
5539 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
5540 /* CheckCancelFlag */ false);
5541 if (!BarrierIP)
5542 return BarrierIP.takeError();
5543 }
5544
5545 InsertPointTy AfterIP = CLI->getAfterIP();
5546 CLI->invalidate();
5547
5548 return AfterIP;
5549}
5550
5551static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup,
5552 LoopInfo &LI);
5553static void addLoopMetadata(CanonicalLoopInfo *Loop,
5554 ArrayRef<Metadata *> Properties);
5555
5557 LLVMContext &Ctx, Loop *Loop,
5559 SmallVector<Metadata *> &LoopMDList) {
5560 SmallSet<BasicBlock *, 8> Reachable;
5561
5562 // Get the basic blocks from the loop in which memref instructions
5563 // can be found.
5564 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
5565 // preferably without running any passes.
5566 for (BasicBlock *Block : Loop->getBlocks()) {
5567 if (Block == CLI->getCond() || Block == CLI->getHeader())
5568 continue;
5569 Reachable.insert(Block);
5570 }
5571
5572 // Add access group metadata to memory-access instructions.
5573 MDNode *AccessGroup = MDNode::getDistinct(Ctx, {});
5574 for (BasicBlock *BB : Reachable)
5575 addAccessGroupMetadata(BB, AccessGroup, LoopInfo);
5576 // TODO: If the loop has existing parallel access metadata, have
5577 // to combine two lists.
5578 LoopMDList.push_back(MDNode::get(
5579 Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
5580}
5581
// NOTE(review): the scrape dropped the return-type line (original 5582),
// presumably `OpenMPIRBuilder::InsertPointOrErrorTy`, matching the
// InsertPointTy/takeError returns below.
//
// Lowers a canonical loop to a statically-chunked workshare loop: an outer
// "dispatch" loop enumerates chunks handed out by __kmpc_for_static_init,
// and the original loop becomes the inner per-chunk loop.
5583OpenMPIRBuilder::applyStaticChunkedWorkshareLoop(
5584 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5585 bool NeedsBarrier, Value *ChunkSize, OMPScheduleType SchedType,
5586 Value *DistScheduleChunkSize, OMPScheduleType DistScheduleSchedType) {
5587 assert(CLI->isValid() && "Requires a valid canonical loop");
5588 assert((ChunkSize || DistScheduleChunkSize) && "Chunk size is required");
5589
5590 LLVMContext &Ctx = CLI->getFunction()->getContext();
5591 Value *IV = CLI->getIndVar();
5592 Value *OrigTripCount = CLI->getTripCount();
5593 Type *IVTy = IV->getType();
5594 assert(IVTy->getIntegerBitWidth() <= 64 &&
5595 "Max supported tripcount bitwidth is 64 bits");
// All bound computations are done in a runtime-supported width (i32/i64).
5596 Type *InternalIVTy = IVTy->getIntegerBitWidth() <= 32 ? Type::getInt32Ty(Ctx)
5597 : Type::getInt64Ty(Ctx);
5598 Type *I32Type = Type::getInt32Ty(M.getContext());
5599 Constant *Zero = ConstantInt::get(InternalIVTy, 0);
5600 Constant *One = ConstantInt::get(InternalIVTy, 1);
5601
5602 Function *F = CLI->getFunction();
5603 // Blocks must have terminators.
5604 // FIXME: Don't run analyses on incomplete/invalid IR.
// NOTE(review): a line is missing here (original 5605) — presumably the
// declaration of the `UIs` vector of temporary terminators used below.
5606 for (BasicBlock &BB : *F)
5607 if (!BB.hasTerminator())
5608 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
// NOTE(review): a line is missing here (original 5609) — presumably
// `FunctionAnalysisManager FAM;` registered into below.
5610 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
5611 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
5612 LoopAnalysis LIA;
5613 LoopInfo &&LI = LIA.run(*F, FAM);
// Remove the temporary terminators again now that LoopInfo is computed.
5614 for (Instruction *I : UIs)
5615 I->eraseFromParent();
5616 Loop *L = LI.getLoopFor(CLI->getHeader());
5617 SmallVector<Metadata *> LoopMDList;
5618 if (ChunkSize || DistScheduleChunkSize)
5619 applyParallelAccessesMetadata(CLI, Ctx, L, LI, LoopMDList);
5620 addLoopMetadata(CLI, LoopMDList);
5621
5622 // Declare useful OpenMP runtime functions.
5623 FunctionCallee StaticInit =
5624 getKmpcForStaticInitForType(InternalIVTy, M, *this);
5625 FunctionCallee StaticFini =
5626 getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);
5627
5628 // Allocate space for computed loop bounds as expected by the "init" function.
5629 Builder.restoreIP(AllocaIP);
5630 Builder.SetCurrentDebugLocation(DL);
5631 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
5632 Value *PLowerBound =
5633 Builder.CreateAlloca(InternalIVTy, nullptr, "p.lowerbound");
5634 Value *PUpperBound =
5635 Builder.CreateAlloca(InternalIVTy, nullptr, "p.upperbound");
5636 Value *PStride = Builder.CreateAlloca(InternalIVTy, nullptr, "p.stride");
5637 CLI->setLastIter(PLastIter);
5638
5639 // Set up the source location value for the OpenMP runtime.
5640 Builder.restoreIP(CLI->getPreheaderIP());
5641 Builder.SetCurrentDebugLocation(DL);
5642
5643 // TODO: Detect overflow in ubsan or max-out with current tripcount.
5644 Value *CastedChunkSize = Builder.CreateZExtOrTrunc(
5645 ChunkSize ? ChunkSize : Zero, InternalIVTy, "chunksize");
5646 Value *CastedDistScheduleChunkSize = Builder.CreateZExtOrTrunc(
5647 DistScheduleChunkSize ? DistScheduleChunkSize : Zero, InternalIVTy,
5648 "distschedulechunksize");
5649 Value *CastedTripCount =
5650 Builder.CreateZExt(OrigTripCount, InternalIVTy, "tripcount");
5651
5652 Constant *SchedulingType =
5653 ConstantInt::get(I32Type, static_cast<int>(SchedType));
5654 Constant *DistSchedulingType =
5655 ConstantInt::get(I32Type, static_cast<int>(DistScheduleSchedType));
// Seed the bounds: lb=0, ub=tripcount-1 (clamped to 0 for empty loops),
// stride=1; the runtime rewrites these in the init call.
5656 Builder.CreateStore(Zero, PLowerBound);
5657 Value *OrigUpperBound = Builder.CreateSub(CastedTripCount, One);
5658 Value *IsTripCountZero = Builder.CreateICmpEQ(CastedTripCount, Zero);
5659 Value *UpperBound =
5660 Builder.CreateSelect(IsTripCountZero, Zero, OrigUpperBound);
5661 Builder.CreateStore(UpperBound, PUpperBound);
5662 Builder.CreateStore(One, PStride);
5663
5664 // Call the "init" function and update the trip count of the loop with the
5665 // value it produced.
5666 uint32_t SrcLocStrSize;
5667 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5668 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5669 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
5670 auto BuildInitCall = [StaticInit, SrcLoc, ThreadNum, PLastIter, PLowerBound,
5671 PUpperBound, PStride, One,
5672 this](Value *SchedulingType, Value *ChunkSize,
5673 auto &Builder) {
// NOTE(review): a line is missing here (original 5674) — presumably
// `createRuntimeFunctionCall(` wrapping the argument list below.
5675 StaticInit, {/*loc=*/SrcLoc, /*global_tid=*/ThreadNum,
5676 /*schedtype=*/SchedulingType, /*plastiter=*/PLastIter,
5677 /*plower=*/PLowerBound, /*pupper=*/PUpperBound,
5678 /*pstride=*/PStride, /*incr=*/One,
5679 /*chunk=*/ChunkSize});
5680 };
5681 BuildInitCall(SchedulingType, CastedChunkSize, Builder);
5682 if (DistScheduleSchedType != OMPScheduleType::None &&
5683 SchedType != OMPScheduleType::OrderedDistributeChunked &&
5684 SchedType != OMPScheduleType::OrderedDistribute) {
5685 // We want to emit a second init function call for the dist_schedule clause
5686 // to the Distribute construct. This should only be done however if a
5687 // Workshare Loop is nested within a Distribute Construct
5688 BuildInitCall(DistSchedulingType, CastedDistScheduleChunkSize, Builder);
5689 }
5690
5691 // Load values written by the "init" function.
5692 Value *FirstChunkStart =
5693 Builder.CreateLoad(InternalIVTy, PLowerBound, "omp_firstchunk.lb");
5694 Value *FirstChunkStop =
5695 Builder.CreateLoad(InternalIVTy, PUpperBound, "omp_firstchunk.ub");
5696 Value *FirstChunkEnd = Builder.CreateAdd(FirstChunkStop, One);
5697 Value *ChunkRange =
5698 Builder.CreateSub(FirstChunkEnd, FirstChunkStart, "omp_chunk.range");
5699 Value *NextChunkStride =
5700 Builder.CreateLoad(InternalIVTy, PStride, "omp_dispatch.stride");
5701
5702 // Create outer "dispatch" loop for enumerating the chunks.
5703 BasicBlock *DispatchEnter = splitBB(Builder, true);
5704 Value *DispatchCounter;
5705
5706 // It is safe to assume this didn't return an error because the callback
5707 // passed into createCanonicalLoop is the only possible error source, and it
5708 // always returns success.
5709 CanonicalLoopInfo *DispatchCLI = cantFail(createCanonicalLoop(
5710 {Builder.saveIP(), DL},
5711 [&](InsertPointTy BodyIP, Value *Counter) {
5712 DispatchCounter = Counter;
5713 return Error::success();
5714 },
5715 FirstChunkStart, CastedTripCount, NextChunkStride,
5716 /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{},
5717 "dispatch"));
5718
5719 // Remember the BasicBlocks of the dispatch loop we need, then invalidate to
5720 // not have to preserve the canonical invariant.
5721 BasicBlock *DispatchBody = DispatchCLI->getBody();
5722 BasicBlock *DispatchLatch = DispatchCLI->getLatch();
5723 BasicBlock *DispatchExit = DispatchCLI->getExit();
5724 BasicBlock *DispatchAfter = DispatchCLI->getAfter();
5725 DispatchCLI->invalidate();
5726
5727 // Rewire the original loop to become the chunk loop inside the dispatch loop.
5728 redirectTo(DispatchAfter, CLI->getAfter(), DL);
5729 redirectTo(CLI->getExit(), DispatchLatch, DL);
5730 redirectTo(DispatchBody, DispatchEnter, DL);
5731
5732 // Prepare the prolog of the chunk loop.
5733 Builder.restoreIP(CLI->getPreheaderIP());
5734 Builder.SetCurrentDebugLocation(DL);
5735
5736 // Compute the number of iterations of the chunk loop.
5737 Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
5738 Value *ChunkEnd = Builder.CreateAdd(DispatchCounter, ChunkRange);
5739 Value *IsLastChunk =
5740 Builder.CreateICmpUGE(ChunkEnd, CastedTripCount, "omp_chunk.is_last");
5741 Value *CountUntilOrigTripCount =
5742 Builder.CreateSub(CastedTripCount, DispatchCounter);
5743 Value *ChunkTripCount = Builder.CreateSelect(
5744 IsLastChunk, CountUntilOrigTripCount, ChunkRange, "omp_chunk.tripcount");
5745 Value *BackcastedChunkTC =
5746 Builder.CreateTrunc(ChunkTripCount, IVTy, "omp_chunk.tripcount.trunc");
5747 CLI->setTripCount(BackcastedChunkTC);
5748
5749 // Update all uses of the induction variable except the one in the condition
5750 // block that compares it with the actual upper bound, and the increment in
5751 // the latch block.
5752 Value *BackcastedDispatchCounter =
5753 Builder.CreateTrunc(DispatchCounter, IVTy, "omp_dispatch.iv.trunc");
5754 CLI->mapIndVar([&](Instruction *) -> Value * {
5755 Builder.restoreIP(CLI->getBodyIP());
5756 return Builder.CreateAdd(IV, BackcastedDispatchCounter);
5757 });
5758
5759 // In the "exit" block, call the "fini" function.
5760 Builder.SetInsertPoint(DispatchExit, DispatchExit->getFirstInsertionPt());
5761 createRuntimeFunctionCall(StaticFini, {SrcLoc, ThreadNum});
5762
5763 // Add the barrier if requested.
5764 if (NeedsBarrier) {
5765 InsertPointOrErrorTy AfterIP =
5766 createBarrier(LocationDescription(Builder.saveIP(), DL), OMPD_for,
5767 /*ForceSimpleCall=*/false, /*CheckCancelFlag=*/false);
5768 if (!AfterIP)
5769 return AfterIP.takeError();
5770 }
5771
5772#ifndef NDEBUG
5773 // Even though we currently do not support applying additional methods to it,
5774 // the chunk loop should remain a canonical loop.
5775 CLI->assertOK();
5776#endif
5777
5778 return InsertPointTy(DispatchAfter, DispatchAfter->getFirstInsertionPt());
5779}
5780
5781// Returns an LLVM function to call for executing an OpenMP static worksharing
5782// for loop depending on `type`. Only i32 and i64 are supported by the runtime.
5783// Always interpret integers as unsigned similarly to CanonicalLoopInfo.
5784static FunctionCallee
5786 WorksharingLoopType LoopType) {
5787 unsigned Bitwidth = Ty->getIntegerBitWidth();
5788 Module &M = OMPBuilder->M;
5789 switch (LoopType) {
5790 case WorksharingLoopType::ForStaticLoop:
5791 if (Bitwidth == 32)
5792 return OMPBuilder->getOrCreateRuntimeFunction(
5793 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_4u);
5794 if (Bitwidth == 64)
5795 return OMPBuilder->getOrCreateRuntimeFunction(
5796 M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_loop_8u);
5797 break;
5798 case WorksharingLoopType::DistributeStaticLoop:
5799 if (Bitwidth == 32)
5800 return OMPBuilder->getOrCreateRuntimeFunction(
5801 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_4u);
5802 if (Bitwidth == 64)
5803 return OMPBuilder->getOrCreateRuntimeFunction(
5804 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_static_loop_8u);
5805 break;
5806 case WorksharingLoopType::DistributeForStaticLoop:
5807 if (Bitwidth == 32)
5808 return OMPBuilder->getOrCreateRuntimeFunction(
5809 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_4u);
5810 if (Bitwidth == 64)
5811 return OMPBuilder->getOrCreateRuntimeFunction(
5812 M, omp::RuntimeFunction::OMPRTL___kmpc_distribute_for_static_loop_8u);
5813 break;
5814 }
5815 if (Bitwidth != 32 && Bitwidth != 64) {
5816 llvm_unreachable("Unknown OpenMP loop iterator bitwidth");
5817 }
5818 llvm_unreachable("Unknown type of OpenMP worksharing loop");
5819}
5820
5821// Inserts a call to proper OpenMP Device RTL function which handles
5822// loop worksharing.
5824 WorksharingLoopType LoopType,
5825 BasicBlock *InsertBlock, Value *Ident,
5826 Value *LoopBodyArg, Value *TripCount,
5827 Function &LoopBodyFn, bool NoLoop) {
5828 Type *TripCountTy = TripCount->getType();
5829 Module &M = OMPBuilder->M;
5830 IRBuilder<> &Builder = OMPBuilder->Builder;
5831 FunctionCallee RTLFn =
5832 getKmpcForStaticLoopForType(TripCountTy, OMPBuilder, LoopType);
5833 SmallVector<Value *, 8> RealArgs;
5834 RealArgs.push_back(Ident);
5835 RealArgs.push_back(&LoopBodyFn);
5836 RealArgs.push_back(LoopBodyArg);
5837 RealArgs.push_back(TripCount);
5838 if (LoopType == WorksharingLoopType::DistributeStaticLoop) {
5839 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5840 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5841 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5842 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5843 return;
5844 }
5845 FunctionCallee RTLNumThreads = OMPBuilder->getOrCreateRuntimeFunction(
5846 M, omp::RuntimeFunction::OMPRTL_omp_get_num_threads);
5847 Builder.restoreIP({InsertBlock, std::prev(InsertBlock->end())});
5848 Value *NumThreads = OMPBuilder->createRuntimeFunctionCall(RTLNumThreads, {});
5849
5850 RealArgs.push_back(
5851 Builder.CreateZExtOrTrunc(NumThreads, TripCountTy, "num.threads.cast"));
5852 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5853 if (LoopType == WorksharingLoopType::DistributeForStaticLoop) {
5854 RealArgs.push_back(ConstantInt::get(TripCountTy, 0));
5855 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), NoLoop));
5856 } else {
5857 RealArgs.push_back(ConstantInt::get(Builder.getInt8Ty(), 0));
5858 }
5859
5860 OMPBuilder->createRuntimeFunctionCall(RTLFn, RealArgs);
5861}
5862
5864 OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident,
5865 Function &OutlinedFn, const SmallVector<Instruction *, 4> &ToBeDeleted,
5866 WorksharingLoopType LoopType, bool NoLoop) {
5867 IRBuilder<> &Builder = OMPIRBuilder->Builder;
5868 BasicBlock *Preheader = CLI->getPreheader();
5869 Value *TripCount = CLI->getTripCount();
5870
5871 // After loop body outling, the loop body contains only set up
5872 // of loop body argument structure and the call to the outlined
5873 // loop body function. Firstly, we need to move setup of loop body args
5874 // into loop preheader.
5875 Preheader->splice(std::prev(Preheader->end()), CLI->getBody(),
5876 CLI->getBody()->begin(), std::prev(CLI->getBody()->end()));
5877
5878 // The next step is to remove the whole loop. We do not it need anymore.
5879 // That's why make an unconditional branch from loop preheader to loop
5880 // exit block
5881 Builder.restoreIP({Preheader, Preheader->end()});
5882 Builder.SetCurrentDebugLocation(Preheader->getTerminator()->getDebugLoc());
5883 Preheader->getTerminator()->eraseFromParent();
5884 Builder.CreateBr(CLI->getExit());
5885
5886 // Delete dead loop blocks
5887 OpenMPIRBuilder::OutlineInfo CleanUpInfo;
5888 SmallPtrSet<BasicBlock *, 32> RegionBlockSet;
5889 SmallVector<BasicBlock *, 32> BlocksToBeRemoved;
5890 CleanUpInfo.EntryBB = CLI->getHeader();
5891 CleanUpInfo.ExitBB = CLI->getExit();
5892 CleanUpInfo.collectBlocks(RegionBlockSet, BlocksToBeRemoved);
5893 DeleteDeadBlocks(BlocksToBeRemoved);
5894
5895 // Find the instruction which corresponds to loop body argument structure
5896 // and remove the call to loop body function instruction.
5897 Value *LoopBodyArg;
5898 User *OutlinedFnUser = OutlinedFn.getUniqueUndroppableUser();
5899 assert(OutlinedFnUser &&
5900 "Expected unique undroppable user of outlined function");
5901 CallInst *OutlinedFnCallInstruction = dyn_cast<CallInst>(OutlinedFnUser);
5902 assert(OutlinedFnCallInstruction && "Expected outlined function call");
5903 assert((OutlinedFnCallInstruction->getParent() == Preheader) &&
5904 "Expected outlined function call to be located in loop preheader");
5905 // Check in case no argument structure has been passed.
5906 if (OutlinedFnCallInstruction->arg_size() > 1)
5907 LoopBodyArg = OutlinedFnCallInstruction->getArgOperand(1);
5908 else
5909 LoopBodyArg = Constant::getNullValue(Builder.getPtrTy());
5910 OutlinedFnCallInstruction->eraseFromParent();
5911
5912 createTargetLoopWorkshareCall(OMPIRBuilder, LoopType, Preheader, Ident,
5913 LoopBodyArg, TripCount, OutlinedFn, NoLoop);
5914
5915 for (auto &ToBeDeletedItem : ToBeDeleted)
5916 ToBeDeletedItem->eraseFromParent();
5917 CLI->invalidate();
5918}
5919
5920OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::applyWorkshareLoopTarget(
5921 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
5922 WorksharingLoopType LoopType, bool NoLoop) {
5923 uint32_t SrcLocStrSize;
5924 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
5925 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
5926
5927 OutlineInfo OI;
5928 OI.OuterAllocaBB = CLI->getPreheader();
5929 Function *OuterFn = CLI->getPreheader()->getParent();
5930
5931 // Instructions which need to be deleted at the end of code generation
5932 SmallVector<Instruction *, 4> ToBeDeleted;
5933
5934 OI.OuterAllocaBB = AllocaIP.getBlock();
5935
5936 // Mark the body loop as region which needs to be extracted
5937 OI.EntryBB = CLI->getBody();
5938 OI.ExitBB = CLI->getLatch()->splitBasicBlockBefore(CLI->getLatch()->begin(),
5939 "omp.prelatch");
5940
5941 // Prepare loop body for extraction
5942 Builder.restoreIP({CLI->getPreheader(), CLI->getPreheader()->begin()});
5943
5944 // Insert new loop counter variable which will be used only in loop
5945 // body.
5946 AllocaInst *NewLoopCnt = Builder.CreateAlloca(CLI->getIndVarType(), 0, "");
5947 Instruction *NewLoopCntLoad =
5948 Builder.CreateLoad(CLI->getIndVarType(), NewLoopCnt);
5949 // New loop counter instructions are redundant in the loop preheader when
5950 // code generation for workshare loop is finshed. That's why mark them as
5951 // ready for deletion.
5952 ToBeDeleted.push_back(NewLoopCntLoad);
5953 ToBeDeleted.push_back(NewLoopCnt);
5954
5955 // Analyse loop body region. Find all input variables which are used inside
5956 // loop body region.
5957 SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
5959 OI.collectBlocks(ParallelRegionBlockSet, Blocks);
5960
5961 CodeExtractorAnalysisCache CEAC(*OuterFn);
5962 CodeExtractor Extractor(Blocks,
5963 /* DominatorTree */ nullptr,
5964 /* AggregateArgs */ true,
5965 /* BlockFrequencyInfo */ nullptr,
5966 /* BranchProbabilityInfo */ nullptr,
5967 /* AssumptionCache */ nullptr,
5968 /* AllowVarArgs */ true,
5969 /* AllowAlloca */ true,
5970 /* AllocationBlock */ CLI->getPreheader(),
5971 /* Suffix */ ".omp_wsloop",
5972 /* AggrArgsIn0AddrSpace */ true);
5973
5974 BasicBlock *CommonExit = nullptr;
5975 SetVector<Value *> SinkingCands, HoistingCands;
5976
5977 // Find allocas outside the loop body region which are used inside loop
5978 // body
5979 Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
5980
5981 // We need to model loop body region as the function f(cnt, loop_arg).
5982 // That's why we replace loop induction variable by the new counter
5983 // which will be one of loop body function argument
5985 CLI->getIndVar()->user_end());
5986 for (auto Use : Users) {
5987 if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
5988 if (ParallelRegionBlockSet.count(Inst->getParent())) {
5989 Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
5990 }
5991 }
5992 }
5993 // Make sure that loop counter variable is not merged into loop body
5994 // function argument structure and it is passed as separate variable
5995 OI.ExcludeArgsFromAggregate.push_back(NewLoopCntLoad);
5996
5997 // PostOutline CB is invoked when loop body function is outlined and
5998 // loop body is replaced by call to outlined function. We need to add
5999 // call to OpenMP device rtl inside loop preheader. OpenMP device rtl
6000 // function will handle loop control logic.
6001 //
6002 OI.PostOutlineCB = [=, ToBeDeletedVec =
6003 std::move(ToBeDeleted)](Function &OutlinedFn) {
6004 workshareLoopTargetCallback(this, CLI, Ident, OutlinedFn, ToBeDeletedVec,
6005 LoopType, NoLoop);
6006 };
6007 addOutlineInfo(std::move(OI));
6008 return CLI->getAfterIP();
6009}
6010
6013 bool NeedsBarrier, omp::ScheduleKind SchedKind, Value *ChunkSize,
6014 bool HasSimdModifier, bool HasMonotonicModifier,
6015 bool HasNonmonotonicModifier, bool HasOrderedClause,
6016 WorksharingLoopType LoopType, bool NoLoop, bool HasDistSchedule,
6017 Value *DistScheduleChunkSize) {
6018 if (Config.isTargetDevice())
6019 return applyWorkshareLoopTarget(DL, CLI, AllocaIP, LoopType, NoLoop);
6020 OMPScheduleType EffectiveScheduleType = computeOpenMPScheduleType(
6021 SchedKind, ChunkSize, HasSimdModifier, HasMonotonicModifier,
6022 HasNonmonotonicModifier, HasOrderedClause, DistScheduleChunkSize);
6023
6024 bool IsOrdered = (EffectiveScheduleType & OMPScheduleType::ModifierOrdered) ==
6025 OMPScheduleType::ModifierOrdered;
6026 OMPScheduleType DistScheduleSchedType = OMPScheduleType::None;
6027 if (HasDistSchedule) {
6028 DistScheduleSchedType = DistScheduleChunkSize
6029 ? OMPScheduleType::OrderedDistributeChunked
6030 : OMPScheduleType::OrderedDistribute;
6031 }
6032 switch (EffectiveScheduleType & ~OMPScheduleType::ModifierMask) {
6033 case OMPScheduleType::BaseStatic:
6034 case OMPScheduleType::BaseDistribute:
6035 assert((!ChunkSize || !DistScheduleChunkSize) &&
6036 "No chunk size with static-chunked schedule");
6037 if (IsOrdered && !HasDistSchedule)
6038 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6039 NeedsBarrier, ChunkSize);
6040 // FIXME: Monotonicity ignored?
6041 if (DistScheduleChunkSize)
6042 return applyStaticChunkedWorkshareLoop(
6043 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6044 DistScheduleChunkSize, DistScheduleSchedType);
6045 return applyStaticWorkshareLoop(DL, CLI, AllocaIP, LoopType, NeedsBarrier,
6046 HasDistSchedule);
6047
6048 case OMPScheduleType::BaseStaticChunked:
6049 case OMPScheduleType::BaseDistributeChunked:
6050 if (IsOrdered && !HasDistSchedule)
6051 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6052 NeedsBarrier, ChunkSize);
6053 // FIXME: Monotonicity ignored?
6054 return applyStaticChunkedWorkshareLoop(
6055 DL, CLI, AllocaIP, NeedsBarrier, ChunkSize, EffectiveScheduleType,
6056 DistScheduleChunkSize, DistScheduleSchedType);
6057
6058 case OMPScheduleType::BaseRuntime:
6059 case OMPScheduleType::BaseAuto:
6060 case OMPScheduleType::BaseGreedy:
6061 case OMPScheduleType::BaseBalanced:
6062 case OMPScheduleType::BaseSteal:
6063 case OMPScheduleType::BaseRuntimeSimd:
6064 assert(!ChunkSize &&
6065 "schedule type does not support user-defined chunk sizes");
6066 [[fallthrough]];
6067 case OMPScheduleType::BaseGuidedSimd:
6068 case OMPScheduleType::BaseDynamicChunked:
6069 case OMPScheduleType::BaseGuidedChunked:
6070 case OMPScheduleType::BaseGuidedIterativeChunked:
6071 case OMPScheduleType::BaseGuidedAnalyticalChunked:
6072 case OMPScheduleType::BaseStaticBalancedChunked:
6073 return applyDynamicWorkshareLoop(DL, CLI, AllocaIP, EffectiveScheduleType,
6074 NeedsBarrier, ChunkSize);
6075
6076 default:
6077 llvm_unreachable("Unknown/unimplemented schedule kind");
6078 }
6079}
6080
6081/// Returns an LLVM function to call for initializing loop bounds using OpenMP
6082/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6083/// the runtime. Always interpret integers as unsigned similarly to
6084/// CanonicalLoopInfo.
6085static FunctionCallee
6087 unsigned Bitwidth = Ty->getIntegerBitWidth();
6088 if (Bitwidth == 32)
6089 return OMPBuilder.getOrCreateRuntimeFunction(
6090 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
6091 if (Bitwidth == 64)
6092 return OMPBuilder.getOrCreateRuntimeFunction(
6093 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
6094 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6095}
6096
6097/// Returns an LLVM function to call for updating the next loop using OpenMP
6098/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
6099/// the runtime. Always interpret integers as unsigned similarly to
6100/// CanonicalLoopInfo.
6101static FunctionCallee
6103 unsigned Bitwidth = Ty->getIntegerBitWidth();
6104 if (Bitwidth == 32)
6105 return OMPBuilder.getOrCreateRuntimeFunction(
6106 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
6107 if (Bitwidth == 64)
6108 return OMPBuilder.getOrCreateRuntimeFunction(
6109 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
6110 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6111}
6112
6113/// Returns an LLVM function to call for finalizing the dynamic loop using
6114/// depending on `type`. Only i32 and i64 are supported by the runtime. Always
6115/// interpret integers as unsigned similarly to CanonicalLoopInfo.
6116static FunctionCallee
6118 unsigned Bitwidth = Ty->getIntegerBitWidth();
6119 if (Bitwidth == 32)
6120 return OMPBuilder.getOrCreateRuntimeFunction(
6121 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_4u);
6122 if (Bitwidth == 64)
6123 return OMPBuilder.getOrCreateRuntimeFunction(
6124 M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_fini_8u);
6125 llvm_unreachable("unknown OpenMP loop iterator bitwidth");
6126}
6127
6129OpenMPIRBuilder::applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
6130 InsertPointTy AllocaIP,
6131 OMPScheduleType SchedType,
6132 bool NeedsBarrier, Value *Chunk) {
6133 assert(CLI->isValid() && "Requires a valid canonical loop");
6134 assert(!isConflictIP(AllocaIP, CLI->getPreheaderIP()) &&
6135 "Require dedicated allocate IP");
6137 "Require valid schedule type");
6138
6139 bool Ordered = (SchedType & OMPScheduleType::ModifierOrdered) ==
6140 OMPScheduleType::ModifierOrdered;
6141
6142 // Set up the source location value for OpenMP runtime.
6143 Builder.SetCurrentDebugLocation(DL);
6144
6145 uint32_t SrcLocStrSize;
6146 Constant *SrcLocStr = getOrCreateSrcLocStr(DL, SrcLocStrSize);
6147 Value *SrcLoc = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
6148
6149 // Declare useful OpenMP runtime functions.
6150 Value *IV = CLI->getIndVar();
6151 Type *IVTy = IV->getType();
6152 FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
6153 FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);
6154
6155 // Allocate space for computed loop bounds as expected by the "init" function.
6156 Builder.SetInsertPoint(AllocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca());
6157 Type *I32Type = Type::getInt32Ty(M.getContext());
6158 Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
6159 Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
6160 Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
6161 Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");
6162 CLI->setLastIter(PLastIter);
6163
6164 // At the end of the preheader, prepare for calling the "init" function by
6165 // storing the current loop bounds into the allocated space. A canonical loop
6166 // always iterates from 0 to trip-count with step 1. Note that "init" expects
6167 // and produces an inclusive upper bound.
6168 BasicBlock *PreHeader = CLI->getPreheader();
6169 Builder.SetInsertPoint(PreHeader->getTerminator());
6170 Constant *One = ConstantInt::get(IVTy, 1);
6171 Builder.CreateStore(One, PLowerBound);
6172 Value *UpperBound = CLI->getTripCount();
6173 Builder.CreateStore(UpperBound, PUpperBound);
6174 Builder.CreateStore(One, PStride);
6175
6176 BasicBlock *Header = CLI->getHeader();
6177 BasicBlock *Exit = CLI->getExit();
6178 BasicBlock *Cond = CLI->getCond();
6179 BasicBlock *Latch = CLI->getLatch();
6180 InsertPointTy AfterIP = CLI->getAfterIP();
6181
6182 // The CLI will be "broken" in the code below, as the loop is no longer
6183 // a valid canonical loop.
6184
6185 if (!Chunk)
6186 Chunk = One;
6187
6188 Value *ThreadNum = getOrCreateThreadID(SrcLoc);
6189
6190 Constant *SchedulingType =
6191 ConstantInt::get(I32Type, static_cast<int>(SchedType));
6192
6193 // Call the "init" function.
6194 createRuntimeFunctionCall(DynamicInit, {SrcLoc, ThreadNum, SchedulingType,
6195 /* LowerBound */ One, UpperBound,
6196 /* step */ One, Chunk});
6197
6198 // An outer loop around the existing one.
6199 BasicBlock *OuterCond = BasicBlock::Create(
6200 PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
6201 PreHeader->getParent());
6202 // This needs to be 32-bit always, so can't use the IVTy Zero above.
6203 Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
6205 DynamicNext,
6206 {SrcLoc, ThreadNum, PLastIter, PLowerBound, PUpperBound, PStride});
6207 Constant *Zero32 = ConstantInt::get(I32Type, 0);
6208 Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
6209 Value *LowerBound =
6210 Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
6211 Builder.CreateCondBr(MoreWork, Header, Exit);
6212
6213 // Change PHI-node in loop header to use outer cond rather than preheader,
6214 // and set IV to the LowerBound.
6215 Instruction *Phi = &Header->front();
6216 auto *PI = cast<PHINode>(Phi);
6217 PI->setIncomingBlock(0, OuterCond);
6218 PI->setIncomingValue(0, LowerBound);
6219
6220 // Then set the pre-header to jump to the OuterCond
6221 Instruction *Term = PreHeader->getTerminator();
6222 auto *Br = cast<UncondBrInst>(Term);
6223 Br->setSuccessor(OuterCond);
6224
6225 // Modify the inner condition:
6226 // * Use the UpperBound returned from the DynamicNext call.
6227 // * jump to the loop outer loop when done with one of the inner loops.
6228 Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
6229 UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
6230 Instruction *Comp = &*Builder.GetInsertPoint();
6231 auto *CI = cast<CmpInst>(Comp);
6232 CI->setOperand(1, UpperBound);
6233 // Redirect the inner exit to branch to outer condition.
6234 Instruction *Branch = &Cond->back();
6235 auto *BI = cast<CondBrInst>(Branch);
6236 assert(BI->getSuccessor(1) == Exit);
6237 BI->setSuccessor(1, OuterCond);
6238
6239 // Call the "fini" function if "ordered" is present in wsloop directive.
6240 if (Ordered) {
6241 Builder.SetInsertPoint(&Latch->back());
6242 FunctionCallee DynamicFini = getKmpcForDynamicFiniForType(IVTy, M, *this);
6243 createRuntimeFunctionCall(DynamicFini, {SrcLoc, ThreadNum});
6244 }
6245
6246 // Add the barrier if requested.
6247 if (NeedsBarrier) {
6248 Builder.SetInsertPoint(&Exit->back());
6249 InsertPointOrErrorTy BarrierIP =
6251 omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
6252 /* CheckCancelFlag */ false);
6253 if (!BarrierIP)
6254 return BarrierIP.takeError();
6255 }
6256
6257 CLI->invalidate();
6258 return AfterIP;
6259}
6260
6261/// Redirect all edges that branch to \p OldTarget to \p NewTarget. That is,
6262/// after this \p OldTarget will be orphaned.
6264 BasicBlock *NewTarget, DebugLoc DL) {
6265 for (BasicBlock *Pred : make_early_inc_range(predecessors(OldTarget)))
6266 redirectTo(Pred, NewTarget, DL);
6267}
6268
6269/// Determine which blocks in \p BBs are reachable from outside and remove the
6270/// ones that are not reachable from the function.
6273 auto HasRemainingUses = [&BBsToErase](BasicBlock *BB) {
6274 for (Use &U : BB->uses()) {
6275 auto *UseInst = dyn_cast<Instruction>(U.getUser());
6276 if (!UseInst)
6277 continue;
6278 if (BBsToErase.count(UseInst->getParent()))
6279 continue;
6280 return true;
6281 }
6282 return false;
6283 };
6284
6285 while (BBsToErase.remove_if(HasRemainingUses)) {
6286 // Try again if anything was removed.
6287 }
6288
6289 SmallVector<BasicBlock *, 7> BBVec(BBsToErase.begin(), BBsToErase.end());
6290 DeleteDeadBlocks(BBVec);
6291}
6292
6293CanonicalLoopInfo *
6295 InsertPointTy ComputeIP) {
6296 assert(Loops.size() >= 1 && "At least one loop required");
6297 size_t NumLoops = Loops.size();
6298
6299 // Nothing to do if there is already just one loop.
6300 if (NumLoops == 1)
6301 return Loops.front();
6302
6303 CanonicalLoopInfo *Outermost = Loops.front();
6304 CanonicalLoopInfo *Innermost = Loops.back();
6305 BasicBlock *OrigPreheader = Outermost->getPreheader();
6306 BasicBlock *OrigAfter = Outermost->getAfter();
6307 Function *F = OrigPreheader->getParent();
6308
6309 // Loop control blocks that may become orphaned later.
6310 SmallVector<BasicBlock *, 12> OldControlBBs;
6311 OldControlBBs.reserve(6 * Loops.size());
6313 Loop->collectControlBlocks(OldControlBBs);
6314
6315 // Setup the IRBuilder for inserting the trip count computation.
6316 Builder.SetCurrentDebugLocation(DL);
6317 if (ComputeIP.isSet())
6318 Builder.restoreIP(ComputeIP);
6319 else
6320 Builder.restoreIP(Outermost->getPreheaderIP());
6321
6322 // Derive the collapsed' loop trip count.
6323 // TODO: Find common/largest indvar type.
6324 Value *CollapsedTripCount = nullptr;
6325 for (CanonicalLoopInfo *L : Loops) {
6326 assert(L->isValid() &&
6327 "All loops to collapse must be valid canonical loops");
6328 Value *OrigTripCount = L->getTripCount();
6329 if (!CollapsedTripCount) {
6330 CollapsedTripCount = OrigTripCount;
6331 continue;
6332 }
6333
6334 // TODO: Enable UndefinedSanitizer to diagnose an overflow here.
6335 CollapsedTripCount =
6336 Builder.CreateNUWMul(CollapsedTripCount, OrigTripCount);
6337 }
6338
6339 // Create the collapsed loop control flow.
6340 CanonicalLoopInfo *Result =
6341 createLoopSkeleton(DL, CollapsedTripCount, F,
6342 OrigPreheader->getNextNode(), OrigAfter, "collapsed");
6343
6344 // Build the collapsed loop body code.
6345 // Start with deriving the input loop induction variables from the collapsed
6346 // one, using a divmod scheme. To preserve the original loops' order, the
6347 // innermost loop use the least significant bits.
6348 Builder.restoreIP(Result->getBodyIP());
6349
6350 Value *Leftover = Result->getIndVar();
6351 SmallVector<Value *> NewIndVars;
6352 NewIndVars.resize(NumLoops);
6353 for (int i = NumLoops - 1; i >= 1; --i) {
6354 Value *OrigTripCount = Loops[i]->getTripCount();
6355
6356 Value *NewIndVar = Builder.CreateURem(Leftover, OrigTripCount);
6357 NewIndVars[i] = NewIndVar;
6358
6359 Leftover = Builder.CreateUDiv(Leftover, OrigTripCount);
6360 }
6361 // Outermost loop gets all the remaining bits.
6362 NewIndVars[0] = Leftover;
6363
6364 // Construct the loop body control flow.
6365 // We progressively construct the branch structure following in direction of
6366 // the control flow, from the leading in-between code, the loop nest body, the
6367 // trailing in-between code, and rejoining the collapsed loop's latch.
6368 // ContinueBlock and ContinuePred keep track of the source(s) of next edge. If
6369 // the ContinueBlock is set, continue with that block. If ContinuePred, use
6370 // its predecessors as sources.
6371 BasicBlock *ContinueBlock = Result->getBody();
6372 BasicBlock *ContinuePred = nullptr;
6373 auto ContinueWith = [&ContinueBlock, &ContinuePred, DL](BasicBlock *Dest,
6374 BasicBlock *NextSrc) {
6375 if (ContinueBlock)
6376 redirectTo(ContinueBlock, Dest, DL);
6377 else
6378 redirectAllPredecessorsTo(ContinuePred, Dest, DL);
6379
6380 ContinueBlock = nullptr;
6381 ContinuePred = NextSrc;
6382 };
6383
6384 // The code before the nested loop of each level.
6385 // Because we are sinking it into the nest, it will be executed more often
6386 // that the original loop. More sophisticated schemes could keep track of what
6387 // the in-between code is and instantiate it only once per thread.
6388 for (size_t i = 0; i < NumLoops - 1; ++i)
6389 ContinueWith(Loops[i]->getBody(), Loops[i + 1]->getHeader());
6390
6391 // Connect the loop nest body.
6392 ContinueWith(Innermost->getBody(), Innermost->getLatch());
6393
6394 // The code after the nested loop at each level.
6395 for (size_t i = NumLoops - 1; i > 0; --i)
6396 ContinueWith(Loops[i]->getAfter(), Loops[i - 1]->getLatch());
6397
6398 // Connect the finished loop to the collapsed loop latch.
6399 ContinueWith(Result->getLatch(), nullptr);
6400
6401 // Replace the input loops with the new collapsed loop.
6402 redirectTo(Outermost->getPreheader(), Result->getPreheader(), DL);
6403 redirectTo(Result->getAfter(), Outermost->getAfter(), DL);
6404
6405 // Replace the input loop indvars with the derived ones.
6406 for (size_t i = 0; i < NumLoops; ++i)
6407 Loops[i]->getIndVar()->replaceAllUsesWith(NewIndVars[i]);
6408
6409 // Remove unused parts of the input loops.
6410 removeUnusedBlocksFromParent(OldControlBBs);
6411
6412 for (CanonicalLoopInfo *L : Loops)
6413 L->invalidate();
6414
6415#ifndef NDEBUG
6416 Result->assertOK();
6417#endif
6418 return Result;
6419}
6420
6421std::vector<CanonicalLoopInfo *>
6423 ArrayRef<Value *> TileSizes) {
6424 assert(TileSizes.size() == Loops.size() &&
6425 "Must pass as many tile sizes as there are loops");
6426 int NumLoops = Loops.size();
6427 assert(NumLoops >= 1 && "At least one loop to tile required");
6428
6429 CanonicalLoopInfo *OutermostLoop = Loops.front();
6430 CanonicalLoopInfo *InnermostLoop = Loops.back();
6431 Function *F = OutermostLoop->getBody()->getParent();
6432 BasicBlock *InnerEnter = InnermostLoop->getBody();
6433 BasicBlock *InnerLatch = InnermostLoop->getLatch();
6434
6435 // Loop control blocks that may become orphaned later.
6436 SmallVector<BasicBlock *, 12> OldControlBBs;
6437 OldControlBBs.reserve(6 * Loops.size());
6439 Loop->collectControlBlocks(OldControlBBs);
6440
6441 // Collect original trip counts and induction variable to be accessible by
6442 // index. Also, the structure of the original loops is not preserved during
6443 // the construction of the tiled loops, so do it before we scavenge the BBs of
6444 // any original CanonicalLoopInfo.
6445 SmallVector<Value *, 4> OrigTripCounts, OrigIndVars;
6446 for (CanonicalLoopInfo *L : Loops) {
6447 assert(L->isValid() && "All input loops must be valid canonical loops");
6448 OrigTripCounts.push_back(L->getTripCount());
6449 OrigIndVars.push_back(L->getIndVar());
6450 }
6451
6452 // Collect the code between loop headers. These may contain SSA definitions
6453 // that are used in the loop nest body. To be usable with in the innermost
6454 // body, these BasicBlocks will be sunk into the loop nest body. That is,
6455 // these instructions may be executed more often than before the tiling.
6456 // TODO: It would be sufficient to only sink them into body of the
6457 // corresponding tile loop.
6459 for (int i = 0; i < NumLoops - 1; ++i) {
6460 CanonicalLoopInfo *Surrounding = Loops[i];
6461 CanonicalLoopInfo *Nested = Loops[i + 1];
6462
6463 BasicBlock *EnterBB = Surrounding->getBody();
6464 BasicBlock *ExitBB = Nested->getHeader();
6465 InbetweenCode.emplace_back(EnterBB, ExitBB);
6466 }
6467
6468 // Compute the trip counts of the floor loops.
6469 Builder.SetCurrentDebugLocation(DL);
6470 Builder.restoreIP(OutermostLoop->getPreheaderIP());
6471 SmallVector<Value *, 4> FloorCompleteCount, FloorCount, FloorRems;
6472 for (int i = 0; i < NumLoops; ++i) {
6473 Value *TileSize = TileSizes[i];
6474 Value *OrigTripCount = OrigTripCounts[i];
6475 Type *IVType = OrigTripCount->getType();
6476
6477 Value *FloorCompleteTripCount = Builder.CreateUDiv(OrigTripCount, TileSize);
6478 Value *FloorTripRem = Builder.CreateURem(OrigTripCount, TileSize);
6479
6480 // 0 if tripcount divides the tilesize, 1 otherwise.
6481 // 1 means we need an additional iteration for a partial tile.
6482 //
6483 // Unfortunately we cannot just use the roundup-formula
6484 // (tripcount + tilesize - 1)/tilesize
6485 // because the summation might overflow. We do not want introduce undefined
6486 // behavior when the untiled loop nest did not.
6487 Value *FloorTripOverflow =
6488 Builder.CreateICmpNE(FloorTripRem, ConstantInt::get(IVType, 0));
6489
6490 FloorTripOverflow = Builder.CreateZExt(FloorTripOverflow, IVType);
6491 Value *FloorTripCount =
6492 Builder.CreateAdd(FloorCompleteTripCount, FloorTripOverflow,
6493 "omp_floor" + Twine(i) + ".tripcount", true);
6494
6495 // Remember some values for later use.
6496 FloorCompleteCount.push_back(FloorCompleteTripCount);
6497 FloorCount.push_back(FloorTripCount);
6498 FloorRems.push_back(FloorTripRem);
6499 }
6500
6501 // Generate the new loop nest, from the outermost to the innermost.
6502 std::vector<CanonicalLoopInfo *> Result;
6503 Result.reserve(NumLoops * 2);
6504
6505 // The basic block of the surrounding loop that enters the nest generated
6506 // loop.
6507 BasicBlock *Enter = OutermostLoop->getPreheader();
6508
6509 // The basic block of the surrounding loop where the inner code should
6510 // continue.
6511 BasicBlock *Continue = OutermostLoop->getAfter();
6512
6513 // Where the next loop basic block should be inserted.
6514 BasicBlock *OutroInsertBefore = InnermostLoop->getExit();
6515
6516 auto EmbeddNewLoop =
6517 [this, DL, F, InnerEnter, &Enter, &Continue, &OutroInsertBefore](
6518 Value *TripCount, const Twine &Name) -> CanonicalLoopInfo * {
6519 CanonicalLoopInfo *EmbeddedLoop = createLoopSkeleton(
6520 DL, TripCount, F, InnerEnter, OutroInsertBefore, Name);
6521 redirectTo(Enter, EmbeddedLoop->getPreheader(), DL);
6522 redirectTo(EmbeddedLoop->getAfter(), Continue, DL);
6523
6524 // Setup the position where the next embedded loop connects to this loop.
6525 Enter = EmbeddedLoop->getBody();
6526 Continue = EmbeddedLoop->getLatch();
6527 OutroInsertBefore = EmbeddedLoop->getLatch();
6528 return EmbeddedLoop;
6529 };
6530
6531 auto EmbeddNewLoops = [&Result, &EmbeddNewLoop](ArrayRef<Value *> TripCounts,
6532 const Twine &NameBase) {
6533 for (auto P : enumerate(TripCounts)) {
6534 CanonicalLoopInfo *EmbeddedLoop =
6535 EmbeddNewLoop(P.value(), NameBase + Twine(P.index()));
6536 Result.push_back(EmbeddedLoop);
6537 }
6538 };
6539
6540 EmbeddNewLoops(FloorCount, "floor");
6541
6542 // Within the innermost floor loop, emit the code that computes the tile
6543 // sizes.
6544 Builder.SetInsertPoint(Enter->getTerminator());
6545 SmallVector<Value *, 4> TileCounts;
6546 for (int i = 0; i < NumLoops; ++i) {
6547 CanonicalLoopInfo *FloorLoop = Result[i];
6548 Value *TileSize = TileSizes[i];
6549
6550 Value *FloorIsEpilogue =
6551 Builder.CreateICmpEQ(FloorLoop->getIndVar(), FloorCompleteCount[i]);
6552 Value *TileTripCount =
6553 Builder.CreateSelect(FloorIsEpilogue, FloorRems[i], TileSize);
6554
6555 TileCounts.push_back(TileTripCount);
6556 }
6557
6558 // Create the tile loops.
6559 EmbeddNewLoops(TileCounts, "tile");
6560
6561 // Insert the inbetween code into the body.
6562 BasicBlock *BodyEnter = Enter;
6563 BasicBlock *BodyEntered = nullptr;
6564 for (std::pair<BasicBlock *, BasicBlock *> P : InbetweenCode) {
6565 BasicBlock *EnterBB = P.first;
6566 BasicBlock *ExitBB = P.second;
6567
6568 if (BodyEnter)
6569 redirectTo(BodyEnter, EnterBB, DL);
6570 else
6571 redirectAllPredecessorsTo(BodyEntered, EnterBB, DL);
6572
6573 BodyEnter = nullptr;
6574 BodyEntered = ExitBB;
6575 }
6576
6577 // Append the original loop nest body into the generated loop nest body.
6578 if (BodyEnter)
6579 redirectTo(BodyEnter, InnerEnter, DL);
6580 else
6581 redirectAllPredecessorsTo(BodyEntered, InnerEnter, DL);
6583
6584 // Replace the original induction variable with an induction variable computed
6585 // from the tile and floor induction variables.
6586 Builder.restoreIP(Result.back()->getBodyIP());
6587 for (int i = 0; i < NumLoops; ++i) {
6588 CanonicalLoopInfo *FloorLoop = Result[i];
6589 CanonicalLoopInfo *TileLoop = Result[NumLoops + i];
6590 Value *OrigIndVar = OrigIndVars[i];
6591 Value *Size = TileSizes[i];
6592
6593 Value *Scale =
6594 Builder.CreateMul(Size, FloorLoop->getIndVar(), {}, /*HasNUW=*/true);
6595 Value *Shift =
6596 Builder.CreateAdd(Scale, TileLoop->getIndVar(), {}, /*HasNUW=*/true);
6597 OrigIndVar->replaceAllUsesWith(Shift);
6598 }
6599
6600 // Remove unused parts of the original loops.
6601 removeUnusedBlocksFromParent(OldControlBBs);
6602
6603 for (CanonicalLoopInfo *L : Loops)
6604 L->invalidate();
6605
6606#ifndef NDEBUG
6607 for (CanonicalLoopInfo *GenL : Result)
6608 GenL->assertOK();
6609#endif
6610 return Result;
6611}
6612
6613/// Attach metadata \p Properties to the basic block described by \p BB. If the
6614/// basic block already has metadata, the basic block properties are appended.
///
/// The properties end up as operands of an `llvm.loop` MDNode attached to the
/// block's terminator (see the `MD_loop` uses below).
/// NOTE(review): the declaration line between the doc comment and the
/// parameter list (original line 6615) appears to have been dropped by the
/// extraction -- verify against upstream before applying.
6616 ArrayRef<Metadata *> Properties) {
6617 // Nothing to do if no property to attach.
6618 if (Properties.empty())
6619 return;
6620
6621 LLVMContext &Ctx = BB->getContext();
6622 SmallVector<Metadata *> NewProperties;
 // Reserve slot 0 for the self-referencing loop-ID operand; it is patched in
 // after the distinct node is created below.
6623 NewProperties.push_back(nullptr);
6624
6625 // If the basic block already has metadata, prepend it to the new metadata.
6626 MDNode *Existing = BB->getTerminator()->getMetadata(LLVMContext::MD_loop);
6627 if (Existing)
 // drop_begin skips operand 0 of the existing node -- that is its
 // self-reference, not a property.
6628 append_range(NewProperties, drop_begin(Existing->operands(), 1))
6629
6630 append_range(NewProperties, Properties);
 // Loop metadata must be a distinct node whose first operand refers to the
 // node itself.
6631 MDNode *BasicBlockID = MDNode::getDistinct(Ctx, NewProperties);
6632 BasicBlockID->replaceOperandWith(0, BasicBlockID);
6633
6634 BB->getTerminator()->setMetadata(LLVMContext::MD_loop, BasicBlockID);
6635}
6636
6637/// Attach loop metadata \p Properties to the loop described by \p Loop. If the
6638/// loop already has metadata, the loop properties are appended.
/// NOTE(review): the declaration line (original line 6639) appears to have
/// been dropped by the extraction; verify against upstream.
6640 ArrayRef<Metadata *> Properties) {
6641 assert(Loop->isValid() && "Expecting a valid CanonicalLoopInfo");
6642
6643 // Attach metadata to the loop's latch
6644 BasicBlock *Latch = Loop->getLatch();
6645 assert(Latch && "A valid CanonicalLoopInfo must have a unique latch");
 // Delegate to the basic-block helper, which appends to any existing
 // llvm.loop metadata on the latch terminator.
6646 addBasicBlockMetadata(Latch, Properties);
6647}
6648
6649/// Attach llvm.access.group metadata to the memref instructions of \p Block
/// NOTE(review): the declaration line (original line 6650, carrying the
/// function name and the first parameters -- presumably the block and the
/// access-group node) was dropped by the extraction; verify upstream.
6651 LoopInfo &LI) {
 // Tag every instruction that may touch memory with the given access group so
 // later passes can reason about parallel accesses.
6652 for (Instruction &I : *Block) {
6653 if (I.mayReadOrWriteMemory()) {
6654 // TODO: This instruction may already have access group from
6655 // other pragmas e.g. #pragma clang loop vectorize. Append
6656 // so that the existing metadata is not overwritten.
6657 I.setMetadata(LLVMContext::MD_access_group, AccessGroup);
6658 }
6659 }
6660}
6661
/// Fuse the given canonical loops into a single canonical loop. The visible
/// code computes the signed maximum of the original trip counts, creates one
/// fused loop over that count, and guards each original body with a
/// `fusedIV < origTripCount` comparison so shorter loops stop contributing
/// iterations early. Returns the fused CanonicalLoopInfo; all input CLIs are
/// invalidated.
/// NOTE(review): several lines were dropped by the extraction (the
/// declaration at original line 6663, the loop header at 6670, and the
/// `condBBs` declaration at 6726); the code below is preserved byte-for-byte.
6662CanonicalLoopInfo *
6664 CanonicalLoopInfo *firstLoop = Loops.front();
6665 CanonicalLoopInfo *lastLoop = Loops.back();
6666 Function *F = firstLoop->getPreheader()->getParent();
6667
6668 // Loop control blocks that will become orphaned later
6669 SmallVector<BasicBlock *> oldControlBBs;
6671 Loop->collectControlBlocks(oldControlBBs);
6672
6673 // Collect original trip counts
6674 SmallVector<Value *> origTripCounts;
6675 for (CanonicalLoopInfo *L : Loops) {
6676 assert(L->isValid() && "All input loops must be valid canonical loops");
6677 origTripCounts.push_back(L->getTripCount());
6678 }
6679
6680 Builder.SetCurrentDebugLocation(DL);
6681
6682 // Compute max trip count.
6683 // The fused loop will be from 0 to max(origTripCounts)
6684 BasicBlock *TCBlock = BasicBlock::Create(F->getContext(), "omp.fuse.comp.tc",
6685 F, firstLoop->getHeader());
6686 Builder.SetInsertPoint(TCBlock);
6687 Value *fusedTripCount = nullptr;
 // Running signed max over all trip counts (icmp sgt + select).
6688 for (CanonicalLoopInfo *L : Loops) {
6689 assert(L->isValid() && "All loops to fuse must be valid canonical loops");
6690 Value *origTripCount = L->getTripCount();
6691 if (!fusedTripCount) {
6692 fusedTripCount = origTripCount;
6693 continue;
6694 }
6695 Value *condTP = Builder.CreateICmpSGT(fusedTripCount, origTripCount);
6696 fusedTripCount = Builder.CreateSelect(condTP, fusedTripCount, origTripCount,
6697 ".omp.fuse.tc");
6698 }
6699
6700 // Generate new loop
6701 CanonicalLoopInfo *fused =
6702 createLoopSkeleton(DL, fusedTripCount, F, firstLoop->getBody(),
6703 lastLoop->getLatch(), "fused");
6704
6705 // Replace original loops with the fused loop
6706 // Preheader and After are not considered inside the CLI.
6707 // These are used to compute the individual TCs of the loops
6708 // so they have to be put before the resulting fused loop.
6709 // Moving them up for readability.
6710 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6711 Loops[i]->getPreheader()->moveBefore(TCBlock);
6712 Loops[i]->getAfter()->moveBefore(TCBlock);
6713 }
6714 lastLoop->getPreheader()->moveBefore(TCBlock);
6715
 // Chain preheader -> after -> next preheader so any per-loop trip-count
 // computation still executes, then fall into the trip-count block.
6716 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6717 redirectTo(Loops[i]->getPreheader(), Loops[i]->getAfter(), DL);
6718 redirectTo(Loops[i]->getAfter(), Loops[i + 1]->getPreheader(), DL);
6719 }
6720 redirectTo(lastLoop->getPreheader(), TCBlock, DL);
6721 redirectTo(TCBlock, fused->getPreheader(), DL);
6722 redirectTo(fused->getAfter(), lastLoop->getAfter(), DL);
6723
6724 // Build the fused body
6725 // Create new Blocks with conditions that jump to the original loop bodies
6727 SmallVector<Value *> condValues;
6728 for (size_t i = 0; i < Loops.size(); ++i) {
6729 BasicBlock *condBlock = BasicBlock::Create(
6730 F->getContext(), "omp.fused.inner.cond", F, Loops[i]->getBody());
6731 Builder.SetInsertPoint(condBlock);
 // Guard: only run loop i's body while the fused IV is inside its original
 // iteration space.
6732 Value *condValue =
6733 Builder.CreateICmpSLT(fused->getIndVar(), origTripCounts[i]);
6734 condBBs.push_back(condBlock);
6735 condValues.push_back(condValue);
6736 }
6737 // Join the condition blocks with the bodies of the original loops
6738 redirectTo(fused->getBody(), condBBs[0], DL);
6739 for (size_t i = 0; i < Loops.size() - 1; ++i) {
6740 Builder.SetInsertPoint(condBBs[i]);
6741 Builder.CreateCondBr(condValues[i], Loops[i]->getBody(), condBBs[i + 1]);
6742 redirectAllPredecessorsTo(Loops[i]->getLatch(), condBBs[i + 1], DL);
6743 // Replace the IV with the fused IV
6744 Loops[i]->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6745 }
6746 // Last body jumps to the created end body block
6747 Builder.SetInsertPoint(condBBs.back());
6748 Builder.CreateCondBr(condValues.back(), lastLoop->getBody(),
6749 fused->getLatch());
6750 redirectAllPredecessorsTo(lastLoop->getLatch(), fused->getLatch(), DL);
6751 // Replace the IV with the fused IV
6752 lastLoop->getIndVar()->replaceAllUsesWith(fused->getIndVar());
6753
6754 // The loop latch must have only one predecessor. Currently it is branched to
6755 // from both the last condition block and the last loop body
6756 fused->getLatch()->splitBasicBlockBefore(fused->getLatch()->begin(),
6757 "omp.fused.pre_latch");
6758
6759 // Remove unused parts
6760 removeUnusedBlocksFromParent(oldControlBBs);
6761
6762 // Invalidate old CLIs
6763 for (CanonicalLoopInfo *L : Loops)
6764 L->invalidate();
6765
6766#ifndef NDEBUG
6767 fused->assertOK();
6768#endif
6769 return fused;
6770}
6771
// Marks the loop for full unrolling by attaching both
// "llvm.loop.unroll.enable" and "llvm.loop.unroll.full" metadata; the actual
// unrolling is left to the LoopUnrollPass.
// NOTE(review): the declaration line (7772/7774-style gap: original lines
// 6772 and 6774, presumably the signature and the addLoopMetadata call) were
// dropped by the extraction -- verify against upstream.
6773 LLVMContext &Ctx = Builder.getContext();
6775 Loop, {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6776 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.full"))});
6777}
6778
// Enables unrolling via "llvm.loop.unroll.enable" only, leaving the unroll
// factor to the optimizer's own heuristics.
// NOTE(review): original lines 6779 and 6781 (presumably the signature and
// the addLoopMetadata call) were dropped by the extraction -- verify upstream.
6780 LLVMContext &Ctx = Builder.getContext();
6782 Loop, {
6783 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
6784 });
6785}
6786
/// Version the loop body on \p IfCond: inside the (kept) canonical loop, each
/// iteration branches to either the original (SIMD) body or a clone of it, so
/// the CanonicalLoopInfo continues to describe a single loop.
/// NOTE(review): original lines 6818, 6820 (the BasicBlock::Create calls for
/// ThenBlock/ElseBlock) and 6835 (the NewBlocks declaration) were dropped by
/// the extraction; the remaining code is preserved byte-for-byte.
6787void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
6788 Value *IfCond, ValueToValueMapTy &VMap,
6789 LoopAnalysis &LIA, LoopInfo &LI, Loop *L,
6790 const Twine &NamePrefix) {
6791 Function *F = CanonicalLoop->getFunction();
6792
6793 // We can't do
6794 // if (cond) {
6795 // simd_loop;
6796 // } else {
6797 // non_simd_loop;
6798 // }
6799 // because then the CanonicalLoopInfo would only point to one of the loops:
6800 // leading to other constructs operating on the same loop to malfunction.
6801 // Instead generate
6802 // while (...) {
6803 // if (cond) {
6804 // simd_body;
6805 // } else {
6806 // not_simd_body;
6807 // }
6808 // }
6809 // At least for simple loops, LLVM seems able to hoist the if out of the loop
6810 // body at -O3
6811
6812 // Define where if branch should be inserted
6813 auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
6814
6815 // Create additional blocks for the if statement
6816 BasicBlock *Cond = SplitBeforeIt->getParent();
6817 llvm::LLVMContext &C = Cond->getContext();
6819 C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
6821 C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
6822
6823 // Create if condition branch.
6824 Builder.SetInsertPoint(SplitBeforeIt);
6825 Instruction *BrInstr =
6826 Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
6827 InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
6828 // Then block contains branch to omp loop body which needs to be vectorized
6829 spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
6830 ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
6831
6832 Builder.SetInsertPoint(ElseBlock);
6833
6834 // Clone loop for the else branch
6836
6837 SmallVector<BasicBlock *, 8> ExistingBlocks;
6838 ExistingBlocks.reserve(L->getNumBlocks() + 1);
6839 ExistingBlocks.push_back(ThenBlock);
6840 ExistingBlocks.append(L->block_begin(), L->block_end());
6841 // Cond is the block that has the if clause condition
6842 // LoopCond is omp_loop.cond
6843 // LoopHeader is omp_loop.header
6844 BasicBlock *LoopCond = Cond->getUniquePredecessor();
 // NOTE(review): LoopCond is dereferenced on the next line, but the assert
 // that checks it for null only comes after -- a null unique predecessor
 // would crash before the assert fires.
6845 BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
6846 assert(LoopCond && LoopHeader && "Invalid loop structure");
6847 for (BasicBlock *Block : ExistingBlocks) {
 // Skip the loop-control blocks; only the body blocks are cloned.
6848 if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
6849 Block == LoopHeader || Block == LoopCond || Block == Cond) {
6850 continue;
6851 }
6852 BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
6853
6854 // fix name not to be omp.if.then
6855 if (Block == ThenBlock)
6856 NewBB->setName(NamePrefix + ".if.else");
6857
6858 NewBB->moveBefore(CanonicalLoop->getExit());
6859 VMap[Block] = NewBB;
6860 NewBlocks.push_back(NewBB);
6861 }
6862 remapInstructionsInBlocks(NewBlocks, VMap);
6863 Builder.CreateBr(NewBlocks.front());
6864
6865 // The loop latch must have only one predecessor. Currently it is branched to
6866 // from both the 'then' and 'else' branches.
6867 L->getLoopLatch()->splitBasicBlockBefore(L->getLoopLatch()->begin(),
6868 NamePrefix + ".pre_latch");
6869
6870 // Ensure that the then block is added to the loop so we add the attributes in
6871 // the next step
6872 L->addBasicBlockToLoop(ThenBlock, LI);
6873}
6874
6875unsigned
// NOTE(review): the declaration line (original line 6876, carrying the
// function name and first parameter) was dropped by the extraction.
6877 const StringMap<bool> &Features) {
 // Returns what appears to be the target's default SIMD width in bits
 // (512 with AVX-512F, 256 with AVX, 128 baseline on x86/PPC/Wasm);
 // 0 signals an unknown target.
6878 if (TargetTriple.isX86()) {
6879 if (Features.lookup("avx512f"))
6880 return 512;
6881 else if (Features.lookup("avx"))
6882 return 256;
6883 return 128;
6884 }
6885 if (TargetTriple.isPPC())
6886 return 128;
6887 if (TargetTriple.isWasm())
6888 return 128;
6889 return 0;
6890}
6891
/// Apply OpenMP `simd` semantics to the canonical loop: emit alignment
/// assumptions for \p AlignedVars, version the body on \p IfCond, mark memory
/// accesses parallel (unless a finite safelen forbids it), and attach
/// vectorization metadata (enable flag plus width from simdlen/safelen).
/// NOTE(review): original lines 6892 (signature start), 6902 (the UIs
/// container declaration), 6911 (the analysis-manager declaration), 6941 (the
/// Reachable set declaration) and 6975 were dropped by the extraction; code
/// is preserved byte-for-byte.
6893 MapVector<Value *, Value *> AlignedVars,
6894 Value *IfCond, OrderKind Order,
6895 ConstantInt *Simdlen, ConstantInt *Safelen) {
6896 LLVMContext &Ctx = Builder.getContext();
6897
6898 Function *F = CanonicalLoop->getFunction();
6899
6900 // Blocks must have terminators.
6901 // FIXME: Don't run analyses on incomplete/invalid IR.
6903 for (BasicBlock &BB : *F)
6904 if (!BB.hasTerminator())
6905 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
6906
6907 // TODO: We should not rely on pass manager. Currently we use pass manager
6908 // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
6909 // object. We should have a method which returns all blocks between
6910 // CanonicalLoopInfo::getHeader() and CanonicalLoopInfo::getAfter()
6912 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
6913 FAM.registerPass([]() { return LoopAnalysis(); });
6914 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
6915
6916 LoopAnalysis LIA;
6917 LoopInfo &&LI = LIA.run(*F, FAM);
6918
 // Remove the temporary terminators inserted above.
6919 for (Instruction *I : UIs)
6920 I->eraseFromParent();
6921
6922 Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
6923 if (AlignedVars.size()) {
6924 InsertPointTy IP = Builder.saveIP();
6925 for (auto &AlignedItem : AlignedVars) {
6926 Value *AlignedPtr = AlignedItem.first;
6927 Value *Alignment = AlignedItem.second;
6928 Instruction *loadInst = dyn_cast<Instruction>(AlignedPtr);
 // NOTE(review): dyn_cast returns null when AlignedPtr is not an
 // Instruction (e.g. a function Argument); the next line would then
 // dereference null. Presumably callers guarantee an Instruction --
 // verify.
6929 Builder.SetInsertPoint(loadInst->getNextNode());
6930 Builder.CreateAlignmentAssumption(F->getDataLayout(), AlignedPtr,
6931 Alignment);
6932 }
6933 Builder.restoreIP(IP);
6934 }
6935
6936 if (IfCond) {
6937 ValueToValueMapTy VMap;
6938 createIfVersion(CanonicalLoop, IfCond, VMap, LIA, LI, L, "simd");
6939 }
6940
6942
6943 // Get the basic blocks from the loop in which memref instructions
6944 // can be found.
6945 // TODO: Generalize getting all blocks inside a CanonicalizeLoopInfo,
6946 // preferably without running any passes.
6947 for (BasicBlock *Block : L->getBlocks()) {
6948 if (Block == CanonicalLoop->getCond() ||
6949 Block == CanonicalLoop->getHeader())
6950 continue;
6951 Reachable.insert(Block);
6952 }
6953
6954 SmallVector<Metadata *> LoopMDList;
6955
6956 // In presence of finite 'safelen', it may be unsafe to mark all
6957 // the memory instructions parallel, because loop-carried
6958 // dependences of 'safelen' iterations are possible.
6959 // If clause order(concurrent) is specified then the memory instructions
6960 // are marked parallel even if 'safelen' is finite.
6961 if ((Safelen == nullptr) || (Order == OrderKind::OMP_ORDER_concurrent))
6962 applyParallelAccessesMetadata(CanonicalLoop, Ctx, L, LI, LoopMDList);
6963
6964 // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
6965 // versions so we can't add the loop attributes in that case.
6966 if (IfCond) {
6967 // we can still add llvm.loop.parallel_access
6968 addLoopMetadata(CanonicalLoop, LoopMDList);
6969 return;
6970 }
6971
6972 // Use the above access group metadata to create loop level
6973 // metadata, which should be distinct for each loop.
6974 ConstantAsMetadata *BoolConst =
6976 LoopMDList.push_back(MDNode::get(
6977 Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"), BoolConst}));
6978
6979 if (Simdlen || Safelen) {
6980 // If both simdlen and safelen clauses are specified, the value of the
6981 // simdlen parameter must be less than or equal to the value of the safelen
6982 // parameter. Therefore, use safelen only in the absence of simdlen.
6983 ConstantInt *VectorizeWidth = Simdlen == nullptr ? Safelen : Simdlen;
6984 LoopMDList.push_back(
6985 MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.width"),
6986 ConstantAsMetadata::get(VectorizeWidth)}));
6987 }
6988
6989 addLoopMetadata(CanonicalLoop, LoopMDList);
6990}
6991
6992/// Create the TargetMachine object to query the backend for optimization
6993/// preferences.
6994///
6995/// Ideally, this would be passed from the front-end to the OpenMPBuilder, but
6996/// e.g. Clang does not pass it to its CodeGen layer and creates it only when
6997/// needed for the LLVM pass pipline. We use some default options to avoid
6998/// having to pass too many settings from the frontend that probably do not
6999/// matter.
7000///
7001/// Currently, TargetMachine is only used sometimes by the unrollLoopPartial
7002/// method. If we are going to use TargetMachine for more purposes, especially
7003/// those that are sensitive to TargetOptions, RelocModel and CodeModel, it
7004/// might become be worth requiring front-ends to pass on their TargetMachine,
7005/// or at least cache it between methods. Note that while fontends such as Clang
7006/// have just a single main TargetMachine per translation unit, "target-cpu" and
7007/// "target-features" that determine the TargetMachine are per-function and can
7008/// be overrided using __attribute__((target("OPTIONS"))).
/// NOTE(review): original lines 7010 (parameter list), 7018 (presumably the
/// TargetRegistry lookup producing TheTarget) and 7022 (presumably the
/// TargetOptions) were dropped by the extraction.
7009static std::unique_ptr<TargetMachine>
7011 Module *M = F->getParent();
7012
 // CPU/feature strings come from the function's own attributes, so the
 // resulting TargetMachine reflects per-function target overrides.
7013 StringRef CPU = F->getFnAttribute("target-cpu").getValueAsString();
7014 StringRef Features = F->getFnAttribute("target-features").getValueAsString();
7015 const llvm::Triple &Triple = M->getTargetTriple();
7016
7017 std::string Error;
 // Returns an empty unique_ptr when the target is not registered.
7019 if (!TheTarget)
7020 return {};
7021
7023 return std::unique_ptr<TargetMachine>(TheTarget->createTargetMachine(
7024 Triple, CPU, Features, Options, /*RelocModel=*/std::nullopt,
7025 /*CodeModel=*/std::nullopt, OptLevel));
7026}
7027
7028/// Heuristically determine the best-performant unroll factor for \p CLI. This
7029/// depends on the target processor. We are re-using the same heuristics as the
7030/// LoopUnrollPass.
///
/// Returns 1 to signal that the loop should not be unrolled.
/// NOTE(review): many lines were dropped by the extraction (e.g. 7031
/// signature, 7036, 7041, 7046, 7060/7062/7066/7068 analysis declarations,
/// 7076, 7091-7092, 7096-7097, 7107-7108, 7113); code below is preserved
/// byte-for-byte.
7032 Function *F = CLI->getFunction();
7033
7034 // Assume the user requests the most aggressive unrolling, even if the rest of
7035 // the code is optimized using a lower setting.
7037 std::unique_ptr<TargetMachine> TM = createTargetMachine(F, OptLevel);
7038
7039 // Blocks must have terminators.
7040 // FIXME: Don't run analyses on incomplete/invalid IR.
7042 for (BasicBlock &BB : *F)
7043 if (!BB.hasTerminator())
7044 UIs.push_back(new UnreachableInst(F->getContext(), &BB));
7045
 // Register the minimal analysis set LoopUnrollPass's helpers need.
7047 FAM.registerPass([]() { return TargetLibraryAnalysis(); });
7048 FAM.registerPass([]() { return AssumptionAnalysis(); });
7049 FAM.registerPass([]() { return DominatorTreeAnalysis(); });
7050 FAM.registerPass([]() { return LoopAnalysis(); });
7051 FAM.registerPass([]() { return ScalarEvolutionAnalysis(); });
7052 FAM.registerPass([]() { return PassInstrumentationAnalysis(); });
7053 TargetIRAnalysis TIRA;
7054 if (TM)
7055 TIRA = TargetIRAnalysis(
7056 [&](const Function &F) { return TM->getTargetTransformInfo(F); });
7057 FAM.registerPass([&]() { return TIRA; });
7058
7059 TargetIRAnalysis::Result &&TTI = TIRA.run(*F, FAM);
7061 ScalarEvolution &&SE = SEA.run(*F, FAM);
7063 DominatorTree &&DT = DTA.run(*F, FAM);
7064 LoopAnalysis LIA;
7065 LoopInfo &&LI = LIA.run(*F, FAM);
7067 AssumptionCache &&AC = ACT.run(*F, FAM);
7069
 // Remove the temporary terminators inserted above.
7070 for (Instruction *I : UIs)
7071 I->eraseFromParent();
7072
7073 Loop *L = LI.getLoopFor(CLI->getHeader());
7074 assert(L && "Expecting CanonicalLoopInfo to be recognized as a loop");
7075
7077 L, SE, TTI,
7078 /*BlockFrequencyInfo=*/nullptr,
7079 /*ProfileSummaryInfo=*/nullptr, ORE, static_cast<int>(OptLevel),
7080 /*UserThreshold=*/std::nullopt,
7081 /*UserCount=*/std::nullopt,
7082 /*UserAllowPartial=*/true,
7083 /*UserAllowRuntime=*/true,
7084 /*UserUpperBound=*/std::nullopt,
7085 /*UserFullUnrollMaxCount=*/std::nullopt);
7086
7087 UP.Force = true;
7088
7089 // Account for additional optimizations taking place before the LoopUnrollPass
7090 // would unroll the loop.
7093
7094 // Use normal unroll factors even if the rest of the code is optimized for
7095 // size.
7098
7099 LLVM_DEBUG(dbgs() << "Unroll heuristic thresholds:\n"
7100 << " Threshold=" << UP.Threshold << "\n"
7101 << " PartialThreshold=" << UP.PartialThreshold << "\n"
7102 << " OptSizeThreshold=" << UP.OptSizeThreshold << "\n"
7103 << " PartialOptSizeThreshold="
7104 << UP.PartialOptSizeThreshold << "\n");
7105
7106 // Disable peeling.
7109 /*UserAllowPeeling=*/false,
7110 /*UserAllowProfileBasedPeeling=*/false,
7111 /*UnrollingSpecficValues=*/false);
7112
7114 CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
7115
7116 // Assume that reads and writes to stack variables can be eliminated by
7117 // Mem2Reg, SROA or LICM. That is, don't count them towards the loop body's
7118 // size.
7119 for (BasicBlock *BB : L->blocks()) {
7120 for (Instruction &I : *BB) {
7121 Value *Ptr;
7122 if (auto *Load = dyn_cast<LoadInst>(&I)) {
7123 Ptr = Load->getPointerOperand();
7124 } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
7125 Ptr = Store->getPointerOperand();
7126 } else
7127 continue;
7128
7129 Ptr = Ptr->stripPointerCasts();
7130
 // Only entry-block allocas are treated as eliminable stack slots.
7131 if (auto *Alloca = dyn_cast<AllocaInst>(Ptr)) {
7132 if (Alloca->getParent() == &F->getEntryBlock())
7133 EphValues.insert(&I);
7134 }
7135 }
7136 }
7137
7138 UnrollCostEstimator UCE(L, TTI, EphValues, UP.BEInsns);
7139
7140 // Loop is not unrollable if the loop contains certain instructions.
7141 if (!UCE.canUnroll()) {
7142 LLVM_DEBUG(dbgs() << "Loop not considered unrollable\n");
7143 return 1;
7144 }
7145
7146 LLVM_DEBUG(dbgs() << "Estimated loop size is " << UCE.getRolledLoopSize()
7147 << "\n");
7148
7149 // TODO: Determine trip count of \p CLI if constant, computeUnrollCount might
7150 // be able to use it.
7151 int TripCount = 0;
7152 int MaxTripCount = 0;
7153 bool MaxOrZero = false;
7154 unsigned TripMultiple = 0;
7155
7156 computeUnrollCount(L, TTI, DT, &LI, &AC, SE, EphValues, &ORE, TripCount,
7157 MaxTripCount, MaxOrZero, TripMultiple, UCE, UP, PP);
7158 unsigned Factor = UP.Count;
7159 LLVM_DEBUG(dbgs() << "Suggesting unroll factor of " << Factor << "\n");
7160
7161 // This function returns 1 to signal to not unroll a loop.
7162 if (Factor == 0)
7163 return 1;
7164 return Factor;
7165}
7166
/// Partially unroll \p Loop by \p Factor. With Factor == 0 a heuristic factor
/// is computed; Factor == 1 is a no-op. When \p UnrolledCLI is requested, the
/// loop is tiled by the factor and the inner tile loop is marked for full
/// unrolling; otherwise only unroll metadata is attached.
/// NOTE(review): original lines 7167 (signature start), 7183, 7195, 7222,
/// 7224 and 7227 were dropped by the extraction; code below is preserved
/// byte-for-byte.
7168 int32_t Factor,
7169 CanonicalLoopInfo **UnrolledCLI) {
7170 assert(Factor >= 0 && "Unroll factor must not be negative");
7171
7172 Function *F = Loop->getFunction();
7173 LLVMContext &Ctx = F->getContext();
7174
7175 // If the unrolled loop is not used for another loop-associated directive, it
7176 // is sufficient to add metadata for the LoopUnrollPass.
7177 if (!UnrolledCLI) {
7178 SmallVector<Metadata *, 2> LoopMetadata;
7179 LoopMetadata.push_back(
7180 MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")));
7181
7182 if (Factor >= 1) {
7184 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7185 LoopMetadata.push_back(MDNode::get(
7186 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst}));
7187 }
7188
7189 addLoopMetadata(Loop, LoopMetadata);
7190 return;
7191 }
7192
7193 // Heuristically determine the unroll factor.
7194 if (Factor == 0)
7196
7197 // No change required with unroll factor 1.
7198 if (Factor == 1) {
7199 *UnrolledCLI = Loop;
7200 return;
7201 }
7202
7203 assert(Factor >= 2 &&
7204 "unrolling only makes sense with a factor of 2 or larger");
7205
7206 Type *IndVarTy = Loop->getIndVarType();
7207
7208 // Apply partial unrolling by tiling the loop by the unroll-factor, then fully
7209 // unroll the inner loop.
7210 Value *FactorVal =
7211 ConstantInt::get(IndVarTy, APInt(IndVarTy->getIntegerBitWidth(), Factor,
7212 /*isSigned=*/false));
7213 std::vector<CanonicalLoopInfo *> LoopNest =
7214 tileLoops(DL, {Loop}, {FactorVal});
7215 assert(LoopNest.size() == 2 && "Expect 2 loops after tiling");
7216 *UnrolledCLI = LoopNest[0];
7217 CanonicalLoopInfo *InnerLoop = LoopNest[1];
7218
7219 // LoopUnrollPass can only fully unroll loops with constant trip count.
7220 // Unroll by the unroll factor with a fallback epilog for the remainder
7221 // iterations if necessary.
7223 ConstantInt::get(Type::getInt32Ty(Ctx), APInt(32, Factor)));
7225 InnerLoop,
7226 {MDNode::get(Ctx, MDString::get(Ctx, "llvm.loop.unroll.enable")),
7228 Ctx, {MDString::get(Ctx, "llvm.loop.unroll.count"), FactorConst})});
7229
7230#ifndef NDEBUG
7231 (*UnrolledCLI)->assertOK();
7232#endif
7233}
7234
/// Emit a call to __kmpc_copyprivate(ident, tid, BufSize, CpyBuf, CpyFn,
/// DidIt-load); used after a `single` region to broadcast copyprivate values.
/// NOTE(review): original lines 7235-7236 (return type and function name)
/// were dropped by the extraction; verify against upstream.
7237 llvm::Value *BufSize, llvm::Value *CpyBuf,
7238 llvm::Value *CpyFn, llvm::Value *DidIt) {
7239 if (!updateToLocation(Loc))
7240 return Loc.IP;
7241
7242 uint32_t SrcLocStrSize;
7243 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7244 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7245 Value *ThreadId = getOrCreateThreadID(Ident);
7246
 // The runtime expects the *value* of the DidIt flag, so load it here.
7247 llvm::Value *DidItLD = Builder.CreateLoad(Builder.getInt32Ty(), DidIt);
7248
7249 Value *Args[] = {Ident, ThreadId, BufSize, CpyBuf, CpyFn, DidItLD};
7250
7251 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_copyprivate);
7252 createRuntimeFunctionCall(Fn, Args);
7253
7254 return Builder.saveIP();
7255}
7256
/// Generate an OpenMP `single` region guarded by __kmpc_single /
/// __kmpc_end_single, emitting __kmpc_copyprivate calls for \p CPVars (which
/// implies a barrier) or an explicit barrier unless \p IsNowait.
/// NOTE(review): original lines 7257 (signature start), 7260 (presumably the
/// CPFuncs parameter), 7317 and 7323 were dropped by the extraction.
7258 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7259 FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef<llvm::Value *> CPVars,
7261
7262 if (!updateToLocation(Loc))
7263 return Loc.IP;
7264
7265 // If needed allocate and initialize `DidIt` with 0.
7266 // DidIt: flag variable: 1=single thread; 0=not single thread.
7267 llvm::Value *DidIt = nullptr;
7268 if (!CPVars.empty()) {
7269 DidIt = Builder.CreateAlloca(llvm::Type::getInt32Ty(Builder.getContext()));
7270 Builder.CreateStore(Builder.getInt32(0), DidIt);
7271 }
7272
7273 Directive OMPD = Directive::OMPD_single;
7274 uint32_t SrcLocStrSize;
7275 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7276 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7277 Value *ThreadId = getOrCreateThreadID(Ident);
7278 Value *Args[] = {Ident, ThreadId};
7279
7280 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_single);
7281 Instruction *EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7282
7283 Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_single);
7284 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7285
7286 auto FiniCBWrapper = [&](InsertPointTy IP) -> Error {
7287 if (Error Err = FiniCB(IP))
7288 return Err;
7289
7290 // The thread that executes the single region must set `DidIt` to 1.
7291 // This is used by __kmpc_copyprivate, to know if the caller is the
7292 // single thread or not.
7293 if (DidIt)
7294 Builder.CreateStore(Builder.getInt32(1), DidIt);
7295
7296 return Error::success();
7297 };
7298
7299 // generates the following:
7300 // if (__kmpc_single()) {
7301 // .... single region ...
7302 // __kmpc_end_single
7303 // }
7304 // __kmpc_copyprivate
7305 // __kmpc_barrier
7306
7307 InsertPointOrErrorTy AfterIP =
7308 EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCBWrapper,
7309 /*Conditional*/ true,
7310 /*hasFinalize*/ true);
7311 if (!AfterIP)
7312 return AfterIP.takeError();
7313
7314 if (DidIt) {
7315 for (size_t I = 0, E = CPVars.size(); I < E; ++I)
7316 // NOTE BufSize is currently unused, so just pass 0.
7318 /*BufSize=*/ConstantInt::get(Int64, 0), CPVars[I],
7319 CPFuncs[I], DidIt);
7320 // NOTE __kmpc_copyprivate already inserts a barrier
7321 } else if (!IsNowait) {
 // No copyprivate: emit an explicit barrier unless nowait was requested.
7322 InsertPointOrErrorTy AfterIP =
7324 omp::Directive::OMPD_unknown, /* ForceSimpleCall */ false,
7325 /* CheckCancelFlag */ false);
7326 if (!AfterIP)
7327 return AfterIP.takeError();
7328 }
7329 return Builder.saveIP();
7330}
7331
/// Generate an OpenMP `critical` region named \p CriticalName, guarded by
/// __kmpc_critical (or __kmpc_critical_with_hint when \p HintInst is given)
/// and __kmpc_end_critical.
/// NOTE(review): original line 7332 (return type and function name) was
/// dropped by the extraction; verify against upstream.
7333 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7334 FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
7335
7336 if (!updateToLocation(Loc))
7337 return Loc.IP;
7338
7339 Directive OMPD = Directive::OMPD_critical;
7340 uint32_t SrcLocStrSize;
7341 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7342 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7343 Value *ThreadId = getOrCreateThreadID(Ident);
7344 Value *LockVar = getOMPCriticalRegionLock(CriticalName);
7345 Value *Args[] = {Ident, ThreadId, LockVar};
7346
 // The entry call takes the same args as the exit call, plus the optional
 // hint appended below.
7347 SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
7348 Function *RTFn = nullptr;
7349 if (HintInst) {
7350 // Add Hint to entry Args and create call
7351 EnterArgs.push_back(HintInst);
7352 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
7353 } else {
7354 RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
7355 }
7356 Instruction *EntryCall = createRuntimeFunctionCall(RTFn, EnterArgs);
7357
7358 Function *ExitRTLFn =
7359 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
7360 Instruction *ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7361
7362 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7363 /*Conditional*/ false, /*hasFinalize*/ true);
7364}
7365
/// Emit an `ordered depend(source)`/`depend(sink)` construct: store the
/// \p StoreValues iteration vector (i64 each) into a stack array and pass its
/// base to __kmpc_doacross_post (source) or __kmpc_doacross_wait (sink).
/// NOTE(review): original lines 7366-7367 (return type and function name) and
/// 7384 were dropped by the extraction; verify against upstream.
7368 InsertPointTy AllocaIP, unsigned NumLoops,
7369 ArrayRef<llvm::Value *> StoreValues,
7370 const Twine &Name, bool IsDependSource) {
7371 assert(
7372 llvm::all_of(StoreValues,
7373 [](Value *SV) { return SV->getType()->isIntegerTy(64); }) &&
7374 "OpenMP runtime requires depend vec with i64 type");
7375
7376 if (!updateToLocation(Loc))
7377 return Loc.IP;
7378
7379 // Allocate space for vector and generate alloc instruction.
7380 auto *ArrI64Ty = ArrayType::get(Int64, NumLoops);
7381 Builder.restoreIP(AllocaIP);
7382 AllocaInst *ArgsBase = Builder.CreateAlloca(ArrI64Ty, nullptr, Name);
7383 ArgsBase->setAlignment(Align(8));
7385
7386 // Store the index value with offset in depend vector.
7387 for (unsigned I = 0; I < NumLoops; ++I) {
7388 Value *DependAddrGEPIter = Builder.CreateInBoundsGEP(
7389 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(I)});
7390 StoreInst *STInst = Builder.CreateStore(StoreValues[I], DependAddrGEPIter);
7391 STInst->setAlignment(Align(8));
7392 }
7393
 // Pointer to element 0 -- the runtime receives the vector by base address.
7394 Value *DependBaseAddrGEP = Builder.CreateInBoundsGEP(
7395 ArrI64Ty, ArgsBase, {Builder.getInt64(0), Builder.getInt64(0)});
7396
7397 uint32_t SrcLocStrSize;
7398 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7399 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7400 Value *ThreadId = getOrCreateThreadID(Ident);
7401 Value *Args[] = {Ident, ThreadId, DependBaseAddrGEP};
7402
7403 Function *RTLFn = nullptr;
7404 if (IsDependSource)
7405 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_post);
7406 else
7407 RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_doacross_wait);
7408 createRuntimeFunctionCall(RTLFn, Args);
7409
7410 return Builder.saveIP();
7411}
7412
/// Generate an `ordered` region. With \p IsThreads the body is bracketed by
/// __kmpc_ordered / __kmpc_end_ordered; otherwise (simd) no runtime calls are
/// emitted and the body is generated as-is.
/// NOTE(review): original line 7413 (return type and function name) was
/// dropped by the extraction; verify against upstream.
7414 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
7415 FinalizeCallbackTy FiniCB, bool IsThreads) {
7416 if (!updateToLocation(Loc))
7417 return Loc.IP;
7418
7419 Directive OMPD = Directive::OMPD_ordered;
 // Null entry/exit calls are passed through to EmitOMPInlinedRegion for the
 // simd (non-threads) case.
7420 Instruction *EntryCall = nullptr;
7421 Instruction *ExitCall = nullptr;
7422
7423 if (IsThreads) {
7424 uint32_t SrcLocStrSize;
7425 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7426 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7427 Value *ThreadId = getOrCreateThreadID(Ident);
7428 Value *Args[] = {Ident, ThreadId};
7429
7430 Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_ordered);
7431 EntryCall = createRuntimeFunctionCall(EntryRTLFn, Args);
7432
7433 Function *ExitRTLFn =
7434 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_ordered);
7435 ExitCall = createRuntimeFunctionCall(ExitRTLFn, Args);
7436 }
7437
7438 return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
7439 /*Conditional*/ false, /*hasFinalize*/ true);
7440}
7441
/// Common helper for inlined OpenMP regions: splits the current block into
/// entry / finalize / exit parts, emits the entry call (optionally as a
/// conditional guard), runs \p BodyGenCB, then emits the exit call and any
/// registered finalization.
/// NOTE(review): original lines 7454 and 7485 were dropped by the extraction
/// -- presumably the guard conditions around creating and erasing the
/// temporary SplitPos terminator; verify against upstream.
7442OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::EmitOMPInlinedRegion(
7443 Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
7444 BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
7445 bool HasFinalize, bool IsCancellable) {
7446
 // Register the finalization callback so cancellation points inside the
 // region can run it.
7447 if (HasFinalize)
7448 FinalizationStack.push_back({FiniCB, OMPD, IsCancellable});
7449
7450 // Create inlined region's entry and body blocks, in preparation
7451 // for conditional creation
7452 BasicBlock *EntryBB = Builder.GetInsertBlock();
7453 Instruction *SplitPos = EntryBB->getTerminatorOrNull();
7455 SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
7456 BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
7457 BasicBlock *FiniBB =
7458 EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
7459
7460 Builder.SetInsertPoint(EntryBB->getTerminator());
7461 emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
7462
7463 // generate body
7464 if (Error Err = BodyGenCB(/* AllocaIP */ InsertPointTy(),
7465 /* CodeGenIP */ Builder.saveIP()))
7466 return Err;
7467
7468 // emit exit call and do any needed finalization.
7469 auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
7470 assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
7471 FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
7472 "Unexpected control flow graph state!!");
7473 InsertPointOrErrorTy AfterIP =
7474 emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
7475 if (!AfterIP)
7476 return AfterIP.takeError();
7477
7478 // If we are skipping the region of a non conditional, remove the exit
7479 // block, and clear the builder's insertion point.
7480 assert(SplitPos->getParent() == ExitBB &&
7481 "Unexpected Insertion point location!");
7482 auto merged = MergeBlockIntoPredecessor(ExitBB);
7483 BasicBlock *ExitPredBB = SplitPos->getParent();
7484 auto InsertBB = merged ? ExitPredBB : ExitBB;
7486 SplitPos->eraseFromParent();
7487 Builder.SetInsertPoint(InsertBB);
7488
7489 return Builder.saveIP();
7490}
7491
// Emit the conditional entry of a directive region: when Conditional is set
// and an entry runtime call exists, branch into the region body only if the
// call's result is non-zero, otherwise jump straight to ExitBB. The delicate
// part is relocating EntryBB's original terminator into the new ThenBB while
// keeping the builder positioned for body generation -- statement order here
// is load-bearing, so the code is left untouched.
7492OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
7493 Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
7494 // if nothing to do, Return current insertion point.
7495 if (!Conditional || !EntryCall)
7496 return Builder.saveIP();
7497
7498 BasicBlock *EntryBB = Builder.GetInsertBlock();
// Region is entered iff the runtime entry call returned non-zero.
7499 Value *CallBool = Builder.CreateIsNotNull(EntryCall);
// ThenBB starts detached with a placeholder terminator; the real terminator
// is spliced in below.
7500 auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
7501 auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
7502
7503 // Emit thenBB and set the Builder's insertion point there for
7504 // body generation next. Place the block after the current block.
7505 Function *CurFn = EntryBB->getParent();
7506 CurFn->insert(std::next(EntryBB->getIterator()), ThenBB);
7507
7508 // Move Entry branch to end of ThenBB, and replace with conditional
7509 // branch (If-stmt)
7510 Instruction *EntryBBTI = EntryBB->getTerminator();
7511 Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
7512 EntryBBTI->removeFromParent();
7513 Builder.SetInsertPoint(UI);
7514 Builder.Insert(EntryBBTI);
// The placeholder unreachable is no longer needed once the original
// terminator has been re-inserted before it.
7515 UI->eraseFromParent();
7516 Builder.SetInsertPoint(ThenBB->getTerminator());
7517
7518 // return an insertion point to ExitBB.
7519 return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
7520}
7521
// Emit the exit of a directive region: run the finalization callback that
// was pushed in EmitOMPInlinedRegion (when HasFinalize), then move the
// pre-created exit runtime call so it becomes the last instruction before the
// finalization block's terminator. Returns the insert point at the relocated
// exit call (or the current point if there is no exit call).
7522OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitCommonDirectiveExit(
7523 omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
7524 bool HasFinalize) {
7525
7526 Builder.restoreIP(FinIP);
7527
7528 // If there is finalization to do, emit it before the exit call
7529 if (HasFinalize) {
7530 assert(!FinalizationStack.empty() &&
7531 "Unexpected finalization stack state!");
7532
// Pop our own entry; it must match the directive being closed.
7533 FinalizationInfo Fi = FinalizationStack.pop_back_val();
7534 assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
7535
7536 if (Error Err = Fi.mergeFiniBB(Builder, FinIP.getBlock()))
7537 return std::move(Err);
7538
7539 // Exit condition: insertion point is before the terminator of the new Fini
7540 // block
7541 Builder.SetInsertPoint(FinIP.getBlock()->getTerminator());
7542 }
7543
7544 if (!ExitCall)
7545 return Builder.saveIP();
7546
7547 // place the Exitcall as last instruction before Finalization block terminator
7548 ExitCall->removeFromParent();
7549 Builder.Insert(ExitCall);
7550
7551 return IRBuilder<>::InsertPoint(ExitCall->getParent(),
7552 ExitCall->getIterator());
7553}
7554
// Build the CFG skeleton for an OpenMP `copyin` clause: compare the master
// and private addresses and, only when they differ, fall into the
// "copyin.not.master" block where the caller emits the actual copy code.
// Returns the insert point inside the copy block (or before its branch to
// the end block when BranchtoEnd is set).
// NOTE(review): the signature line (original 7555, presumably
// OpenMPIRBuilder::createCopyinClauseBlocks), line 7561, and the guard on
// line 7583 (likely `if (OMP_Entry->getTerminator()) {`, matching the
// dangling `} else {` below) are missing from this excerpt.
7556 InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
7557 llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
7558 if (!IP.isSet())
7559 return IP;
7560
7562
7563 // creates the following CFG structure
7564 // OMP_Entry : (MasterAddr != PrivateAddr)?
7565 // F T
7566 // | \
7567 // | copin.not.master
7568 // | /
7569 // v /
7570 // copyin.not.master.end
7571 // |
7572 // v
7573 // OMP.Entry.Next
7574
7575 BasicBlock *OMP_Entry = IP.getBlock();
7576 Function *CurFn = OMP_Entry->getParent();
7577 BasicBlock *CopyBegin =
7578 BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
7579 BasicBlock *CopyEnd = nullptr;
7580
7581 // If entry block is terminated, split to preserve the branch to following
7582 // basic block (i.e. OMP.Entry.Next), otherwise, leave everything as is.
7584 CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
7585 "copyin.not.master.end");
7586 OMP_Entry->getTerminator()->eraseFromParent();
7587 } else {
7588 CopyEnd =
7589 BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
7590 }
7591
// Compare the two addresses as integers; equal addresses mean this thread
// is the master and the copy must be skipped.
7592 Builder.SetInsertPoint(OMP_Entry);
7593 Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
7594 Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
7595 Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
7596 Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
7597
7598 Builder.SetInsertPoint(CopyBegin);
7599 if (BranchtoEnd)
// Position the builder just before the branch so copy code lands in
// CopyBegin ahead of it.
7600 Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
7601
7602 return Builder.saveIP();
7603}
7604
// Emit a call to the __kmpc_alloc runtime entry: allocate Size bytes from
// the given OpenMP Allocator on the current thread, naming the result Name.
// NOTE(review): the signature line (original 7605, presumably
// OpenMPIRBuilder::createOMPAlloc) and lines 7608-7609 (likely the
// updateToLocation guard) are missing from this excerpt.
7606 Value *Size, Value *Allocator,
7607 std::string Name) {
7610
7611 uint32_t SrcLocStrSize;
7612 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7613 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// The ident is only used to derive the thread id; it is not passed to
// __kmpc_alloc itself.
7614 Value *ThreadId = getOrCreateThreadID(Ident);
7615 Value *Args[] = {ThreadId, Size, Allocator};
7616
7617 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
7618
7619 return createRuntimeFunctionCall(Fn, Args, Name);
7620}
7621
// Emit a call to the __kmpc_free runtime entry: release memory previously
// obtained from the given OpenMP Allocator.
// NOTE(review): the signature line (original 7622, presumably
// OpenMPIRBuilder::createOMPFree) and lines 7625-7626 (likely the
// updateToLocation guard) are missing from this excerpt.
7623 Value *Addr, Value *Allocator,
7624 std::string Name) {
7627
7628 uint32_t SrcLocStrSize;
7629 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7630 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7631 Value *ThreadId = getOrCreateThreadID(Ident);
7632 Value *Args[] = {ThreadId, Addr, Allocator};
7633 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
7634 return createRuntimeFunctionCall(Fn, Args, Name);
7635}
7636
// Emit a call to __tgt_interop_init for the OpenMP `interop init` construct.
// Missing Device defaults to -1 (current/default device); missing dependence
// info defaults to zero dependences with a null address array.
// NOTE(review): the signature line (original 7637) and lines 7641-7642
// (likely the updateToLocation guard) are missing from this excerpt.
7638 const LocationDescription &Loc, Value *InteropVar,
7639 omp::OMPInteropType InteropType, Value *Device, Value *NumDependences,
7640 Value *DependenceAddress, bool HaveNowaitClause) {
7643
7644 uint32_t SrcLocStrSize;
7645 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7646 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7647 Value *ThreadId = getOrCreateThreadID(Ident);
7648 if (Device == nullptr)
// -1 selects the default device per the libomptarget convention.
7649 Device = Constant::getAllOnesValue(Int32);
7650 Constant *InteropTypeVal = ConstantInt::get(Int32, (int)InteropType);
7651 if (NumDependences == nullptr) {
7652 NumDependences = ConstantInt::get(Int32, 0);
7653 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7654 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7655 }
7656 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7657 Value *Args[] = {
7658 Ident, ThreadId, InteropVar, InteropTypeVal,
7659 Device, NumDependences, DependenceAddress, HaveNowaitClauseVal};
7660
7661 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_init);
7662
7663 return createRuntimeFunctionCall(Fn, Args);
7664}
7665
// Emit a call to __tgt_interop_destroy for the OpenMP `interop destroy`
// construct. Defaulting rules for Device / dependences mirror
// createOMPInteropInit above.
// NOTE(review): the signature line (original 7666) and lines 7669-7670
// (likely the updateToLocation guard) are missing from this excerpt.
7667 const LocationDescription &Loc, Value *InteropVar, Value *Device,
7668 Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause) {
7671
7672 uint32_t SrcLocStrSize;
7673 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7674 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7675 Value *ThreadId = getOrCreateThreadID(Ident);
7676 if (Device == nullptr)
7677 Device = Constant::getAllOnesValue(Int32);
7678 if (NumDependences == nullptr) {
7679 NumDependences = ConstantInt::get(Int32, 0);
7680 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7681 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7682 }
7683 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7684 Value *Args[] = {
7685 Ident, ThreadId, InteropVar, Device,
7686 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7687
7688 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_destroy);
7689
7690 return createRuntimeFunctionCall(Fn, Args);
7691}
7692
// Emit a call to __tgt_interop_use for the OpenMP `interop use` construct.
// Defaulting rules for Device / dependences mirror createOMPInteropInit.
// NOTE(review): the signature line (original 7693) and lines 7698-7699
// (likely the updateToLocation guard) are missing from this excerpt.
7694 Value *InteropVar, Value *Device,
7695 Value *NumDependences,
7696 Value *DependenceAddress,
7697 bool HaveNowaitClause) {
7700 uint32_t SrcLocStrSize;
7701 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7702 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7703 Value *ThreadId = getOrCreateThreadID(Ident);
7704 if (Device == nullptr)
7705 Device = Constant::getAllOnesValue(Int32);
7706 if (NumDependences == nullptr) {
7707 NumDependences = ConstantInt::get(Int32, 0);
7708 PointerType *PointerTypeVar = PointerType::getUnqual(M.getContext());
7709 DependenceAddress = ConstantPointerNull::get(PointerTypeVar);
7710 }
7711 Value *HaveNowaitClauseVal = ConstantInt::get(Int32, HaveNowaitClause);
7712 Value *Args[] = {
7713 Ident, ThreadId, InteropVar, Device,
7714 NumDependences, DependenceAddress, HaveNowaitClauseVal};
7715
7716 Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___tgt_interop_use);
7717
7718 return createRuntimeFunctionCall(Fn, Args);
7719}
7720
// Emit a call to __kmpc_threadprivate_cached, materializing (and caching in
// an internal global named Name) a per-thread copy of the given variable.
// NOTE(review): the signature lines (original 7721-7722, presumably
// OpenMPIRBuilder::createCachedThreadPrivate) and lines 7724-7725 (likely
// the updateToLocation guard) are missing from this excerpt.
7723 llvm::ConstantInt *Size, const llvm::Twine &Name) {
7726
7727 uint32_t SrcLocStrSize;
7728 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7729 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
7730 Value *ThreadId = getOrCreateThreadID(Ident);
// The cache global holds the per-thread data pointers across calls.
7731 Constant *ThreadPrivateCache =
7732 getOrCreateInternalVariable(Int8PtrPtr, Name.str());
7733 llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
7734
7735 Function *Fn =
7736 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
7737
7738 return createRuntimeFunctionCall(Fn, Args);
7739}
7740
// Emit the device-side prologue of a target kernel: materialize the dynamic
// and kernel environment globals, record launch bounds in kernel metadata,
// call __kmpc_target_init, and split control flow so only threads for which
// the runtime returns -1 execute the user code (others jump to worker.exit
// and return).
// NOTE(review): this excerpt is missing several original lines (7741/7743
// signature fragments, 7801, and others), so some declarations referenced
// below (e.g. `Fn`, `DynamicEnvironment` the type) have their defining lines
// cut -- confirm against upstream OpenMPIRBuilder::createTargetInit.
7742 const LocationDescription &Loc,
7744 assert(!Attrs.MaxThreads.empty() && !Attrs.MaxTeams.empty() &&
7745 "expected num_threads and num_teams to be specified");
7746
7747 if (!updateToLocation(Loc))
7748 return Loc.IP;
7749
7750 uint32_t SrcLocStrSize;
7751 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
7752 Constant *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
// Configuration flags stored into the kernel environment below.
7753 Constant *IsSPMDVal = ConstantInt::getSigned(Int8, Attrs.ExecFlags);
7754 Constant *UseGenericStateMachineVal = ConstantInt::getSigned(
7755 Int8, Attrs.ExecFlags != omp::OMP_TGT_EXEC_MODE_SPMD);
7756 Constant *MayUseNestedParallelismVal = ConstantInt::getSigned(Int8, true);
7757 Constant *DebugIndentionLevelVal = ConstantInt::getSigned(Int16, 0);
7758
7759 Function *DebugKernelWrapper = Builder.GetInsertBlock()->getParent();
7760 Function *Kernel = DebugKernelWrapper;
7761
7762 // We need to strip the debug prefix to get the correct kernel name.
7763 StringRef KernelName = Kernel->getName();
7764 const std::string DebugPrefix = "_debug__";
7765 if (KernelName.ends_with(DebugPrefix)) {
7766 KernelName = KernelName.drop_back(DebugPrefix.length());
7767 Kernel = M.getFunction(KernelName);
7768 assert(Kernel && "Expected the real kernel to exist");
7769 }
7770
7771 // Manifest the launch configuration in the metadata matching the kernel
7772 // environment.
7773 if (Attrs.MinTeams > 1 || Attrs.MaxTeams.front() > 0)
7774 writeTeamsForKernel(T, *Kernel, Attrs.MinTeams, Attrs.MaxTeams.front());
7775
7776 // If MaxThreads not set, select the maximum between the default workgroup
7777 // size and the MinThreads value.
7778 int32_t MaxThreadsVal = Attrs.MaxThreads.front();
7779 if (MaxThreadsVal < 0) {
7780 if (hasGridValue(T)) {
7781 MaxThreadsVal =
7782 std::max(int32_t(getGridValue(T, Kernel).GV_Default_WG_Size),
7783 Attrs.MinThreads);
7784 } else {
7785 MaxThreadsVal = Attrs.MinThreads;
7786 }
7787 }
7788
7789 if (MaxThreadsVal > 0)
7790 writeThreadBoundsForKernel(T, *Kernel, Attrs.MinThreads, MaxThreadsVal);
7791
7792 Constant *MinThreads = ConstantInt::getSigned(Int32, Attrs.MinThreads);
7793 Constant *MaxThreads = ConstantInt::getSigned(Int32, MaxThreadsVal);
7794 Constant *MinTeams = ConstantInt::getSigned(Int32, Attrs.MinTeams);
7795 Constant *MaxTeams = ConstantInt::getSigned(Int32, Attrs.MaxTeams.front());
7796 Constant *ReductionDataSize =
7797 ConstantInt::getSigned(Int32, Attrs.ReductionDataSize);
7798 Constant *ReductionBufferLength =
7799 ConstantInt::getSigned(Int32, Attrs.ReductionBufferLength);
7800
7802 omp::RuntimeFunction::OMPRTL___kmpc_target_init);
7803 const DataLayout &DL = Fn->getDataLayout();
7804
// Per-kernel dynamic environment global (mutable; weak ODR so multiple TUs
// agree on one definition).
7805 Twine DynamicEnvironmentName = KernelName + "_dynamic_environment";
7806 Constant *DynamicEnvironmentInitializer =
7807 ConstantStruct::get(DynamicEnvironment, {DebugIndentionLevelVal});
7808 GlobalVariable *DynamicEnvironmentGV = new GlobalVariable(
7809 M, DynamicEnvironment, /*IsConstant=*/false, GlobalValue::WeakODRLinkage,
7810 DynamicEnvironmentInitializer, DynamicEnvironmentName,
7811 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7812 DL.getDefaultGlobalsAddressSpace());
7813 DynamicEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7814
// Cast only if the global's address space differs from the expected pointer
// type.
7815 Constant *DynamicEnvironment =
7816 DynamicEnvironmentGV->getType() == DynamicEnvironmentPtr
7817 ? DynamicEnvironmentGV
7818 : ConstantExpr::getAddrSpaceCast(DynamicEnvironmentGV,
7819 DynamicEnvironmentPtr);
7820
7821 Constant *ConfigurationEnvironmentInitializer = ConstantStruct::get(
7822 ConfigurationEnvironment, {
7823 UseGenericStateMachineVal,
7824 MayUseNestedParallelismVal,
7825 IsSPMDVal,
7826 MinThreads,
7827 MaxThreads,
7828 MinTeams,
7829 MaxTeams,
7830 ReductionDataSize,
7831 ReductionBufferLength,
7832 });
7833 Constant *KernelEnvironmentInitializer = ConstantStruct::get(
7834 KernelEnvironment, {
7835 ConfigurationEnvironmentInitializer,
7836 Ident,
7837 DynamicEnvironment,
7838 });
7839 std::string KernelEnvironmentName =
7840 (KernelName + "_kernel_environment").str();
7841 GlobalVariable *KernelEnvironmentGV = new GlobalVariable(
7842 M, KernelEnvironment, /*IsConstant=*/true, GlobalValue::WeakODRLinkage,
7843 KernelEnvironmentInitializer, KernelEnvironmentName,
7844 /*InsertBefore=*/nullptr, GlobalValue::NotThreadLocal,
7845 DL.getDefaultGlobalsAddressSpace());
7846 KernelEnvironmentGV->setVisibility(GlobalValue::ProtectedVisibility);
7847
7848 Constant *KernelEnvironment =
7849 KernelEnvironmentGV->getType() == KernelEnvironmentPtr
7850 ? KernelEnvironmentGV
7851 : ConstantExpr::getAddrSpaceCast(KernelEnvironmentGV,
7852 KernelEnvironmentPtr);
// The launch environment is passed as the wrapper's trailing argument.
7853 Value *KernelLaunchEnvironment =
7854 DebugKernelWrapper->getArg(DebugKernelWrapper->arg_size() - 1);
7855 Type *KernelLaunchEnvParamTy = Fn->getFunctionType()->getParamType(1);
7856 KernelLaunchEnvironment =
7857 KernelLaunchEnvironment->getType() == KernelLaunchEnvParamTy
7858 ? KernelLaunchEnvironment
7859 : Builder.CreateAddrSpaceCast(KernelLaunchEnvironment,
7860 KernelLaunchEnvParamTy);
7861 CallInst *ThreadKind = createRuntimeFunctionCall(
7862 Fn, {KernelEnvironment, KernelLaunchEnvironment});
7863
7864 Value *ExecUserCode = Builder.CreateICmpEQ(
7865 ThreadKind, Constant::getAllOnesValue(ThreadKind->getType()),
7866 "exec_user_code");
7867
7868 // ThreadKind = __kmpc_target_init(...)
7869 // if (ThreadKind == -1)
7870 // user_code
7871 // else
7872 // return;
7873
// Split the current block at a placeholder unreachable so the user-code
// entry gets its own block; the placeholder is erased once the conditional
// branch is in place.
7874 auto *UI = Builder.CreateUnreachable();
7875 BasicBlock *CheckBB = UI->getParent();
7876 BasicBlock *UserCodeEntryBB = CheckBB->splitBasicBlock(UI, "user_code.entry");
7877
7878 BasicBlock *WorkerExitBB = BasicBlock::Create(
7879 CheckBB->getContext(), "worker.exit", CheckBB->getParent());
7880 Builder.SetInsertPoint(WorkerExitBB);
7881 Builder.CreateRetVoid();
7882
7883 auto *CheckBBTI = CheckBB->getTerminator();
7884 Builder.SetInsertPoint(CheckBBTI);
7885 Builder.CreateCondBr(ExecUserCode, UI->getParent(), WorkerExitBB);
7886
7887 CheckBBTI->eraseFromParent();
7888 UI->eraseFromParent();
7889
7890 // Continue in the "user_code" block, see diagram above and in
7891 // openmp/libomptarget/deviceRTLs/common/include/target.h .
7892 return InsertPointTy(UserCodeEntryBB, UserCodeEntryBB->getFirstInsertionPt());
7893}
7894
// Emit the device-side epilogue of a target kernel: call
// __kmpc_target_deinit and, when teams-reduction sizes are known, patch the
// already-emitted kernel environment global's reduction fields in place.
// NOTE(review): the signature line (original 7895, presumably
// OpenMPIRBuilder::createTargetDeinit) and lines 7901/7904 (the deinit
// function lookup and its call) are missing from this excerpt.
7896 int32_t TeamsReductionDataSize,
7897 int32_t TeamsReductionBufferLength) {
7898 if (!updateToLocation(Loc))
7899 return;
7900
7902 omp::RuntimeFunction::OMPRTL___kmpc_target_deinit);
7903
7905
// Nothing to patch when either reduction dimension is zero.
7906 if (!TeamsReductionBufferLength || !TeamsReductionDataSize)
7907 return;
7908
7909 Function *Kernel = Builder.GetInsertBlock()->getParent();
7910 // We need to strip the debug prefix to get the correct kernel name.
7911 StringRef KernelName = Kernel->getName();
7912 const std::string DebugPrefix = "_debug__";
7913 if (KernelName.ends_with(DebugPrefix))
7914 KernelName = KernelName.drop_back(DebugPrefix.length());
7915 auto *KernelEnvironmentGV =
7916 M.getNamedGlobal((KernelName + "_kernel_environment").str());
7917 assert(KernelEnvironmentGV && "Expected kernel environment global\n");
7918 auto *KernelEnvironmentInitializer = KernelEnvironmentGV->getInitializer();
// Indices {0,7} and {0,8} address ReductionDataSize and
// ReductionBufferLength inside the nested configuration environment struct.
7919 auto *NewInitializer = ConstantFoldInsertValueInstruction(
7920 KernelEnvironmentInitializer,
7921 ConstantInt::get(Int32, TeamsReductionDataSize), {0, 7});
7922 NewInitializer = ConstantFoldInsertValueInstruction(
7923 NewInitializer, ConstantInt::get(Int32, TeamsReductionBufferLength),
7924 {0, 8});
7925 KernelEnvironmentGV->setInitializer(NewInitializer);
7926}
7927
7928static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value,
7929 bool Min) {
7930 if (Kernel.hasFnAttribute(Name)) {
7931 int32_t OldLimit = Kernel.getFnAttributeAsParsedInteger(Name);
7932 Value = Min ? std::min(OldLimit, Value) : std::max(OldLimit, Value);
7933 }
7934 Kernel.addFnAttr(Name, llvm::utostr(Value));
7935}
7936
// Read the per-kernel thread bounds {lower, upper} from function attributes:
// the OpenMP `omp_target_thread_limit` attribute combined with the
// target-specific launch-bound attribute (AMDGPU flat workgroup size or
// NVPTX maxntid). A bound of 0 means "unknown".
// NOTE(review): the signature line (original 7938, presumably
// OpenMPIRBuilder::readThreadBoundsForKernel(const Triple &, Function &))
// is missing from this excerpt.
7937std::pair<int32_t, int32_t>
7939 int32_t ThreadLimit =
7940 Kernel.getFnAttributeAsParsedInteger("omp_target_thread_limit");
7941
7942 if (T.isAMDGPU()) {
7943 const auto &Attr = Kernel.getFnAttribute("amdgpu-flat-work-group-size");
7944 if (!Attr.isValid() || !Attr.isStringAttribute())
7945 return {0, ThreadLimit};
// Attribute format is "<lb>,<ub>".
7946 auto [LBStr, UBStr] = Attr.getValueAsString().split(',');
7947 int32_t LB, UB;
7948 if (!llvm::to_integer(UBStr, UB, 10))
7949 return {0, ThreadLimit};
// The tighter of the OpenMP limit and the target attribute wins.
7950 UB = ThreadLimit ? std::min(ThreadLimit, UB) : UB;
7951 if (!llvm::to_integer(LBStr, LB, 10))
7952 return {0, UB};
7953 return {LB, UB};
7954 }
7955
7956 if (Kernel.hasFnAttribute(NVVMAttr::MaxNTID)) {
7957 int32_t UB = Kernel.getFnAttributeAsParsedInteger(NVVMAttr::MaxNTID);
7958 return {0, ThreadLimit ? std::min(ThreadLimit, UB) : UB};
7959 }
7960 return {0, ThreadLimit};
7961}
7962
// Record the per-kernel thread bounds [LB, UB] as function attributes: the
// portable `omp_target_thread_limit` plus the AMDGPU flat-work-group-size
// attribute on AMD targets.
// NOTE(review): the signature line (original 7963) and line 7974 (likely the
// NVPTX updateNVPTXAttr(...) call mirroring the AMDGPU branch) are missing
// from this excerpt.
7964 Function &Kernel, int32_t LB,
7965 int32_t UB) {
7966 Kernel.addFnAttr("omp_target_thread_limit", std::to_string(UB));
7967
7968 if (T.isAMDGPU()) {
7969 Kernel.addFnAttr("amdgpu-flat-work-group-size",
7970 llvm::utostr(LB) + "," + llvm::utostr(UB));
7971 return;
7972 }
7973
7975}
7976
// Read the per-kernel team bounds {lower, upper}; currently only the OpenMP
// `omp_target_num_teams` attribute is consulted (lower bound reported as 0).
// NOTE(review): the signature line (original 7978, presumably
// OpenMPIRBuilder::readTeamBoundsForKernel) is missing from this excerpt.
7977std::pair<int32_t, int32_t>
7979 // TODO: Read from backend annotations if available.
7980 return {0, Kernel.getFnAttributeAsParsedInteger("omp_target_num_teams")};
7981}
7982
// Record the per-kernel team bounds as function attributes: the portable
// `omp_target_num_teams` plus target-specific workgroup-count attributes.
// NOTE(review): the signature line (original 7983) and line 7987 (the body
// of the NVPTX branch, likely an updateNVPTXAttr call for the cluster/CTA
// bound) are missing from this excerpt.
7984 int32_t LB, int32_t UB) {
7985 if (T.isNVPTX())
7986 if (UB > 0)
7988 if (T.isAMDGPU())
// AMDGPU encodes only the X dimension; Y and Z are fixed at 1.
7989 Kernel.addFnAttr("amdgpu-max-num-workgroups", llvm::utostr(LB) + ",1,1");
7990
7991 Kernel.addFnAttr("omp_target_num_teams", std::to_string(LB));
7992}
7993
// Apply device-side attributes (linkage/visibility and the target-specific
// kernel calling convention) to an outlined target region function.
// NOTE(review): this excerpt is missing original lines 7997, 8000, 8002,
// 8004 and 8006 -- the statements that actually set linkage and the
// per-target calling conventions for AMDGCN/NVPTX/SPIR-V. Only the
// control-flow skeleton survives; confirm against upstream.
7994void OpenMPIRBuilder::setOutlinedTargetRegionFunctionAttributes(
7995 Function *OutlinedFn) {
7996 if (Config.isTargetDevice()) {
7998 // TODO: Determine if DSO local can be set to true.
7999 OutlinedFn->setDSOLocal(false);
8001 if (T.isAMDGCN())
8003 else if (T.isNVPTX())
8005 else if (T.isSPIRV())
8007 }
8008}
8009
8010Constant *OpenMPIRBuilder::createOutlinedFunctionID(Function *OutlinedFn,
8011 StringRef EntryFnIDName) {
8012 if (Config.isTargetDevice()) {
8013 assert(OutlinedFn && "The outlined function must exist if embedded");
8014 return OutlinedFn;
8015 }
8016
8017 return new GlobalVariable(
8018 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::WeakAnyLinkage,
8019 Constant::getNullValue(Builder.getInt8Ty()), EntryFnIDName);
8020}
8021
8022Constant *OpenMPIRBuilder::createTargetRegionEntryAddr(Function *OutlinedFn,
8023 StringRef EntryFnName) {
8024 if (OutlinedFn)
8025 return OutlinedFn;
8026
8027 assert(!M.getGlobalVariable(EntryFnName, true) &&
8028 "Named kernel already exists?");
8029 return new GlobalVariable(
8030 M, Builder.getInt8Ty(), /*isConstant=*/true, GlobalValue::InternalLinkage,
8031 Constant::getNullValue(Builder.getInt8Ty()), EntryFnName);
8032}
8033
// Generate (via the callback) and, when it is an offload entry, register an
// outlined target region function. On failure propagates the callback's
// error; otherwise leaves the function in OutlinedFn and its ID constant in
// OutlinedFnID.
// NOTE(review): the signature line (original 8034, presumably
// `Error OpenMPIRBuilder::emitTargetRegionFunction(`) is missing from this
// excerpt.
8035 TargetRegionEntryInfo &EntryInfo,
8036 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
8037 Function *&OutlinedFn, Constant *&OutlinedFnID) {
8038
8039 SmallString<64> EntryFnName;
8040 OffloadInfoManager.getTargetRegionEntryFnName(EntryFnName, EntryInfo);
8041
// Host with mandatory offload skips generating a host fallback body.
8042 if (Config.isTargetDevice() || !Config.openMPOffloadMandatory()) {
8043 Expected<Function *> CBResult = GenerateFunctionCallback(EntryFnName);
8044 if (!CBResult)
8045 return CBResult.takeError();
8046 OutlinedFn = *CBResult;
8047 } else {
8048 OutlinedFn = nullptr;
8049 }
8050
8051 // If this target outline function is not an offload entry, we don't need to
8052 // register it. This may be in the case of a false if clause, or if there are
8053 // no OpenMP targets.
8054 if (!IsOffloadEntry)
8055 return Error::success();
8056
// On the device the entry name itself is the ID; on the host a separate
// "<name>.region_id" symbol is used.
8057 std::string EntryFnIDName =
8058 Config.isTargetDevice()
8059 ? std::string(EntryFnName)
8060 : createPlatformSpecificName({EntryFnName, "region_id"});
8061
8062 OutlinedFnID = registerTargetRegionFunction(EntryInfo, OutlinedFn,
8063 EntryFnName, EntryFnIDName);
8064 return Error::success();
8065}
8066
// Register an outlined target region with the offload info manager and
// return its ID constant. Applies device attributes to the function first
// when one exists.
// NOTE(review): the signature line (original 8067) and line 8076 (likely the
// entry-kind flag argument to registerTargetRegionEntryInfo) are missing
// from this excerpt.
8068 TargetRegionEntryInfo &EntryInfo, Function *OutlinedFn,
8069 StringRef EntryFnName, StringRef EntryFnIDName) {
8070 if (OutlinedFn)
8071 setOutlinedTargetRegionFunctionAttributes(OutlinedFn);
8072 auto OutlinedFnID = createOutlinedFunctionID(OutlinedFn, EntryFnIDName);
8073 auto EntryAddr = createTargetRegionEntryAddr(OutlinedFn, EntryFnName);
8074 OffloadInfoManager.registerTargetRegionEntryInfo(
8075 EntryInfo, EntryAddr, OutlinedFnID,
8077 return OutlinedFnID;
8078}
8079
// Emit an OpenMP target-data region (`target data`, or the standalone
// `target enter/exit data`/`update` when BodyGenCB is null). Emits the
// offloading argument arrays, the begin/end mapper runtime calls (guarded by
// IfCond when present), and invokes the body callback up to three ways to
// support device-pointer privatization (Priv / NoPriv / DupNoPriv).
// NOTE(review): many original lines are missing from this excerpt (8080
// signature, 8085, 8108, 8131, 8134-8136, 8139, ...), including the
// standalone begin/end mapper function selection and parts of the nowait
// task arguments -- confirm against upstream OpenMPIRBuilder::createTargetData.
8081 const LocationDescription &Loc, InsertPointTy AllocaIP,
8082 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
8083 TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB,
8084 CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc,
8086 BodyGenTy BodyGenType)>
8087 BodyGenCB,
8088 function_ref<void(unsigned int, Value *)> DeviceAddrCB, Value *SrcLocInfo) {
8089 if (!updateToLocation(Loc))
8090 return InsertPointTy();
8091
8092 Builder.restoreIP(CodeGenIP);
8093
// Standalone constructs (enter/exit data, update) have no body callback.
8094 bool IsStandAlone = !BodyGenCB;
8095 MapInfosTy *MapInfo;
8096 // Generate the code for the opening of the data environment. Capture all the
8097 // arguments of the runtime call by reference because they are used in the
8098 // closing of the region.
8099 auto BeginThenGen = [&](InsertPointTy AllocaIP,
8100 InsertPointTy CodeGenIP) -> Error {
8101 MapInfo = &GenMapInfoCB(Builder.saveIP());
8102 if (Error Err = emitOffloadingArrays(
8103 AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB,
8104 /*IsNonContiguous=*/true, DeviceAddrCB))
8105 return Err;
8106
8107 TargetDataRTArgs RTArgs;
8109
8110 // Emit the number of elements in the offloading arrays.
8111 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8112
8113 // Source location for the ident struct
8114 if (!SrcLocInfo) {
8115 uint32_t SrcLocStrSize;
8116 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8117 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8118 }
8119
8120 SmallVector<llvm::Value *, 13> OffloadingArgs = {
8121 SrcLocInfo, DeviceID,
8122 PointerNum, RTArgs.BasePointersArray,
8123 RTArgs.PointersArray, RTArgs.SizesArray,
8124 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8125 RTArgs.MappersArray};
8126
8127 if (IsStandAlone) {
8128 assert(MapperFunc && "MapperFunc missing for standalone target data");
8129
// Standalone path: the runtime call may need to be wrapped in a target
// task when `nowait` is present.
8130 auto TaskBodyCB = [&](Value *, Value *,
8132 if (Info.HasNoWait) {
8133 OffloadingArgs.append({llvm::Constant::getNullValue(Int32),
8137 }
8138
8140 OffloadingArgs);
8141
8142 if (Info.HasNoWait) {
8143 BasicBlock *OffloadContBlock =
8144 BasicBlock::Create(Builder.getContext(), "omp_offload.cont");
8145 Function *CurFn = Builder.GetInsertBlock()->getParent();
8146 emitBlock(OffloadContBlock, CurFn, /*IsFinished=*/true);
8147 Builder.restoreIP(Builder.saveIP());
8148 }
8149 return Error::success();
8150 };
8151
8152 bool RequiresOuterTargetTask = Info.HasNoWait;
8153 if (!RequiresOuterTargetTask)
8154 cantFail(TaskBodyCB(/*DeviceID=*/nullptr, /*RTLoc=*/nullptr,
8155 /*TargetTaskAllocaIP=*/{}));
8156 else
8157 cantFail(emitTargetTask(TaskBodyCB, DeviceID, SrcLocInfo, AllocaIP,
8158 /*Dependencies=*/{}, RTArgs, Info.HasNoWait));
8159 } else {
8160 Function *BeginMapperFunc = getOrCreateRuntimeFunctionPtr(
8161 omp::OMPRTL___tgt_target_data_begin_mapper);
8162
8163 createRuntimeFunctionCall(BeginMapperFunc, OffloadingArgs);
8164
// Propagate runtime-produced device pointers into their local allocas so
// the region body sees the privatized values.
8165 for (auto DeviceMap : Info.DevicePtrInfoMap) {
8166 if (isa<AllocaInst>(DeviceMap.second.second)) {
8167 auto *LI =
8168 Builder.CreateLoad(Builder.getPtrTy(), DeviceMap.second.first);
8169 Builder.CreateStore(LI, DeviceMap.second.second);
8170 }
8171 }
8172
8173 // If device pointer privatization is required, emit the body of the
8174 // region here. It will have to be duplicated: with and without
8175 // privatization.
8176 InsertPointOrErrorTy AfterIP =
8177 BodyGenCB(Builder.saveIP(), BodyGenTy::Priv);
8178 if (!AfterIP)
8179 return AfterIP.takeError();
8180 Builder.restoreIP(*AfterIP);
8181 }
8182 return Error::success();
8183 };
8184
8185 // If we need device pointer privatization, we need to emit the body of the
8186 // region with no privatization in the 'else' branch of the conditional.
8187 // Otherwise, we don't have to do anything.
8188 auto BeginElseGen = [&](InsertPointTy AllocaIP,
8189 InsertPointTy CodeGenIP) -> Error {
8190 InsertPointOrErrorTy AfterIP =
8191 BodyGenCB(Builder.saveIP(), BodyGenTy::DupNoPriv);
8192 if (!AfterIP)
8193 return AfterIP.takeError();
8194 Builder.restoreIP(*AfterIP);
8195 return Error::success();
8196 };
8197
8198 // Generate code for the closing of the data region.
8199 auto EndThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8200 TargetDataRTArgs RTArgs;
8201 Info.EmitDebug = !MapInfo->Names.empty();
// ForEndCall=true re-emits the argument arrays suitable for the end call.
8202 emitOffloadingArraysArgument(Builder, RTArgs, Info, /*ForEndCall=*/true);
8203
8204 // Emit the number of elements in the offloading arrays.
8205 Value *PointerNum = Builder.getInt32(Info.NumberOfPtrs);
8206
8207 // Source location for the ident struct
8208 if (!SrcLocInfo) {
8209 uint32_t SrcLocStrSize;
8210 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
8211 SrcLocInfo = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
8212 }
8213
8214 Value *OffloadingArgs[] = {SrcLocInfo, DeviceID,
8215 PointerNum, RTArgs.BasePointersArray,
8216 RTArgs.PointersArray, RTArgs.SizesArray,
8217 RTArgs.MapTypesArray, RTArgs.MapNamesArray,
8218 RTArgs.MappersArray};
8219 Function *EndMapperFunc =
8220 getOrCreateRuntimeFunctionPtr(omp::OMPRTL___tgt_target_data_end_mapper);
8221
8222 createRuntimeFunctionCall(EndMapperFunc, OffloadingArgs);
8223 return Error::success();
8224 };
8225
8226 // We don't have to do anything to close the region if the if clause evaluates
8227 // to false.
8228 auto EndElseGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) {
8229 return Error::success();
8230 };
8231
// Drive the begin/body/end sequence, honoring the if clause at each stage.
8232 Error Err = [&]() -> Error {
8233 if (BodyGenCB) {
8234 Error Err = [&]() {
8235 if (IfCond)
8236 return emitIfClause(IfCond, BeginThenGen, BeginElseGen, AllocaIP);
8237 return BeginThenGen(AllocaIP, Builder.saveIP());
8238 }();
8239
8240 if (Err)
8241 return Err;
8242
8243 // If we don't require privatization of device pointers, we emit the body
8244 // in between the runtime calls. This avoids duplicating the body code.
8245 InsertPointOrErrorTy AfterIP =
8246 BodyGenCB(Builder.saveIP(), BodyGenTy::NoPriv);
8247 if (!AfterIP)
8248 return AfterIP.takeError();
8249 restoreIPandDebugLoc(Builder, *AfterIP);
8250
8251 if (IfCond)
8252 return emitIfClause(IfCond, EndThenGen, EndElseGen, AllocaIP);
8253 return EndThenGen(AllocaIP, Builder.saveIP());
8254 }
8255 if (IfCond)
8256 return emitIfClause(IfCond, BeginThenGen, EndElseGen, AllocaIP);
8257 return BeginThenGen(AllocaIP, Builder.saveIP());
8258 }();
8259
8260 if (Err)
8261 return Err;
8262
8263 return Builder.saveIP();
8264}
8265
// Select the __kmpc_{for,distribute}_static_init_{4,4u,8,8u} runtime
// declaration matching the induction-variable width/signedness and whether a
// GPU distribute schedule is requested.
// NOTE(review): the signature lines (original 8266-8267, presumably
// OpenMPIRBuilder::createForStaticInitFunction(unsigned IVSize, bool
// IVSigned, ...)) are missing from this excerpt.
8268 bool IsGPUDistribute) {
8269 assert((IVSize == 32 || IVSize == 64) &&
8270 "IV size is not compatible with the omp runtime");
8271 RuntimeFunction Name;
8272 if (IsGPUDistribute)
8273 Name = IVSize == 32
8274 ? (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_4
8275 : omp::OMPRTL___kmpc_distribute_static_init_4u)
8276 : (IVSigned ? omp::OMPRTL___kmpc_distribute_static_init_8
8277 : omp::OMPRTL___kmpc_distribute_static_init_8u);
8278 else
8279 Name = IVSize == 32 ? (IVSigned ? omp::OMPRTL___kmpc_for_static_init_4
8280 : omp::OMPRTL___kmpc_for_static_init_4u)
8281 : (IVSigned ? omp::OMPRTL___kmpc_for_static_init_8
8282 : omp::OMPRTL___kmpc_for_static_init_8u);
8283
8284 return getOrCreateRuntimeFunction(M, Name);
8285}
8286
// Select the __kmpc_dispatch_init_{4,4u,8,8u} runtime declaration matching
// the induction-variable width and signedness.
// NOTE(review): the signature line (original 8287, presumably
// OpenMPIRBuilder::createDispatchInitFunction(unsigned IVSize, ...)) is
// missing from this excerpt.
8288 bool IVSigned) {
8289 assert((IVSize == 32 || IVSize == 64) &&
8290 "IV size is not compatible with the omp runtime");
8291 RuntimeFunction Name = IVSize == 32
8292 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_4
8293 : omp::OMPRTL___kmpc_dispatch_init_4u)
8294 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_init_8
8295 : omp::OMPRTL___kmpc_dispatch_init_8u);
8296
8297 return getOrCreateRuntimeFunction(M, Name);
8298}
8299
// Select the __kmpc_dispatch_next_{4,4u,8,8u} runtime declaration matching
// the induction-variable width and signedness.
// NOTE(review): the signature line (original 8300) is missing from this
// excerpt.
8301 bool IVSigned) {
8302 assert((IVSize == 32 || IVSize == 64) &&
8303 "IV size is not compatible with the omp runtime");
8304 RuntimeFunction Name = IVSize == 32
8305 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_4
8306 : omp::OMPRTL___kmpc_dispatch_next_4u)
8307 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_next_8
8308 : omp::OMPRTL___kmpc_dispatch_next_8u);
8309
8310 return getOrCreateRuntimeFunction(M, Name);
8311}
8312
// Select the __kmpc_dispatch_fini_{4,4u,8,8u} runtime declaration matching
// the induction-variable width and signedness.
// NOTE(review): the signature line (original 8313) is missing from this
// excerpt.
8314 bool IVSigned) {
8315 assert((IVSize == 32 || IVSize == 64) &&
8316 "IV size is not compatible with the omp runtime");
8317 RuntimeFunction Name = IVSize == 32
8318 ? (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_4
8319 : omp::OMPRTL___kmpc_dispatch_fini_4u)
8320 : (IVSigned ? omp::OMPRTL___kmpc_dispatch_fini_8
8321 : omp::OMPRTL___kmpc_dispatch_fini_8u);
8322
8323 return getOrCreateRuntimeFunction(M, Name);
8324}
8325
// Return the (width-independent) __kmpc_dispatch_deinit runtime declaration.
// NOTE(review): the signature line (original 8326) is missing from this
// excerpt.
8327 return getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_dispatch_deinit);
8328}
8329
// After outlining a target region, rewrite the debug records inside the new
// function: remap variable locations that now refer to the outlined
// function's arguments, recreate DILocalVariables with the correct argument
// numbers, and (on the device) synthesize parameter debug info for the
// implicit trailing `dyn_ptr` argument.
// NOTE(review): this excerpt is missing the signature line (original 8330),
// line 8338 (likely the RemappedVariables map declaration), line 8347 (the
// DILocalVariable::get / createAutoVariable call head), and line 8371 (an
// assert operand) -- confirm against upstream.
8331 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func,
8332 DenseMap<Value *, std::tuple<Value *, unsigned>> &ValueReplacementMap) {
8333
8334 DISubprogram *NewSP = Func->getSubprogram();
// Nothing to fix when the outlined function carries no debug info.
8335 if (!NewSP)
8336 return;
8337
8339
8340 auto GetUpdatedDIVariable = [&](DILocalVariable *OldVar, unsigned arg) {
8341 DILocalVariable *&NewVar = RemappedVariables[OldVar];
8342 // Only use cached variable if the arg number matches. This is important
8343 // so that DIVariable created for privatized variables are not discarded.
8344 if (NewVar && (arg == NewVar->getArg()))
8345 return NewVar;
8346
8348 Builder.getContext(), OldVar->getScope(), OldVar->getName(),
8349 OldVar->getFile(), OldVar->getLine(), OldVar->getType(), arg,
8350 OldVar->getFlags(), OldVar->getAlignInBits(), OldVar->getAnnotations());
8351 return NewVar;
8352 };
8353
8354 auto UpdateDebugRecord = [&](auto *DR) {
8355 DILocalVariable *OldVar = DR->getVariable();
8356 unsigned ArgNo = 0;
// Replace each remapped location operand; remember the argument number so
// the variable itself can be renumbered afterwards.
8357 for (auto Loc : DR->location_ops()) {
8358 auto Iter = ValueReplacementMap.find(Loc);
8359 if (Iter != ValueReplacementMap.end()) {
8360 DR->replaceVariableLocationOp(Loc, std::get<0>(Iter->second));
8361 ArgNo = std::get<1>(Iter->second) + 1;
8362 }
8363 }
8364 if (ArgNo != 0)
8365 DR->setVariable(GetUpdatedDIVariable(OldVar, ArgNo));
8366 };
8367
8368 // The location and scope of variable intrinsics and records still point to
8369 // the parent function of the target region. Update them.
8370 for (Instruction &I : instructions(Func)) {
8372 "Unexpected debug intrinsic");
8373 for (DbgVariableRecord &DVR : filterDbgVars(I.getDbgRecordRange()))
8374 UpdateDebugRecord(&DVR);
8375 }
8376 // An extra argument is passed to the device. Create the debug data for it.
8377 if (OMPBuilder.Config.isTargetDevice()) {
8378 DICompileUnit *CU = NewSP->getUnit();
8379 Module *M = Func->getParent();
8380 DIBuilder DB(*M, true, CU);
8381 DIType *VoidPtrTy =
8382 DB.createQualifiedType(dwarf::DW_TAG_pointer_type, nullptr);
8383 unsigned ArgNo = Func->arg_size();
8384 DILocalVariable *Var = DB.createParameterVariable(
8385 NewSP, "dyn_ptr", ArgNo, NewSP->getFile(), /*LineNo=*/0, VoidPtrTy,
8386 /*AlwaysPreserve=*/false, DINode::DIFlags::FlagArtificial);
8387 auto Loc = DILocation::get(Func->getContext(), 0, 0, NewSP, 0);
8388 Argument *LastArg = Func->getArg(Func->arg_size() - 1);
// Declare the implicit dyn_ptr at the top of the entry block.
8389 DB.insertDeclare(LastArg, Var, DB.createExpression(), Loc,
8390 &(*Func->begin()));
8391 }
8392}
8393
8395 if (Operator::getOpcode(V) == Instruction::AddrSpaceCast)
8396 return cast<Operator>(V)->getOperand(0);
8397 return V;
8398}
8399
8401 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
8403 StringRef FuncName, SmallVectorImpl<Value *> &Inputs,
8406 SmallVector<Type *> ParameterTypes;
8407 if (OMPBuilder.Config.isTargetDevice()) {
8408 // All parameters to target devices are passed as pointers
8409 // or i64. This assumes 64-bit address spaces/pointers.
8410 for (auto &Arg : Inputs)
8411 ParameterTypes.push_back(Arg->getType()->isPointerTy()
8412 ? Arg->getType()
8413 : Type::getInt64Ty(Builder.getContext()));
8414 } else {
8415 for (auto &Arg : Inputs)
8416 ParameterTypes.push_back(Arg->getType());
8417 }
8418
8419 // The implicit dyn_ptr argument is always the last parameter on both host
8420 // and device so the argument counts match without runtime manipulation.
8421 auto *PtrTy = PointerType::getUnqual(Builder.getContext());
8422 ParameterTypes.push_back(PtrTy);
8423
8424 auto BB = Builder.GetInsertBlock();
8425 auto M = BB->getModule();
8426 auto FuncType = FunctionType::get(Builder.getVoidTy(), ParameterTypes,
8427 /*isVarArg*/ false);
8428 auto Func =
8429 Function::Create(FuncType, GlobalValue::InternalLinkage, FuncName, M);
8430
8431 // Forward target-cpu and target-features function attributes from the
8432 // original function to the new outlined function.
8433 Function *ParentFn = Builder.GetInsertBlock()->getParent();
8434
8435 auto TargetCpuAttr = ParentFn->getFnAttribute("target-cpu");
8436 if (TargetCpuAttr.isStringAttribute())
8437 Func->addFnAttr(TargetCpuAttr);
8438
8439 auto TargetFeaturesAttr = ParentFn->getFnAttribute("target-features");
8440 if (TargetFeaturesAttr.isStringAttribute())
8441 Func->addFnAttr(TargetFeaturesAttr);
8442
8443 if (OMPBuilder.Config.isTargetDevice()) {
8444 Value *ExecMode =
8445 OMPBuilder.emitKernelExecutionMode(FuncName, DefaultAttrs.ExecFlags);
8446 OMPBuilder.emitUsed("llvm.compiler.used", {ExecMode});
8447 }
8448
8449 // Save insert point.
8450 IRBuilder<>::InsertPointGuard IPG(Builder);
8451 // We will generate the entries in the outlined function but the debug
8452 // location may still be pointing to the parent function. Reset it now.
8453 Builder.SetCurrentDebugLocation(llvm::DebugLoc());
8454
8455 // Generate the region into the function.
8456 BasicBlock *EntryBB = BasicBlock::Create(Builder.getContext(), "entry", Func);
8457 Builder.SetInsertPoint(EntryBB);
8458
8459 // Insert target init call in the device compilation pass.
8460 if (OMPBuilder.Config.isTargetDevice())
8461 Builder.restoreIP(OMPBuilder.createTargetInit(Builder, DefaultAttrs));
8462
8463 BasicBlock *UserCodeEntryBB = Builder.GetInsertBlock();
8464
8465 // As we embed the user code in the middle of our target region after we
8466 // generate entry code, we must move what allocas we can into the entry
8467 // block to avoid possible breaking optimisations for device
8468 if (OMPBuilder.Config.isTargetDevice())
8470
8471 // Insert target deinit call in the device compilation pass.
8472 BasicBlock *OutlinedBodyBB =
8473 splitBB(Builder, /*CreateBranch=*/true, "outlined.body");
8475 Builder.saveIP(),
8476 OpenMPIRBuilder::InsertPointTy(OutlinedBodyBB, OutlinedBodyBB->begin()));
8477 if (!AfterIP)
8478 return AfterIP.takeError();
8479 Builder.restoreIP(*AfterIP);
8480 if (OMPBuilder.Config.isTargetDevice())
8481 OMPBuilder.createTargetDeinit(Builder);
8482
8483 // Insert return instruction.
8484 Builder.CreateRetVoid();
8485
8486 // New Alloca IP at entry point of created device function.
8487 Builder.SetInsertPoint(EntryBB->getFirstNonPHIIt());
8488 auto AllocaIP = Builder.saveIP();
8489
8490 Builder.SetInsertPoint(UserCodeEntryBB->getFirstNonPHIOrDbg());
8491
8492 // Do not include the artificial dyn_ptr argument.
8493 const auto &ArgRange = make_range(Func->arg_begin(), Func->arg_end() - 1);
8494
8496
8497 auto ReplaceValue = [](Value *Input, Value *InputCopy, Function *Func) {
8498 // Things like GEP's can come in the form of Constants. Constants and
8499 // ConstantExpr's do not have access to the knowledge of what they're
8500 // contained in, so we must dig a little to find an instruction so we
8501 // can tell if they're used inside of the function we're outlining. We
8502 // also replace the original constant expression with a new instruction
8503 // equivalent; an instruction as it allows easy modification in the
8504 // following loop, as we can now know the constant (instruction) is
8505 // owned by our target function and replaceUsesOfWith can now be invoked
8506 // on it (cannot do this with constants it seems). A brand new one also
8507 // allows us to be cautious as it is perhaps possible the old expression
8508 // was used inside of the function but exists and is used externally
8509 // (unlikely by the nature of a Constant, but still).
8510 // NOTE: We cannot remove dead constants that have been rewritten to
8511 // instructions at this stage, we run the risk of breaking later lowering
8512 // by doing so as we could still be in the process of lowering the module
8513 // from MLIR to LLVM-IR and the MLIR lowering may still require the original
8514 // constants we have created rewritten versions of.
8515 if (auto *Const = dyn_cast<Constant>(Input))
8516 convertUsersOfConstantsToInstructions(Const, Func, false);
8517
8518 // Collect users before iterating over them to avoid invalidating the
8519 // iteration in case a user uses Input more than once (e.g. a call
8520 // instruction).
8521 SetVector<User *> Users(Input->users().begin(), Input->users().end());
8522 // Collect all the instructions
8524 if (auto *Instr = dyn_cast<Instruction>(User))
8525 if (Instr->getFunction() == Func)
8526 Instr->replaceUsesOfWith(Input, InputCopy);
8527 };
8528
8529 SmallVector<std::pair<Value *, Value *>> DeferredReplacement;
8530
8531 // Rewrite uses of input valus to parameters.
8532 for (auto InArg : zip(Inputs, ArgRange)) {
8533 Value *Input = std::get<0>(InArg);
8534 Argument &Arg = std::get<1>(InArg);
8535 Value *InputCopy = nullptr;
8536
8538 ArgAccessorFuncCB(Arg, Input, InputCopy, AllocaIP, Builder.saveIP());
8539 if (!AfterIP)
8540 return AfterIP.takeError();
8541 Builder.restoreIP(*AfterIP);
8542 ValueReplacementMap[Input] = std::make_tuple(InputCopy, Arg.getArgNo());
8543
8544 // In certain cases a Global may be set up for replacement, however, this
8545 // Global may be used in multiple arguments to the kernel, just segmented
8546 // apart, for example, if we have a global array, that is sectioned into
8547 // multiple mappings (technically not legal in OpenMP, but there is a case
8548 // in Fortran for Common Blocks where this is neccesary), we will end up
8549 // with GEP's into this array inside the kernel, that refer to the Global
8550 // but are technically separate arguments to the kernel for all intents and
8551 // purposes. If we have mapped a segment that requires a GEP into the 0-th
8552 // index, it will fold into an referal to the Global, if we then encounter
8553 // this folded GEP during replacement all of the references to the
8554 // Global in the kernel will be replaced with the argument we have generated
8555 // that corresponds to it, including any other GEP's that refer to the
8556 // Global that may be other arguments. This will invalidate all of the other
8557 // preceding mapped arguments that refer to the same global that may be
8558 // separate segments. To prevent this, we defer global processing until all
8559 // other processing has been performed.
8562 DeferredReplacement.push_back(std::make_pair(Input, InputCopy));
8563 continue;
8564 }
8565
8567 continue;
8568
8569 ReplaceValue(Input, InputCopy, Func);
8570 }
8571
8572 // Replace all of our deferred Input values, currently just Globals.
8573 for (auto Deferred : DeferredReplacement)
8574 ReplaceValue(std::get<0>(Deferred), std::get<1>(Deferred), Func);
8575
8576 FixupDebugInfoForOutlinedFunction(OMPBuilder, Builder, Func,
8577 ValueReplacementMap);
8578 return Func;
8579}
8580/// Given a task descriptor, TaskWithPrivates, return the pointer to the block
8581/// of pointers containing shared data between the parent task and the created
8582/// task.
8584 IRBuilderBase &Builder,
8585 Value *TaskWithPrivates,
8586 Type *TaskWithPrivatesTy) {
8587
8588 Type *TaskTy = OMPIRBuilder.Task;
8589 LLVMContext &Ctx = Builder.getContext();
8590 Value *TaskT =
8591 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 0);
8592 Value *Shareds = TaskT;
8593 // TaskWithPrivatesTy can be one of the following
8594 // 1. %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8595 // %struct.privates }
8596 // 2. %struct.kmp_task_ompbuilder_t ;; This is simply TaskTy
8597 //
8598 // In the former case, that is when TaskWithPrivatesTy != TaskTy,
8599 // its first member has to be the task descriptor. TaskTy is the type of the
8600 // task descriptor. TaskT is the pointer to the task descriptor. Loading the
8601 // first member of TaskT, gives us the pointer to shared data.
8602 if (TaskWithPrivatesTy != TaskTy)
8603 Shareds = Builder.CreateStructGEP(TaskTy, TaskT, 0);
8604 return Builder.CreateLoad(PointerType::getUnqual(Ctx), Shareds);
8605}
8606/// Create an entry point for a target task with the following.
8607/// It'll have the following signature
8608/// void @.omp_target_task_proxy_func(i32 %thread.id, ptr %task)
8609/// This function is called from emitTargetTask once the
8610/// code to launch the target kernel has been outlined already.
8611/// NumOffloadingArrays is the number of offloading arrays that we need to copy
8612/// into the task structure so that the deferred target task can access this
8613/// data even after the stack frame of the generating task has been rolled
8614/// back. Offloading arrays contain base pointers, pointers, sizes etc
8615/// of the data that the target kernel will access. These in effect are the
8616/// non-empty arrays of pointers held by OpenMPIRBuilder::TargetDataRTArgs.
8618 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI,
8619 StructType *PrivatesTy, StructType *TaskWithPrivatesTy,
8620 const size_t NumOffloadingArrays, const int SharedArgsOperandNo) {
8621
8622 // If NumOffloadingArrays is non-zero, PrivatesTy better not be nullptr.
8623 // This is because PrivatesTy is the type of the structure in which
8624 // we pass the offloading arrays to the deferred target task.
8625 assert((!NumOffloadingArrays || PrivatesTy) &&
8626 "PrivatesTy cannot be nullptr when there are offloadingArrays"
8627 "to privatize");
8628
8629 Module &M = OMPBuilder.M;
8630 // KernelLaunchFunction is the target launch function, i.e.
8631 // the function that sets up kernel arguments and calls
8632 // __tgt_target_kernel to launch the kernel on the device.
8633 //
8634 Function *KernelLaunchFunction = StaleCI->getCalledFunction();
8635
8636 // StaleCI is the CallInst which is the call to the outlined
8637 // target kernel launch function. If there are local live-in values
8638 // that the outlined function uses then these are aggregated into a structure
8639 // which is passed as the second argument. If there are no local live-in
8640 // values or if all values used by the outlined kernel are global variables,
8641 // then there's only one argument, the threadID. So, StaleCI can be
8642 //
8643 // %structArg = alloca { ptr, ptr }, align 8
8644 // %gep_ = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 0
8645 // store ptr %20, ptr %gep_, align 8
8646 // %gep_8 = getelementptr { ptr, ptr }, ptr %structArg, i32 0, i32 1
8647 // store ptr %21, ptr %gep_8, align 8
8648 // call void @_QQmain..omp_par.1(i32 %global.tid.val6, ptr %structArg)
8649 //
8650 // OR
8651 //
8652 // call void @_QQmain..omp_par.1(i32 %global.tid.val6)
8654 StaleCI->getIterator());
8655
8656 LLVMContext &Ctx = StaleCI->getParent()->getContext();
8657
8658 Type *ThreadIDTy = Type::getInt32Ty(Ctx);
8659 Type *TaskPtrTy = OMPBuilder.TaskPtr;
8660 [[maybe_unused]] Type *TaskTy = OMPBuilder.Task;
8661
8662 auto ProxyFnTy =
8663 FunctionType::get(Builder.getVoidTy(), {ThreadIDTy, TaskPtrTy},
8664 /* isVarArg */ false);
8665 auto ProxyFn = Function::Create(ProxyFnTy, GlobalValue::InternalLinkage,
8666 ".omp_target_task_proxy_func",
8667 Builder.GetInsertBlock()->getModule());
8668 Value *ThreadId = ProxyFn->getArg(0);
8669 Value *TaskWithPrivates = ProxyFn->getArg(1);
8670 ThreadId->setName("thread.id");
8671 TaskWithPrivates->setName("task");
8672
8673 bool HasShareds = SharedArgsOperandNo > 0;
8674 bool HasOffloadingArrays = NumOffloadingArrays > 0;
8675 BasicBlock *EntryBB =
8676 BasicBlock::Create(Builder.getContext(), "entry", ProxyFn);
8677 Builder.SetInsertPoint(EntryBB);
8678
8679 SmallVector<Value *> KernelLaunchArgs;
8680 KernelLaunchArgs.reserve(StaleCI->arg_size());
8681 KernelLaunchArgs.push_back(ThreadId);
8682
8683 if (HasOffloadingArrays) {
8684 assert(TaskTy != TaskWithPrivatesTy &&
8685 "If there are offloading arrays to pass to the target"
8686 "TaskTy cannot be the same as TaskWithPrivatesTy");
8687 (void)TaskTy;
8688 Value *Privates =
8689 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskWithPrivates, 1);
8690 for (unsigned int i = 0; i < NumOffloadingArrays; ++i)
8691 KernelLaunchArgs.push_back(
8692 Builder.CreateStructGEP(PrivatesTy, Privates, i));
8693 }
8694
8695 if (HasShareds) {
8696 auto *ArgStructAlloca =
8697 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgsOperandNo));
8698 assert(ArgStructAlloca &&
8699 "Unable to find the alloca instruction corresponding to arguments "
8700 "for extracted function");
8701 auto *ArgStructType = cast<StructType>(ArgStructAlloca->getAllocatedType());
8702 std::optional<TypeSize> ArgAllocSize =
8703 ArgStructAlloca->getAllocationSize(M.getDataLayout());
8704 assert(ArgStructType && ArgAllocSize &&
8705 "Unable to determine size of arguments for extracted function");
8706 uint64_t StructSize = ArgAllocSize->getFixedValue();
8707
8708 AllocaInst *NewArgStructAlloca =
8709 Builder.CreateAlloca(ArgStructType, nullptr, "structArg");
8710
8711 Value *SharedsSize = Builder.getInt64(StructSize);
8712
8714 OMPBuilder, Builder, TaskWithPrivates, TaskWithPrivatesTy);
8715
8716 Builder.CreateMemCpy(
8717 NewArgStructAlloca, NewArgStructAlloca->getAlign(), LoadShared,
8718 LoadShared->getPointerAlignment(M.getDataLayout()), SharedsSize);
8719 KernelLaunchArgs.push_back(NewArgStructAlloca);
8720 }
8721 OMPBuilder.createRuntimeFunctionCall(KernelLaunchFunction, KernelLaunchArgs);
8722 Builder.CreateRetVoid();
8723 return ProxyFn;
8724}
8726
8727 if (auto *GEP = dyn_cast<GetElementPtrInst>(V))
8728 return GEP->getSourceElementType();
8729 if (auto *Alloca = dyn_cast<AllocaInst>(V))
8730 return Alloca->getAllocatedType();
8731
8732 llvm_unreachable("Unhandled Instruction type");
8733 return nullptr;
8734}
8735// This function returns a struct that has at most two members.
8736// The first member is always %struct.kmp_task_ompbuilder_t, that is the task
8737// descriptor. The second member, if needed, is a struct containing arrays
8738// that need to be passed to the offloaded target kernel. For example,
8739// if .offload_baseptrs, .offload_ptrs and .offload_sizes have to be passed to
8740// the target kernel and their types are [3 x ptr], [3 x ptr] and [3 x i64]
8741// respectively, then the types created by this function are
8742//
8743// %struct.privates = type { [3 x ptr], [3 x ptr], [3 x i64] }
8744// %struct.task_with_privates = type { %struct.kmp_task_ompbuilder_t,
8745// %struct.privates }
8746// %struct.task_with_privates is returned by this function.
8747// If there aren't any offloading arrays to pass to the target kernel,
8748// %struct.kmp_task_ompbuilder_t is returned.
8749static StructType *
8751 ArrayRef<Value *> OffloadingArraysToPrivatize) {
8752
8753 if (OffloadingArraysToPrivatize.empty())
8754 return OMPIRBuilder.Task;
8755
8756 SmallVector<Type *, 4> StructFieldTypes;
8757 for (Value *V : OffloadingArraysToPrivatize) {
8758 assert(V->getType()->isPointerTy() &&
8759 "Expected pointer to array to privatize. Got a non-pointer value "
8760 "instead");
8761 Type *ArrayTy = getOffloadingArrayType(V);
8762 assert(ArrayTy && "ArrayType cannot be nullptr");
8763 StructFieldTypes.push_back(ArrayTy);
8764 }
8765 StructType *PrivatesStructTy =
8766 StructType::create(StructFieldTypes, "struct.privates");
8767 return StructType::create({OMPIRBuilder.Task, PrivatesStructTy},
8768 "struct.task_with_privates");
8769}
8771 OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry,
8772 TargetRegionEntryInfo &EntryInfo,
8774 Function *&OutlinedFn, Constant *&OutlinedFnID,
8778
8779 OpenMPIRBuilder::FunctionGenCallback &&GenerateOutlinedFunction =
8780 [&](StringRef EntryFnName) {
8781 return createOutlinedFunction(OMPBuilder, Builder, DefaultAttrs,
8782 EntryFnName, Inputs, CBFunc,
8783 ArgAccessorFuncCB);
8784 };
8785
8786 return OMPBuilder.emitTargetRegionFunction(
8787 EntryInfo, GenerateOutlinedFunction, IsOffloadEntry, OutlinedFn,
8788 OutlinedFnID);
8789}
8790
8792 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
8795 const TargetDataRTArgs &RTArgs, bool HasNoWait) {
8796
8797 // The following explains the code-gen scenario for the `target` directive. A
8798 // similar scneario is followed for other device-related directives (e.g.
8799 // `target enter data`) but in similar fashion since we only need to emit task
8800 // that encapsulates the proper runtime call.
8801 //
8802 // When we arrive at this function, the target region itself has been
8803 // outlined into the function OutlinedFn.
8804 // So at ths point, for
8805 // --------------------------------------------------------------
8806 // void user_code_that_offloads(...) {
8807 // omp target depend(..) map(from:a) map(to:b) private(i)
8808 // do i = 1, 10
8809 // a(i) = b(i) + n
8810 // }
8811 //
8812 // --------------------------------------------------------------
8813 //
8814 // we have
8815 //
8816 // --------------------------------------------------------------
8817 //
8818 // void user_code_that_offloads(...) {
8819 // %.offload_baseptrs = alloca [2 x ptr], align 8
8820 // %.offload_ptrs = alloca [2 x ptr], align 8
8821 // %.offload_mappers = alloca [2 x ptr], align 8
8822 // ;; target region has been outlined and now we need to
8823 // ;; offload to it via a target task.
8824 // }
8825 // void outlined_device_function(ptr a, ptr b, ptr n) {
8826 // n = *n_ptr;
8827 // do i = 1, 10
8828 // a(i) = b(i) + n
8829 // }
8830 //
8831 // We have to now do the following
8832 // (i) Make an offloading call to outlined_device_function using the OpenMP
8833 // RTL. See 'kernel_launch_function' in the pseudo code below. This is
8834 // emitted by emitKernelLaunch
8835 // (ii) Create a task entry point function that calls kernel_launch_function
8836 // and is the entry point for the target task. See
8837 // '@.omp_target_task_proxy_func in the pseudocode below.
8838 // (iii) Create a task with the task entry point created in (ii)
8839 //
8840 // That is we create the following
8841 // struct task_with_privates {
8842 // struct kmp_task_ompbuilder_t task_struct;
8843 // struct privates {
8844 // [2 x ptr] ; baseptrs
8845 // [2 x ptr] ; ptrs
8846 // [2 x i64] ; sizes
8847 // }
8848 // }
8849 // void user_code_that_offloads(...) {
8850 // %.offload_baseptrs = alloca [2 x ptr], align 8
8851 // %.offload_ptrs = alloca [2 x ptr], align 8
8852 // %.offload_sizes = alloca [2 x i64], align 8
8853 //
8854 // %structArg = alloca { ptr, ptr, ptr }, align 8
8855 // %strucArg[0] = a
8856 // %strucArg[1] = b
8857 // %strucArg[2] = &n
8858 //
8859 // target_task_with_privates = @__kmpc_omp_target_task_alloc(...,
8860 // sizeof(kmp_task_ompbuilder_t),
8861 // sizeof(structArg),
8862 // @.omp_target_task_proxy_func,
8863 // ...)
8864 // memcpy(target_task_with_privates->task_struct->shareds, %structArg,
8865 // sizeof(structArg))
8866 // memcpy(target_task_with_privates->privates->baseptrs,
8867 // offload_baseptrs, sizeof(offload_baseptrs)
8868 // memcpy(target_task_with_privates->privates->ptrs,
8869 // offload_ptrs, sizeof(offload_ptrs)
8870 // memcpy(target_task_with_privates->privates->sizes,
8871 // offload_sizes, sizeof(offload_sizes)
8872 // dependencies_array = ...
8873 // ;; if nowait not present
8874 // call @__kmpc_omp_wait_deps(..., dependencies_array)
8875 // call @__kmpc_omp_task_begin_if0(...)
8876 // call @ @.omp_target_task_proxy_func(i32 thread_id, ptr
8877 // %target_task_with_privates)
8878 // call @__kmpc_omp_task_complete_if0(...)
8879 // }
8880 //
8881 // define internal void @.omp_target_task_proxy_func(i32 %thread.id,
8882 // ptr %task) {
8883 // %structArg = alloca {ptr, ptr, ptr}
8884 // %task_ptr = getelementptr(%task, 0, 0)
8885 // %shared_data = load (getelementptr %task_ptr, 0, 0)
8886 // mempcy(%structArg, %shared_data, sizeof(%structArg))
8887 //
8888 // %offloading_arrays = getelementptr(%task, 0, 1)
8889 // %offload_baseptrs = getelementptr(%offloading_arrays, 0, 0)
8890 // %offload_ptrs = getelementptr(%offloading_arrays, 0, 1)
8891 // %offload_sizes = getelementptr(%offloading_arrays, 0, 2)
8892 // kernel_launch_function(%thread.id, %offload_baseptrs, %offload_ptrs,
8893 // %offload_sizes, %structArg)
8894 // }
8895 //
8896 // We need the proxy function because the signature of the task entry point
8897 // expected by kmpc_omp_task is always the same and will be different from
8898 // that of the kernel_launch function.
8899 //
8900 // kernel_launch_function is generated by emitKernelLaunch and has the
8901 // always_inline attribute. For this example, it'll look like so:
8902 // void kernel_launch_function(%thread_id, %offload_baseptrs, %offload_ptrs,
8903 // %offload_sizes, %structArg) alwaysinline {
8904 // %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
8905 // ; load aggregated data from %structArg
8906 // ; setup kernel_args using offload_baseptrs, offload_ptrs and
8907 // ; offload_sizes
8908 // call i32 @__tgt_target_kernel(...,
8909 // outlined_device_function,
8910 // ptr %kernel_args)
8911 // }
8912 // void outlined_device_function(ptr a, ptr b, ptr n) {
8913 // n = *n_ptr;
8914 // do i = 1, 10
8915 // a(i) = b(i) + n
8916 // }
8917 //
8918 BasicBlock *TargetTaskBodyBB =
8919 splitBB(Builder, /*CreateBranch=*/true, "target.task.body");
8920 BasicBlock *TargetTaskAllocaBB =
8921 splitBB(Builder, /*CreateBranch=*/true, "target.task.alloca");
8922
8923 InsertPointTy TargetTaskAllocaIP(TargetTaskAllocaBB,
8924 TargetTaskAllocaBB->begin());
8925 InsertPointTy TargetTaskBodyIP(TargetTaskBodyBB, TargetTaskBodyBB->begin());
8926
8927 OutlineInfo OI;
8928 OI.EntryBB = TargetTaskAllocaBB;
8929 OI.OuterAllocaBB = AllocaIP.getBlock();
8930
8931 // Add the thread ID argument.
8934 Builder, AllocaIP, ToBeDeleted, TargetTaskAllocaIP, "global.tid", false));
8935
8936 // Generate the task body which will subsequently be outlined.
8937 Builder.restoreIP(TargetTaskBodyIP);
8938 if (Error Err = TaskBodyCB(DeviceID, RTLoc, TargetTaskAllocaIP))
8939 return Err;
8940
8941 // The outliner (CodeExtractor) extract a sequence or vector of blocks that
8942 // it is given. These blocks are enumerated by
8943 // OpenMPIRBuilder::OutlineInfo::collectBlocks which expects the OI.ExitBlock
8944 // to be outside the region. In other words, OI.ExitBlock is expected to be
8945 // the start of the region after the outlining. We used to set OI.ExitBlock
8946 // to the InsertBlock after TaskBodyCB is done. This is fine in most cases
8947 // except when the task body is a single basic block. In that case,
8948 // OI.ExitBlock is set to the single task body block and will get left out of
8949 // the outlining process. So, simply create a new empty block to which we
8950 // uncoditionally branch from where TaskBodyCB left off
8951 OI.ExitBB = BasicBlock::Create(Builder.getContext(), "target.task.cont");
8952 emitBlock(OI.ExitBB, Builder.GetInsertBlock()->getParent(),
8953 /*IsFinished=*/true);
8954
8955 SmallVector<Value *, 2> OffloadingArraysToPrivatize;
8956 bool NeedsTargetTask = HasNoWait && DeviceID;
8957 if (NeedsTargetTask) {
8958 for (auto *V :
8959 {RTArgs.BasePointersArray, RTArgs.PointersArray, RTArgs.MappersArray,
8960 RTArgs.MapNamesArray, RTArgs.MapTypesArray, RTArgs.MapTypesArrayEnd,
8961 RTArgs.SizesArray}) {
8963 OffloadingArraysToPrivatize.push_back(V);
8965 }
8966 }
8967 }
8968 OI.PostOutlineCB = [this, ToBeDeleted, Dependencies, NeedsTargetTask,
8969 DeviceID, OffloadingArraysToPrivatize](
8970 Function &OutlinedFn) mutable {
8971 assert(OutlinedFn.hasOneUse() &&
8972 "there must be a single user for the outlined function");
8973
8974 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
8975
8976 // The first argument of StaleCI is always the thread id.
8977 // The next few arguments are the pointers to offloading arrays
8978 // if any. (see OffloadingArraysToPrivatize)
8979 // Finally, all other local values that are live-in into the outlined region
8980 // end up in a structure whose pointer is passed as the last argument. This
8981 // piece of data is passed in the "shared" field of the task structure. So,
8982 // we know we have to pass shareds to the task if the number of arguments is
8983 // greater than OffloadingArraysToPrivatize.size() + 1 The 1 is for the
8984 // thread id. Further, for safety, we assert that the number of arguments of
8985 // StaleCI is exactly OffloadingArraysToPrivatize.size() + 2
8986 const unsigned int NumStaleCIArgs = StaleCI->arg_size();
8987 bool HasShareds = NumStaleCIArgs > OffloadingArraysToPrivatize.size() + 1;
8988 assert((!HasShareds ||
8989 NumStaleCIArgs == (OffloadingArraysToPrivatize.size() + 2)) &&
8990 "Wrong number of arguments for StaleCI when shareds are present");
8991 int SharedArgOperandNo =
8992 HasShareds ? OffloadingArraysToPrivatize.size() + 1 : 0;
8993
8994 StructType *TaskWithPrivatesTy =
8995 createTaskWithPrivatesTy(*this, OffloadingArraysToPrivatize);
8996 StructType *PrivatesTy = nullptr;
8997
8998 if (!OffloadingArraysToPrivatize.empty())
8999 PrivatesTy =
9000 static_cast<StructType *>(TaskWithPrivatesTy->getElementType(1));
9001
9003 *this, Builder, StaleCI, PrivatesTy, TaskWithPrivatesTy,
9004 OffloadingArraysToPrivatize.size(), SharedArgOperandNo);
9005
9006 LLVM_DEBUG(dbgs() << "Proxy task entry function created: " << *ProxyFn
9007 << "\n");
9008
9009 Builder.SetInsertPoint(StaleCI);
9010
9011 // Gather the arguments for emitting the runtime call.
9012 uint32_t SrcLocStrSize;
9013 Constant *SrcLocStr =
9015 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
9016
9017 // @__kmpc_omp_task_alloc or @__kmpc_omp_target_task_alloc
9018 //
9019 // If `HasNoWait == true`, we call @__kmpc_omp_target_task_alloc to provide
9020 // the DeviceID to the deferred task and also since
9021 // @__kmpc_omp_target_task_alloc creates an untied/async task.
9022 Function *TaskAllocFn =
9023 !NeedsTargetTask
9024 ? getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_alloc)
9026 OMPRTL___kmpc_omp_target_task_alloc);
9027
9028 // Arguments - `loc_ref` (Ident) and `gtid` (ThreadID)
9029 // call.
9030 Value *ThreadID = getOrCreateThreadID(Ident);
9031
9032 // Argument - `sizeof_kmp_task_t` (TaskSize)
9033 // Tasksize refers to the size in bytes of kmp_task_t data structure
9034 // plus any other data to be passed to the target task, if any, which
9035 // is packed into a struct. kmp_task_t and the struct so created are
9036 // packed into a wrapper struct whose type is TaskWithPrivatesTy.
9037 Value *TaskSize = Builder.getInt64(
9038 M.getDataLayout().getTypeStoreSize(TaskWithPrivatesTy));
9039
9040 // Argument - `sizeof_shareds` (SharedsSize)
9041 // SharedsSize refers to the shareds array size in the kmp_task_t data
9042 // structure.
9043 Value *SharedsSize = Builder.getInt64(0);
9044 if (HasShareds) {
9045 auto *ArgStructAlloca =
9046 dyn_cast<AllocaInst>(StaleCI->getArgOperand(SharedArgOperandNo));
9047 assert(ArgStructAlloca &&
9048 "Unable to find the alloca instruction corresponding to arguments "
9049 "for extracted function");
9050 std::optional<TypeSize> ArgAllocSize =
9051 ArgStructAlloca->getAllocationSize(M.getDataLayout());
9052 assert(ArgAllocSize &&
9053 "Unable to determine size of arguments for extracted function");
9054 SharedsSize = Builder.getInt64(ArgAllocSize->getFixedValue());
9055 }
9056
9057 // Argument - `flags`
9058 // Task is tied iff (Flags & 1) == 1.
9059 // Task is untied iff (Flags & 1) == 0.
9060 // Task is final iff (Flags & 2) == 2.
9061 // Task is not final iff (Flags & 2) == 0.
9062 // A target task is not final and is untied.
9063 Value *Flags = Builder.getInt32(0);
9064
9065 // Emit the @__kmpc_omp_task_alloc runtime call
9066 // The runtime call returns a pointer to an area where the task captured
9067 // variables must be copied before the task is run (TaskData)
9068 CallInst *TaskData = nullptr;
9069
9070 SmallVector<llvm::Value *> TaskAllocArgs = {
9071 /*loc_ref=*/Ident, /*gtid=*/ThreadID,
9072 /*flags=*/Flags,
9073 /*sizeof_task=*/TaskSize, /*sizeof_shared=*/SharedsSize,
9074 /*task_func=*/ProxyFn};
9075
9076 if (NeedsTargetTask) {
9077 assert(DeviceID && "Expected non-empty device ID.");
9078 TaskAllocArgs.push_back(DeviceID);
9079 }
9080
9081 TaskData = createRuntimeFunctionCall(TaskAllocFn, TaskAllocArgs);
9082
9083 Align Alignment = TaskData->getPointerAlignment(M.getDataLayout());
9084 if (HasShareds) {
9085 Value *Shareds = StaleCI->getArgOperand(SharedArgOperandNo);
9087 *this, Builder, TaskData, TaskWithPrivatesTy);
9088 Builder.CreateMemCpy(TaskShareds, Alignment, Shareds, Alignment,
9089 SharedsSize);
9090 }
9091 if (!OffloadingArraysToPrivatize.empty()) {
9092 Value *Privates =
9093 Builder.CreateStructGEP(TaskWithPrivatesTy, TaskData, 1);
9094 for (unsigned int i = 0; i < OffloadingArraysToPrivatize.size(); ++i) {
9095 Value *PtrToPrivatize = OffloadingArraysToPrivatize[i];
9096 [[maybe_unused]] Type *ArrayType =
9097 getOffloadingArrayType(PtrToPrivatize);
9098 assert(ArrayType && "ArrayType cannot be nullptr");
9099
9100 Type *ElementType = PrivatesTy->getElementType(i);
9101 assert(ElementType == ArrayType &&
9102 "ElementType should match ArrayType");
9103 (void)ArrayType;
9104
9105 Value *Dst = Builder.CreateStructGEP(PrivatesTy, Privates, i);
9106 Builder.CreateMemCpy(
9107 Dst, Alignment, PtrToPrivatize, Alignment,
9108 Builder.getInt64(M.getDataLayout().getTypeStoreSize(ElementType)));
9109 }
9110 }
9111
9112 Value *DepArray = emitTaskDependencies(*this, Dependencies);
9113
9114 // ---------------------------------------------------------------
9115 // V5.2 13.8 target construct
9116 // If the nowait clause is present, execution of the target task
9117 // may be deferred. If the nowait clause is not present, the target task is
9118 // an included task.
9119 // ---------------------------------------------------------------
9120 // The above means that the lack of a nowait on the target construct
9121 // translates to '#pragma omp task if(0)'
9122 if (!NeedsTargetTask) {
9123 if (DepArray) {
9124 Function *TaskWaitFn =
9125 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_wait_deps);
9127 TaskWaitFn,
9128 {/*loc_ref=*/Ident, /*gtid=*/ThreadID,
9129 /*ndeps=*/Builder.getInt32(Dependencies.size()),
9130 /*dep_list=*/DepArray,
9131 /*ndeps_noalias=*/ConstantInt::get(Builder.getInt32Ty(), 0),
9132 /*noalias_dep_list=*/
9134 }
9135 // Included task.
9136 Function *TaskBeginFn =
9137 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_begin_if0);
9138 Function *TaskCompleteFn =
9139 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_complete_if0);
9140 createRuntimeFunctionCall(TaskBeginFn, {Ident, ThreadID, TaskData});
9141 CallInst *CI = createRuntimeFunctionCall(ProxyFn, {ThreadID, TaskData});
9142 CI->setDebugLoc(StaleCI->getDebugLoc());
9143 createRuntimeFunctionCall(TaskCompleteFn, {Ident, ThreadID, TaskData});
9144 } else if (DepArray) {
9145 // HasNoWait - meaning the task may be deferred. Call
9146 // __kmpc_omp_task_with_deps if there are dependencies,
9147 // else call __kmpc_omp_task
9148 Function *TaskFn =
9149 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task_with_deps);
9151 TaskFn,
9152 {Ident, ThreadID, TaskData, Builder.getInt32(Dependencies.size()),
9153 DepArray, ConstantInt::get(Builder.getInt32Ty(), 0),
9155 } else {
9156 // Emit the @__kmpc_omp_task runtime call to spawn the task
9157 Function *TaskFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_task);
9158 createRuntimeFunctionCall(TaskFn, {Ident, ThreadID, TaskData});
9159 }
9160
9161 StaleCI->eraseFromParent();
9162 for (Instruction *I : llvm::reverse(ToBeDeleted))
9163 I->eraseFromParent();
9164 };
9165 addOutlineInfo(std::move(OI));
9166
9167 LLVM_DEBUG(dbgs() << "Insert block after emitKernelLaunch = \n"
9168 << *(Builder.GetInsertBlock()) << "\n");
9169 LLVM_DEBUG(dbgs() << "Module after emitKernelLaunch = \n"
9170 << *(Builder.GetInsertBlock()->getParent()->getParent())
9171 << "\n");
9172 return Builder.saveIP();
9173}
9174
9176     InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
9177     TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
9178     CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous,
9179     bool ForEndCall, function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
  // Materialize the offloading descriptor arrays for CombinedInfo and, on
  // success, derive the pointer arguments (RTArgs) consumed by the tgt_*
  // runtime entry points. Errors from array emission propagate unchanged.
9180   if (Error Err =
9181           emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info,
9182                                CustomMapperCB, IsNonContiguous, DeviceAddrCB))
9183     return Err;
9184   emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall);
9185   return Error::success();
9186 }
9187
// Emit the host-side launch sequence for an outlined target region: build the
// offloading arguments, then choose between an immediate kernel launch, a
// launch deferred through a target task (nowait/depend clauses), and the host
// fallback, honoring an optional 'if' clause.
9188 static void emitTargetCall(
9189     OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder,
9194     Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID,
9199     bool HasNoWait, Value *DynCGroupMem,
9200     OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9201   // Generate a function call to the host fallback implementation of the target
9202   // region. This is called by the host when no offload entry was generated for
9203   // the target region and when the offloading call fails at runtime.
9204   auto &&EmitTargetCallFallbackCB = [&](OpenMPIRBuilder::InsertPointTy IP)
9206     Builder.restoreIP(IP);
9207     // Ensure the host fallback has the same dyn_ptr ABI as the device.
9208     SmallVector<Value *> FallbackArgs(Args.begin(), Args.end());
9209     FallbackArgs.push_back(
9210         Constant::getNullValue(PointerType::getUnqual(Builder.getContext())));
9211     OMPBuilder.createRuntimeFunctionCall(OutlinedFn, FallbackArgs);
9212     return Builder.saveIP();
9213   };
9214
9215   bool HasDependencies = Dependencies.size() > 0;
9216   bool RequiresOuterTargetTask = HasNoWait || HasDependencies;
9217
9219
  // Body of the wrapping target task: performs the actual kernel launch when
  // both an offload entry and a device ID are known, otherwise runs the host
  // fallback directly.
9220   auto TaskBodyCB =
9221       [&](Value *DeviceID, Value *RTLoc,
9222           IRBuilderBase::InsertPoint TargetTaskAllocaIP) -> Error {
9223     // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9224     // produce any.
9226       // emitKernelLaunch makes the necessary runtime call to offload the
9227       // kernel. We then outline all that code into a separate function
9228       // ('kernel_launch_function' in the pseudo code above). This function is
9229       // then called by the target task proxy function (see
9230       // '@.omp_target_task_proxy_func' in the pseudo code above)
9231       // "@.omp_target_task_proxy_func' is generated by
9232       // emitTargetTaskProxyFunction.
9233       if (OutlinedFnID && DeviceID)
9234         return OMPBuilder.emitKernelLaunch(Builder, OutlinedFnID,
9235                                            EmitTargetCallFallbackCB, KArgs,
9236                                            DeviceID, RTLoc, TargetTaskAllocaIP);
9237
9238       // We only need to do the outlining if `DeviceID` is set to avoid calling
9239       // `emitKernelLaunch` if we want to code-gen for the host; e.g. if we are
9240       // generating the `else` branch of an `if` clause.
9241       //
9242       // When OutlinedFnID is set to nullptr, then it's not an offloading call.
9243       // In this case, we execute the host implementation directly.
9244       return EmitTargetCallFallbackCB(OMPBuilder.Builder.saveIP());
9245     }());
9246
9247     OMPBuilder.Builder.restoreIP(AfterIP);
9248     return Error::success();
9249   };
9250
  // 'else' path of the 'if' clause (also used when no offload entry exists):
  // run the host fallback, wrapped in a target task when nowait/depend
  // require deferred execution.
9251   auto &&EmitTargetCallElse =
9252       [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9254     // Assume no error was returned because EmitTargetCallFallbackCB doesn't
9255     // produce any.
9257       if (RequiresOuterTargetTask) {
9258         // Arguments that are intended to be directly forwarded to an
9259         // emitKernelLaunch call are pased as nullptr, since
9260         // OutlinedFnID=nullptr results in that call not being done.
9262         return OMPBuilder.emitTargetTask(TaskBodyCB, /*DeviceID=*/nullptr,
9263                                          /*RTLoc=*/nullptr, AllocaIP,
9264                                          Dependencies, EmptyRTArgs, HasNoWait);
9265       }
9266       return EmitTargetCallFallbackCB(Builder.saveIP());
9267     }());
9268
9269     Builder.restoreIP(AfterIP);
9270     return Error::success();
9271   };
9272
  // 'then' path: emit the map arrays plus the clause operands (teams,
  // threads, trip count, groupprivate memory) and launch the kernel,
  // possibly via an outer target task.
9273   auto &&EmitTargetCallThen =
9274       [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
9276     Info.HasNoWait = HasNoWait;
9277     OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP());
9278
9280     if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs(
9281             AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB,
9282             /*IsNonContiguous=*/true,
9283             /*ForEndCall=*/false))
9284       return Err;
9285
    // Per-dimension team counts: runtime clause value wins over the default.
9286     SmallVector<Value *, 3> NumTeamsC;
9287     for (auto [DefaultVal, RuntimeVal] :
9288          zip_equal(DefaultAttrs.MaxTeams, RuntimeAttrs.MaxTeams))
9289       NumTeamsC.push_back(RuntimeVal ? RuntimeVal
9290                                      : Builder.getInt32(DefaultVal));
9291
9292     // Calculate number of threads: 0 if no clauses specified, otherwise it is
9293     // the minimum between optional THREAD_LIMIT and NUM_THREADS clauses.
9294     auto InitMaxThreadsClause = [&Builder](Value *Clause) {
9295       if (Clause)
9296         Clause = Builder.CreateIntCast(Clause, Builder.getInt32Ty(),
9297                                        /*isSigned=*/false);
9298       return Clause;
9299     };
9300     auto CombineMaxThreadsClauses = [&Builder](Value *Clause, Value *&Result) {
9301       if (Clause)
9302         Result =
9303             Result ? Builder.CreateSelect(Builder.CreateICmpULT(Result, Clause),
9304                                           Result, Clause)
9305                    : Clause;
9306     };
9307
9308     // If a multi-dimensional THREAD_LIMIT is set, it is the OMPX_BARE case, so
9309     // the NUM_THREADS clause is overriden by THREAD_LIMIT.
9310     SmallVector<Value *, 3> NumThreadsC;
9311     Value *MaxThreadsClause =
9312         RuntimeAttrs.TeamsThreadLimit.size() == 1
9313             ? InitMaxThreadsClause(RuntimeAttrs.MaxThreads)
9314             : nullptr;
9315
9316     for (auto [TeamsVal, TargetVal] : zip_equal(
9317              RuntimeAttrs.TeamsThreadLimit, RuntimeAttrs.TargetThreadLimit)) {
9318       Value *TeamsThreadLimitClause = InitMaxThreadsClause(TeamsVal);
9319       Value *NumThreads = InitMaxThreadsClause(TargetVal);
9320
9321       CombineMaxThreadsClauses(TeamsThreadLimitClause, NumThreads);
9322       CombineMaxThreadsClauses(MaxThreadsClause, NumThreads);
9323
9324       NumThreadsC.push_back(NumThreads ? NumThreads : Builder.getInt32(0));
9325     }
9326
9327     unsigned NumTargetItems = Info.NumberOfPtrs;
9328     uint32_t SrcLocStrSize;
9329     Constant *SrcLocStr = OMPBuilder.getOrCreateDefaultSrcLocStr(SrcLocStrSize);
9330     Value *RTLoc = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize,
9331                                                llvm::omp::IdentFlag(0), 0);
9332
    // Trip count is zero-extended to i64; 0 signals "unknown" to the runtime
    // when no loop trip count was recorded.
9333     Value *TripCount = RuntimeAttrs.LoopTripCount
9334                            ? Builder.CreateIntCast(RuntimeAttrs.LoopTripCount,
9335                                                    Builder.getInt64Ty(),
9336                                                    /*isSigned=*/false)
9337                            : Builder.getInt64(0);
9338
9339     // Request zero groupprivate bytes by default.
9340     if (!DynCGroupMem)
9341       DynCGroupMem = Builder.getInt32(0);
9342
9344         NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9345         HasNoWait, DynCGroupMemFallback);
9346
9347     // Assume no error was returned because TaskBodyCB and
9348     // EmitTargetCallFallbackCB don't produce any.
9350       // The presence of certain clauses on the target directive require the
9351       // explicit generation of the target task.
9352       if (RequiresOuterTargetTask)
9353         return OMPBuilder.emitTargetTask(TaskBodyCB, RuntimeAttrs.DeviceID,
9354                                          RTLoc, AllocaIP, Dependencies,
9355                                          KArgs.RTArgs, Info.HasNoWait);
9356
9357       return OMPBuilder.emitKernelLaunch(
9358           Builder, OutlinedFnID, EmitTargetCallFallbackCB, KArgs,
9359           RuntimeAttrs.DeviceID, RTLoc, AllocaIP);
9360     }());
9361
9362     Builder.restoreIP(AfterIP);
9363     return Error::success();
9364   };
9365
9366   // If we don't have an ID for the target region, it means an offload entry
9367   // wasn't created. In this case we just run the host fallback directly and
9368   // ignore any potential 'if' clauses.
9369   if (!OutlinedFnID) {
9370     cantFail(EmitTargetCallElse(AllocaIP, Builder.saveIP()));
9371     return;
9372   }
9373
9374   // If there's no 'if' clause, only generate the kernel launch code path.
9375   if (!IfCond) {
9376     cantFail(EmitTargetCallThen(AllocaIP, Builder.saveIP()));
9377     return;
9378   }
9379
  // Both paths are possible: let emitIfClause branch on IfCond.
9380   cantFail(OMPBuilder.emitIfClause(IfCond, EmitTargetCallThen,
9381                                    EmitTargetCallElse, AllocaIP));
9382 }
9383
9385     const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP,
9386     InsertPointTy CodeGenIP, TargetDataInfo &Info,
9387     TargetRegionEntryInfo &EntryInfo,
9388     const TargetKernelDefaultAttrs &DefaultAttrs,
9389     const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
9390     SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
9393     CustomMapperCallbackTy CustomMapperCB,
9394     const SmallVector<DependData> &Dependencies, bool HasNowait,
9395     Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback) {
9396
  // Bail out with a default (invalid) insert point when the location cannot
  // be applied; nothing is emitted in that case.
9397   if (!updateToLocation(Loc))
9398     return InsertPointTy();
9399
9400   Builder.restoreIP(CodeGenIP);
9401
9402   Function *OutlinedFn;
9403   Constant *OutlinedFnID = nullptr;
9404   // The target region is outlined into its own function. The LLVM IR for
9405   // the target region itself is generated using the callbacks CBFunc
9406   // and ArgAccessorFuncCB
9408           *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn,
9409           OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB))
9410     return Err;
9411
9412   // If we are not on the target device, then we need to generate code
9413   // to make a remote call (offload) to the previously outlined function
9414   // that represents the target region. Do that now.
9415   if (!Config.isTargetDevice())
9416     emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs,
9417                    IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB,
9418                    CustomMapperCB, Dependencies, HasNowait, DynCGroupMem,
9419                    DynCGroupMemFallback);
9420   return Builder.saveIP();
9421 }
9422
9423std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
9424 StringRef FirstSeparator,
9425 StringRef Separator) {
9426 SmallString<128> Buffer;
9427 llvm::raw_svector_ostream OS(Buffer);
9428 StringRef Sep = FirstSeparator;
9429 for (StringRef Part : Parts) {
9430 OS << Sep << Part;
9431 Sep = Separator;
9432 }
9433 return OS.str().str();
9434}
9435
// Compose an internal symbol name from Parts using the platform-dependent
// separators supplied by the builder configuration
// (Config.firstSeparator() / Config.separator()).
9436 std::string
9438   return OpenMPIRBuilder::getNameWithSeparators(Parts, Config.firstSeparator(),
9439                                                 Config.separator());
9440 }
9441
// Return the module-level global backing an OpenMP internal variable,
// creating and caching it (in InternalVars, keyed by Name) on first use.
// A cached entry must have the exact type that was requested.
9443     Type *Ty, const StringRef &Name, std::optional<unsigned> AddressSpace) {
9444   auto &Elem = *InternalVars.try_emplace(Name, nullptr).first;
9445   if (Elem.second) {
9446     assert(Elem.second->getValueType() == Ty &&
9447            "OMP internal variable has different type than requested");
9448   } else {
9449     // TODO: investigate the appropriate linkage type used for the global
9450     // variable for possibly changing that to internal or private, or maybe
9451     // create different versions of the function for different OMP internal
9452     // variables.
9453     const DataLayout &DL = M.getDataLayout();
9454     // TODO: Investigate why AMDGPU expects AS 0 for globals even though the
9455     // default global AS is 1.
9456     // See double-target-call-with-declare-target.f90 and
9457     // declare-target-vars-in-target-region.f90 libomptarget
9458     // tests.
9459     unsigned AddressSpaceVal = AddressSpace ? *AddressSpace
9460                                : M.getTargetTriple().isAMDGPU()
9461                                    ? 0
9462                                    : DL.getDefaultGlobalsAddressSpace();
9463     auto Linkage = this->M.getTargetTriple().getArch() == Triple::wasm32
9466     auto *GV = new GlobalVariable(M, Ty, /*IsConstant=*/false, Linkage,
9467                                   Constant::getNullValue(Ty), Elem.first(),
9468                                   /*InsertBefore=*/nullptr,
9469                                   GlobalValue::NotThreadLocal, AddressSpaceVal);
    // NOTE(review): alignment is raised to at least the pointer ABI
    // alignment of the chosen address space — presumably so the variable can
    // be accessed as a pointer-sized object; confirm against runtime callers.
9470     const llvm::Align TypeAlign = DL.getABITypeAlign(Ty);
9471     const llvm::Align PtrAlign = DL.getPointerABIAlignment(AddressSpaceVal);
9472     GV->setAlignment(std::max(TypeAlign, PtrAlign));
9473     Elem.second = GV;
9474   }
9475
9476   return Elem.second;
9477 }
9478
9479Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
9480 std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
9481 std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
9482 return getOrCreateInternalVariable(KmpCriticalNameTy, Name);
9483}
9484
  // Size-of computation via the classic null-pointer GEP idiom: index one
  // element of BasePtr's type off a null pointer and convert the resulting
  // address to i64.
9486   LLVMContext &Ctx = Builder.getContext();
9487   Value *Null =
9488       Constant::getNullValue(PointerType::getUnqual(BasePtr->getContext()));
  // NOTE(review): the GEP element type is BasePtr->getType() itself (the
  // pointer's own type), so with opaque pointers this measures the size of a
  // pointer — confirm this is the intended semantics for callers.
9489   Value *SizeGep =
9490       Builder.CreateGEP(BasePtr->getType(), Null, Builder.getInt32(1));
9491   Value *SizePtrToInt = Builder.CreatePtrToInt(SizeGep, Type::getInt64Ty(Ctx));
9492   return SizePtrToInt;
9493 }
9494
9497     std::string VarName) {
  // Materialize the map-type flag words as a private, constant array global
  // named VarName; unnamed_addr because only the contents are significant.
9498   llvm::Constant *MaptypesArrayInit =
9499       llvm::ConstantDataArray::get(M.getContext(), Mappings);
9500   auto *MaptypesArrayGlobal = new llvm::GlobalVariable(
9501       M, MaptypesArrayInit->getType(),
9502       /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MaptypesArrayInit,
9503       VarName);
9504   MaptypesArrayGlobal->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
9505   return MaptypesArrayGlobal;
9506 }
9507
9509                                           InsertPointTy AllocaIP,
9510                                           unsigned NumOperands,
9511                                           struct MapperAllocas &MapperAllocas) {
9512   if (!updateToLocation(Loc))
9513     return;
9514
  // Reserve the .offload_baseptrs/.offload_ptrs/.offload_sizes arrays (one
  // slot per map operand) at the caller-provided alloca insertion point and
  // publish them through MapperAllocas.
9515   auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9516   auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9517   Builder.restoreIP(AllocaIP);
9518   AllocaInst *ArgsBase = Builder.CreateAlloca(
9519       ArrI8PtrTy, /* ArraySize = */ nullptr, ".offload_baseptrs");
9520   AllocaInst *Args = Builder.CreateAlloca(ArrI8PtrTy, /* ArraySize = */ nullptr,
9521                                           ".offload_ptrs");
9522   AllocaInst *ArgSizes = Builder.CreateAlloca(
9523       ArrI64Ty, /* ArraySize = */ nullptr, ".offload_sizes");
9525   MapperAllocas.ArgsBase = ArgsBase;
9526   MapperAllocas.Args = Args;
9527   MapperAllocas.ArgSizes = ArgSizes;
9528 }
9529
9531                                       Function *MapperFunc, Value *SrcLocInfo,
9532                                       Value *MaptypesArg, Value *MapnamesArg,
9534                                       int64_t DeviceID, unsigned NumOperands) {
9535   if (!updateToLocation(Loc))
9536     return;
9537
  // Decay each offload array alloca to a pointer to its first element and
  // invoke MapperFunc with (loc, device, count, baseptrs, ptrs, sizes,
  // maptypes, mapnames, mappers); the mappers argument is always null here.
9538   auto *ArrI8PtrTy = ArrayType::get(Int8Ptr, NumOperands);
9539   auto *ArrI64Ty = ArrayType::get(Int64, NumOperands);
9540   Value *ArgsBaseGEP =
9541       Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.ArgsBase,
9542                                 {Builder.getInt32(0), Builder.getInt32(0)});
9543   Value *ArgsGEP =
9544       Builder.CreateInBoundsGEP(ArrI8PtrTy, MapperAllocas.Args,
9545                                 {Builder.getInt32(0), Builder.getInt32(0)});
9546   Value *ArgSizesGEP =
9547       Builder.CreateInBoundsGEP(ArrI64Ty, MapperAllocas.ArgSizes,
9548                                 {Builder.getInt32(0), Builder.getInt32(0)});
9549   Value *NullPtr =
9550       Constant::getNullValue(PointerType::getUnqual(Int8Ptr->getContext()));
9551   createRuntimeFunctionCall(MapperFunc, {SrcLocInfo, Builder.getInt64(DeviceID),
9552                                          Builder.getInt32(NumOperands),
9553                                          ArgsBaseGEP, ArgsGEP, ArgSizesGEP,
9554                                          MaptypesArg, MapnamesArg, NullPtr});
9555 }
9556
9558                                                     TargetDataRTArgs &RTArgs,
9559                                                     TargetDataInfo &Info,
9560                                                     bool ForEndCall) {
  // Translate the descriptor arrays recorded in Info into the pointer
  // arguments (RTArgs) handed to the tgt_* runtime entry points. With no
  // mapped pointers every argument degenerates to a null pointer.
9561   assert((!ForEndCall || Info.separateBeginEndCalls()) &&
9562          "expected region end call to runtime only when end call is separate");
9563   auto UnqualPtrTy = PointerType::getUnqual(M.getContext());
9564   auto VoidPtrTy = UnqualPtrTy;
9565   auto VoidPtrPtrTy = UnqualPtrTy;
9566   auto Int64Ty = Type::getInt64Ty(M.getContext());
9567   auto Int64PtrTy = UnqualPtrTy;
9568
9569   if (!Info.NumberOfPtrs) {
9570     RTArgs.BasePointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9571     RTArgs.PointersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9572     RTArgs.SizesArray = ConstantPointerNull::get(Int64PtrTy);
9573     RTArgs.MapTypesArray = ConstantPointerNull::get(Int64PtrTy);
9574     RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9575     RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9576     return;
9577   }
9578
  // Decay each array to a pointer to its first element.
9579   RTArgs.BasePointersArray = Builder.CreateConstInBoundsGEP2_32(
9580       ArrayType::get(VoidPtrTy, Info.NumberOfPtrs),
9581       Info.RTArgs.BasePointersArray,
9582       /*Idx0=*/0, /*Idx1=*/0);
9583   RTArgs.PointersArray = Builder.CreateConstInBoundsGEP2_32(
9584       ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray,
9585       /*Idx0=*/0,
9586       /*Idx1=*/0);
9587   RTArgs.SizesArray = Builder.CreateConstInBoundsGEP2_32(
9588       ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
9589       /*Idx0=*/0, /*Idx1=*/0);
  // A region-end call uses the dedicated end-of-region map types when they
  // were recorded separately.
9590   RTArgs.MapTypesArray = Builder.CreateConstInBoundsGEP2_32(
9591       ArrayType::get(Int64Ty, Info.NumberOfPtrs),
9592       ForEndCall && Info.RTArgs.MapTypesArrayEnd ? Info.RTArgs.MapTypesArrayEnd
9593                                                  : Info.RTArgs.MapTypesArray,
9594       /*Idx0=*/0,
9595       /*Idx1=*/0);
9596
9597   // Only emit the mapper information arrays if debug information is
9598   // requested.
9599   if (!Info.EmitDebug)
9600     RTArgs.MapNamesArray = ConstantPointerNull::get(VoidPtrPtrTy);
9601   else
9602     RTArgs.MapNamesArray = Builder.CreateConstInBoundsGEP2_32(
9603         ArrayType::get(VoidPtrTy, Info.NumberOfPtrs), Info.RTArgs.MapNamesArray,
9604         /*Idx0=*/0,
9605         /*Idx1=*/0);
9606   // If there is no user-defined mapper, set the mapper array to nullptr to
9607   // avoid an unnecessary data privatization
9608   if (!Info.HasMapper)
9609     RTArgs.MappersArray = ConstantPointerNull::get(VoidPtrPtrTy);
9610   else
9611     RTArgs.MappersArray =
9612         Builder.CreatePointerCast(Info.RTArgs.MappersArray, VoidPtrPtrTy);
9613 }
9614
9616                                                  InsertPointTy CodeGenIP,
9617                                                  MapInfosTy &CombinedInfo,
9618                                                  TargetDataInfo &Info) {
9620       CombinedInfo.NonContigInfo;
9621
9622   // Build an array of struct descriptor_dim and then assign it to
9623   // offload_args.
9624   //
9625   // struct descriptor_dim {
9626   //   uint64_t offset;
9627   //   uint64_t count;
9628   //   uint64_t stride
9629   // };
9630   Type *Int64Ty = Builder.getInt64Ty();
9632       M.getContext(), ArrayRef<Type *>({Int64Ty, Int64Ty, Int64Ty}),
9633       "struct.descriptor_dim");
9634
9635   enum { OffsetFD = 0, CountFD, StrideFD };
9636   // We need two index variable here since the size of "Dims" is the same as
9637   // the size of Components, however, the size of offset, count, and stride is
9638   // equal to the size of base declaration that is non-contiguous.
9639   for (unsigned I = 0, L = 0, E = NonContigInfo.Dims.size(); I < E; ++I) {
9640     // Skip emitting ir if dimension size is 1 since it cannot be
9641     // non-contiguous.
9642     if (NonContigInfo.Dims[I] == 1)
9643       continue;
9644     Builder.restoreIP(AllocaIP);
9645     ArrayType *ArrayTy = ArrayType::get(DimTy, NonContigInfo.Dims[I]);
9646     AllocaInst *DimsAddr =
9647         Builder.CreateAlloca(ArrayTy, /* ArraySize = */ nullptr, "dims");
9648     Builder.restoreIP(CodeGenIP);
    // Dimensions are stored innermost-first (RevIdx reverses the order).
9649     for (unsigned II = 0, EE = NonContigInfo.Dims[I]; II < EE; ++II) {
9650       unsigned RevIdx = EE - II - 1;
9651       Value *DimsLVal = Builder.CreateInBoundsGEP(
9652           ArrayTy, DimsAddr, {Builder.getInt64(0), Builder.getInt64(II)});
9653       // Offset
9654       Value *OffsetLVal = Builder.CreateStructGEP(DimTy, DimsLVal, OffsetFD);
9655       Builder.CreateAlignedStore(
9656           NonContigInfo.Offsets[L][RevIdx], OffsetLVal,
9657           M.getDataLayout().getPrefTypeAlign(OffsetLVal->getType()));
9658       // Count
9659       Value *CountLVal = Builder.CreateStructGEP(DimTy, DimsLVal, CountFD);
9660       Builder.CreateAlignedStore(
9661           NonContigInfo.Counts[L][RevIdx], CountLVal,
9662           M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9663       // Stride
      // NOTE(review): this store queries alignment from CountLVal's type
      // rather than StrideLVal's. Harmless while both fields are i64, but
      // StrideLVal->getType() was likely intended.
9664       Value *StrideLVal = Builder.CreateStructGEP(DimTy, DimsLVal, StrideFD);
9665       Builder.CreateAlignedStore(
9666           NonContigInfo.Strides[L][RevIdx], StrideLVal,
9667           M.getDataLayout().getPrefTypeAlign(CountLVal->getType()));
9668     }
9669     // args[I] = &dims
9670     Builder.restoreIP(CodeGenIP);
9671     Value *DAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(
9672         DimsAddr, Builder.getPtrTy());
9673     Value *P = Builder.CreateConstInBoundsGEP2_32(
9674         ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs),
9675         Info.RTArgs.PointersArray, 0, I);
9676     Builder.CreateAlignedStore(
9677         DAddr, P, M.getDataLayout().getPrefTypeAlign(Builder.getPtrTy()));
9678     ++L;
9679   }
9680 }
9681
// Emit the guarded __tgt_push_mapper_component call that allocates (IsInit)
// or deletes (!IsInit) the storage of an array section handled by a
// user-defined mapper. When the guard condition fails, control branches to
// ExitBB and no component is pushed.
9682 void OpenMPIRBuilder::emitUDMapperArrayInitOrDel(
9683     Function *MapperFn, Value *MapperHandle, Value *Base, Value *Begin,
9684     Value *Size, Value *MapType, Value *MapName, TypeSize ElementSize,
9685     BasicBlock *ExitBB, bool IsInit) {
9686   StringRef Prefix = IsInit ? ".init" : ".del";
9687
9688   // Evaluate if this is an array section.
9690       M.getContext(), createPlatformSpecificName({"omp.array", Prefix}));
9691   Value *IsArray =
9692       Builder.CreateICmpSGT(Size, Builder.getInt64(1), "omp.arrayinit.isarray");
9693   Value *DeleteBit = Builder.CreateAnd(
9694       MapType,
9695       Builder.getInt64(
9696           static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9697               OpenMPOffloadMappingFlags::OMP_MAP_DELETE)));
9698   Value *DeleteCond;
9699   Value *Cond;
  // Init runs only when OMP_MAP_DELETE is clear; deletion only when it is
  // set. Init additionally triggers when base != begin (pointer-attachment).
9700   if (IsInit) {
9701     // base != begin?
9702     Value *BaseIsBegin = Builder.CreateICmpNE(Base, Begin);
9703     Cond = Builder.CreateOr(IsArray, BaseIsBegin);
9704     DeleteCond = Builder.CreateIsNull(
9705         DeleteBit,
9706         createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9707   } else {
9708     Cond = IsArray;
9709     DeleteCond = Builder.CreateIsNotNull(
9710         DeleteBit,
9711         createPlatformSpecificName({"omp.array", Prefix, ".delete"}));
9712   }
9713   Cond = Builder.CreateAnd(Cond, DeleteCond);
9714   Builder.CreateCondBr(Cond, BodyBB, ExitBB);
9715
9716   emitBlock(BodyBB, MapperFn);
9717   // Get the array size by multiplying element size and element number (i.e., \p
9718   // Size).
9719   Value *ArraySize = Builder.CreateNUWMul(Size, Builder.getInt64(ElementSize));
9720   // Remove OMP_MAP_TO and OMP_MAP_FROM from the map type, so that it achieves
9721   // memory allocation/deletion purpose only.
9722   Value *MapTypeArg = Builder.CreateAnd(
9723       MapType,
9724       Builder.getInt64(
9725           ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9726               OpenMPOffloadMappingFlags::OMP_MAP_TO |
9727               OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9728   MapTypeArg = Builder.CreateOr(
9729       MapTypeArg,
9730       Builder.getInt64(
9731           static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9732               OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT)));
9733
9734   // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9735   // data structure.
9736   Value *OffloadingArgs[] = {MapperHandle, Base,       Begin,
9737                              ArraySize,    MapTypeArg, MapName};
9739       getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9740       OffloadingArgs);
9741 }
9742
// Emit the "omp_mapper" helper for a user-declared mapper: a void function
// taking (handle, base, begin, size-in-bytes, map type, map name) that walks
// the mapped array section element by element and pushes one runtime map
// component per entry produced by GenMapInfoCB, applying OpenMP 5.0
// map-type decay against the incoming MapType.
9745                                    llvm::Value *BeginArg)>
9746         GenMapInfoCB,
9747     Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) {
9748   SmallVector<Type *> Params;
9749   Params.emplace_back(Builder.getPtrTy());
9750   Params.emplace_back(Builder.getPtrTy());
9751   Params.emplace_back(Builder.getPtrTy());
9752   Params.emplace_back(Builder.getInt64Ty());
9753   Params.emplace_back(Builder.getInt64Ty());
9754   Params.emplace_back(Builder.getPtrTy());
9755
9756   auto *FnTy =
9757       FunctionType::get(Builder.getVoidTy(), Params, /* IsVarArg */ false);
9758
9759   SmallString<64> TyStr;
9760   raw_svector_ostream Out(TyStr);
9761   Function *MapperFn =
9763   MapperFn->addFnAttr(Attribute::NoInline);
9764   MapperFn->addFnAttr(Attribute::NoUnwind);
9765   MapperFn->addParamAttr(0, Attribute::NoUndef);
9766   MapperFn->addParamAttr(1, Attribute::NoUndef);
9767   MapperFn->addParamAttr(2, Attribute::NoUndef);
9768   MapperFn->addParamAttr(3, Attribute::NoUndef);
9769   MapperFn->addParamAttr(4, Attribute::NoUndef);
9770   MapperFn->addParamAttr(5, Attribute::NoUndef);
9771
9772   // Start the mapper function code generation.
9773   BasicBlock *EntryBB = BasicBlock::Create(M.getContext(), "entry", MapperFn);
9774   auto SavedIP = Builder.saveIP();
9775   Builder.SetInsertPoint(EntryBB);
9776
9777   Value *MapperHandle = MapperFn->getArg(0);
9778   Value *BaseIn = MapperFn->getArg(1);
9779   Value *BeginIn = MapperFn->getArg(2);
9780   Value *Size = MapperFn->getArg(3);
9781   Value *MapType = MapperFn->getArg(4);
9782   Value *MapName = MapperFn->getArg(5);
9783
9784   // Compute the starting and end addresses of array elements.
9785   // Prepare common arguments for array initiation and deletion.
9786   // Convert the size in bytes into the number of array elements.
9787   TypeSize ElementSize = M.getDataLayout().getTypeStoreSize(ElemTy);
9788   Size = Builder.CreateExactUDiv(Size, Builder.getInt64(ElementSize));
9789   Value *PtrBegin = BeginIn;
9790   Value *PtrEnd = Builder.CreateGEP(ElemTy, PtrBegin, Size);
9791
9792   // Emit array initiation if this is an array section and \p MapType indicates
9793   // that memory allocation is required.
9794   BasicBlock *HeadBB = BasicBlock::Create(M.getContext(), "omp.arraymap.head");
9795   emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9796                              MapType, MapName, ElementSize, HeadBB,
9797                              /*IsInit=*/true);
9798
9799   // Emit a for loop to iterate through SizeArg of elements and map all of them.
9800
9801   // Emit the loop header block.
9802   emitBlock(HeadBB, MapperFn);
9803   BasicBlock *BodyBB = BasicBlock::Create(M.getContext(), "omp.arraymap.body");
9804   BasicBlock *DoneBB = BasicBlock::Create(M.getContext(), "omp.done");
9805   // Evaluate whether the initial condition is satisfied.
9806   Value *IsEmpty =
9807       Builder.CreateICmpEQ(PtrBegin, PtrEnd, "omp.arraymap.isempty");
9808   Builder.CreateCondBr(IsEmpty, DoneBB, BodyBB);
9809
9810   // Emit the loop body block.
9811   emitBlock(BodyBB, MapperFn);
9812   BasicBlock *LastBB = BodyBB;
9813   PHINode *PtrPHI =
9814       Builder.CreatePHI(PtrBegin->getType(), 2, "omp.arraymap.ptrcurrent");
9815   PtrPHI->addIncoming(PtrBegin, HeadBB);
9816
9817   // Get map clause information. Fill up the arrays with all mapped variables.
9818   MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn);
9819   if (!Info)
9820     return Info.takeError();
9821
9822   // Call the runtime API __tgt_mapper_num_components to get the number of
9823   // pre-existing components.
9824   Value *OffloadingArgs[] = {MapperHandle};
9825   Value *PreviousSize = createRuntimeFunctionCall(
9826       getOrCreateRuntimeFunction(M, OMPRTL___tgt_mapper_num_components),
9827       OffloadingArgs);
9828   Value *ShiftedPreviousSize =
9829       Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset()));
9830
9831   // Fill up the runtime mapper handle for all components.
9832   for (unsigned I = 0; I < Info->BasePointers.size(); ++I) {
9833     Value *CurBaseArg = Info->BasePointers[I];
9834     Value *CurBeginArg = Info->Pointers[I];
9835     Value *CurSizeArg = Info->Sizes[I];
9836     Value *CurNameArg = Info->Names.size()
9837                             ? Info->Names[I]
9838                             : Constant::getNullValue(Builder.getPtrTy());
9839
9840     // Extract the MEMBER_OF field from the map type.
9841     Value *OriMapType = Builder.getInt64(
9842         static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9843             Info->Types[I]));
9844     Value *MemberMapType =
9845         Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize);
9846
9847     // Combine the map type inherited from user-defined mapper with that
9848     // specified in the program. According to the OMP_MAP_TO and OMP_MAP_FROM
9849     // bits of the \a MapType, which is the input argument of the mapper
9850     // function, the following code will set the OMP_MAP_TO and OMP_MAP_FROM
9851     // bits of MemberMapType.
9852     // [OpenMP 5.0], 1.2.6. map-type decay.
9853     //        | alloc |  to   | from  | tofrom | release | delete
9854     // ----------------------------------------------------------
9855     // alloc  | alloc | alloc | alloc | alloc  | release | delete
9856     // to     | alloc | to    | alloc | to     | release | delete
9857     // from   | alloc | alloc | from  | from   | release | delete
9858     // tofrom | alloc | to    | from  | tofrom | release | delete
9859     Value *LeftToFrom = Builder.CreateAnd(
9860         MapType,
9861         Builder.getInt64(
9862             static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9863                 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9864                 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9865     BasicBlock *AllocBB = BasicBlock::Create(M.getContext(), "omp.type.alloc");
9866     BasicBlock *AllocElseBB =
9867         BasicBlock::Create(M.getContext(), "omp.type.alloc.else");
9868     BasicBlock *ToBB = BasicBlock::Create(M.getContext(), "omp.type.to");
9869     BasicBlock *ToElseBB =
9870         BasicBlock::Create(M.getContext(), "omp.type.to.else");
9871     BasicBlock *FromBB = BasicBlock::Create(M.getContext(), "omp.type.from");
9872     BasicBlock *EndBB = BasicBlock::Create(M.getContext(), "omp.type.end");
9873     Value *IsAlloc = Builder.CreateIsNull(LeftToFrom);
9874     Builder.CreateCondBr(IsAlloc, AllocBB, AllocElseBB);
9875     // In case of alloc, clear OMP_MAP_TO and OMP_MAP_FROM.
9876     emitBlock(AllocBB, MapperFn);
9877     Value *AllocMapType = Builder.CreateAnd(
9878         MemberMapType,
9879         Builder.getInt64(
9880             ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9881                 OpenMPOffloadMappingFlags::OMP_MAP_TO |
9882                 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9883     Builder.CreateBr(EndBB);
9884     emitBlock(AllocElseBB, MapperFn);
9885     Value *IsTo = Builder.CreateICmpEQ(
9886         LeftToFrom,
9887         Builder.getInt64(
9888             static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9889                 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9890     Builder.CreateCondBr(IsTo, ToBB, ToElseBB);
9891     // In case of to, clear OMP_MAP_FROM.
9892     emitBlock(ToBB, MapperFn);
9893     Value *ToMapType = Builder.CreateAnd(
9894         MemberMapType,
9895         Builder.getInt64(
9896             ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9897                 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9898     Builder.CreateBr(EndBB);
9899     emitBlock(ToElseBB, MapperFn);
9900     Value *IsFrom = Builder.CreateICmpEQ(
9901         LeftToFrom,
9902         Builder.getInt64(
9903             static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9904                 OpenMPOffloadMappingFlags::OMP_MAP_FROM)));
9905     Builder.CreateCondBr(IsFrom, FromBB, EndBB);
9906     // In case of from, clear OMP_MAP_TO.
9907     emitBlock(FromBB, MapperFn);
9908     Value *FromMapType = Builder.CreateAnd(
9909         MemberMapType,
9910         Builder.getInt64(
9911             ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
9912                 OpenMPOffloadMappingFlags::OMP_MAP_TO)));
9913     // In case of tofrom, do nothing.
9914     emitBlock(EndBB, MapperFn);
9915     LastBB = EndBB;
    // Merge the four decay outcomes into the final per-component map type.
9916     PHINode *CurMapType =
9917         Builder.CreatePHI(Builder.getInt64Ty(), 4, "omp.maptype");
9918     CurMapType->addIncoming(AllocMapType, AllocBB);
9919     CurMapType->addIncoming(ToMapType, ToBB);
9920     CurMapType->addIncoming(FromMapType, FromBB);
9921     CurMapType->addIncoming(MemberMapType, ToElseBB);
9922
9923     Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg,
9924                                CurSizeArg,   CurMapType, CurNameArg};
9925
9926     auto ChildMapperFn = CustomMapperCB(I);
9927     if (!ChildMapperFn)
9928       return ChildMapperFn.takeError();
9929     if (*ChildMapperFn) {
9930       // Call the corresponding mapper function.
9931       createRuntimeFunctionCall(*ChildMapperFn, OffloadingArgs)
9932           ->setDoesNotThrow();
9933     } else {
9934       // Call the runtime API __tgt_push_mapper_component to fill up the runtime
9935       // data structure.
9937           getOrCreateRuntimeFunction(M, OMPRTL___tgt_push_mapper_component),
9938           OffloadingArgs);
9939     }
9940   }
9941
9942   // Update the pointer to point to the next element that needs to be mapped,
9943   // and check whether we have mapped all elements.
9944   Value *PtrNext = Builder.CreateConstGEP1_32(ElemTy, PtrPHI, /*Idx0=*/1,
9945                                               "omp.arraymap.next");
9946   PtrPHI->addIncoming(PtrNext, LastBB);
9947   Value *IsDone = Builder.CreateICmpEQ(PtrNext, PtrEnd, "omp.arraymap.isdone");
9948   BasicBlock *ExitBB = BasicBlock::Create(M.getContext(), "omp.arraymap.exit");
9949   Builder.CreateCondBr(IsDone, ExitBB, BodyBB);
9950
9951   emitBlock(ExitBB, MapperFn);
9952   // Emit array deletion if this is an array section and \p MapType indicates
9953   // that deletion is required.
9954   emitUDMapperArrayInitOrDel(MapperFn, MapperHandle, BaseIn, BeginIn, Size,
9955                              MapType, MapName, ElementSize, DoneBB,
9956                              /*IsInit=*/false);
9957
9958   // Emit the function exit block.
9959   emitBlock(DoneBB, MapperFn, /*IsFinished=*/true);
9960
9961   Builder.CreateRetVoid();
9962   Builder.restoreIP(SavedIP);
9963   return MapperFn;
9964 }
9965
9967 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
9968 TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB,
9969 bool IsNonContiguous,
9970 function_ref<void(unsigned int, Value *)> DeviceAddrCB) {
9971
9972 // Reset the array information.
9973 Info.clearArrayInfo();
9974 Info.NumberOfPtrs = CombinedInfo.BasePointers.size();
9975
9976 if (Info.NumberOfPtrs == 0)
9977 return Error::success();
9978
9979 Builder.restoreIP(AllocaIP);
9980 // Detect if we have any capture size requiring runtime evaluation of the
9981 // size so that a constant array could be eventually used.
9982 ArrayType *PointerArrayType =
9983 ArrayType::get(Builder.getPtrTy(), Info.NumberOfPtrs);
9984
9985 Info.RTArgs.BasePointersArray = Builder.CreateAlloca(
9986 PointerArrayType, /* ArraySize = */ nullptr, ".offload_baseptrs");
9987
9988 Info.RTArgs.PointersArray = Builder.CreateAlloca(
9989 PointerArrayType, /* ArraySize = */ nullptr, ".offload_ptrs");
9990 AllocaInst *MappersArray = Builder.CreateAlloca(
9991 PointerArrayType, /* ArraySize = */ nullptr, ".offload_mappers");
9992 Info.RTArgs.MappersArray = MappersArray;
9993
9994 // If we don't have any VLA types or other types that require runtime
9995 // evaluation, we can use a constant array for the map sizes, otherwise we
9996 // need to fill up the arrays as we do for the pointers.
9997 Type *Int64Ty = Builder.getInt64Ty();
9998 SmallVector<Constant *> ConstSizes(CombinedInfo.Sizes.size(),
9999 ConstantInt::get(Int64Ty, 0));
10000 SmallBitVector RuntimeSizes(CombinedInfo.Sizes.size());
10001 for (unsigned I = 0, E = CombinedInfo.Sizes.size(); I < E; ++I) {
10002 bool IsNonContigEntry =
10003 IsNonContiguous &&
10004 (static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10005 CombinedInfo.Types[I] &
10006 OpenMPOffloadMappingFlags::OMP_MAP_NON_CONTIG) != 0);
10007 // For NON_CONTIG entries, ArgSizes stores the dimension count (number of
10008 // descriptor_dim records), not the byte size.
10009 if (IsNonContigEntry) {
10010 assert(I < CombinedInfo.NonContigInfo.Dims.size() &&
10011 "Index must be in-bounds for NON_CONTIG Dims array");
10012 const uint64_t DimCount = CombinedInfo.NonContigInfo.Dims[I];
10013 assert(DimCount > 0 && "NON_CONTIG DimCount must be > 0");
10014 ConstSizes[I] = ConstantInt::get(Int64Ty, DimCount);
10015 continue;
10016 }
10017 if (auto *CI = dyn_cast<Constant>(CombinedInfo.Sizes[I])) {
10018 if (!isa<ConstantExpr>(CI) && !isa<GlobalValue>(CI)) {
10019 ConstSizes[I] = CI;
10020 continue;
10021 }
10022 }
10023 RuntimeSizes.set(I);
10024 }
10025
10026 if (RuntimeSizes.all()) {
10027 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10028 Info.RTArgs.SizesArray = Builder.CreateAlloca(
10029 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10030 restoreIPandDebugLoc(Builder, CodeGenIP);
10031 } else {
10032 auto *SizesArrayInit = ConstantArray::get(
10033 ArrayType::get(Int64Ty, ConstSizes.size()), ConstSizes);
10034 std::string Name = createPlatformSpecificName({"offload_sizes"});
10035 auto *SizesArrayGbl =
10036 new GlobalVariable(M, SizesArrayInit->getType(), /*isConstant=*/true,
10037 GlobalValue::PrivateLinkage, SizesArrayInit, Name);
10038 SizesArrayGbl->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
10039
10040 if (!RuntimeSizes.any()) {
10041 Info.RTArgs.SizesArray = SizesArrayGbl;
10042 } else {
10043 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10044 Align OffloadSizeAlign = M.getDataLayout().getABIIntegerTypeAlignment(64);
10045 ArrayType *SizeArrayType = ArrayType::get(Int64Ty, Info.NumberOfPtrs);
10046 AllocaInst *Buffer = Builder.CreateAlloca(
10047 SizeArrayType, /* ArraySize = */ nullptr, ".offload_sizes");
10048 Buffer->setAlignment(OffloadSizeAlign);
10049 restoreIPandDebugLoc(Builder, CodeGenIP);
10050 Builder.CreateMemCpy(
10051 Buffer, M.getDataLayout().getPrefTypeAlign(Buffer->getType()),
10052 SizesArrayGbl, OffloadSizeAlign,
10053 Builder.getIntN(
10054 IndexSize,
10055 Buffer->getAllocationSize(M.getDataLayout())->getFixedValue()));
10056
10057 Info.RTArgs.SizesArray = Buffer;
10058 }
10059 restoreIPandDebugLoc(Builder, CodeGenIP);
10060 }
10061
10062 // The map types are always constant so we don't need to generate code to
10063 // fill arrays. Instead, we create an array constant.
10065 for (auto mapFlag : CombinedInfo.Types)
10066 Mapping.push_back(
10067 static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10068 mapFlag));
10069 std::string MaptypesName = createPlatformSpecificName({"offload_maptypes"});
10070 auto *MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10071 Info.RTArgs.MapTypesArray = MapTypesArrayGbl;
10072
10073 // The information types are only built if provided.
10074 if (!CombinedInfo.Names.empty()) {
10075 auto *MapNamesArrayGbl = createOffloadMapnames(
10076 CombinedInfo.Names, createPlatformSpecificName({"offload_mapnames"}));
10077 Info.RTArgs.MapNamesArray = MapNamesArrayGbl;
10078 Info.EmitDebug = true;
10079 } else {
10080 Info.RTArgs.MapNamesArray =
10082 Info.EmitDebug = false;
10083 }
10084
10085 // If there's a present map type modifier, it must not be applied to the end
10086 // of a region, so generate a separate map type array in that case.
10087 if (Info.separateBeginEndCalls()) {
10088 bool EndMapTypesDiffer = false;
10089 for (uint64_t &Type : Mapping) {
10090 if (Type & static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10091 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT)) {
10092 Type &= ~static_cast<std::underlying_type_t<OpenMPOffloadMappingFlags>>(
10093 OpenMPOffloadMappingFlags::OMP_MAP_PRESENT);
10094 EndMapTypesDiffer = true;
10095 }
10096 }
10097 if (EndMapTypesDiffer) {
10098 MapTypesArrayGbl = createOffloadMaptypes(Mapping, MaptypesName);
10099 Info.RTArgs.MapTypesArrayEnd = MapTypesArrayGbl;
10100 }
10101 }
10102
10103 PointerType *PtrTy = Builder.getPtrTy();
10104 for (unsigned I = 0; I < Info.NumberOfPtrs; ++I) {
10105 Value *BPVal = CombinedInfo.BasePointers[I];
10106 Value *BP = Builder.CreateConstInBoundsGEP2_32(
10107 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.BasePointersArray,
10108 0, I);
10109 Builder.CreateAlignedStore(BPVal, BP,
10110 M.getDataLayout().getPrefTypeAlign(PtrTy));
10111
10112 if (Info.requiresDevicePointerInfo()) {
10113 if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Pointer) {
10114 CodeGenIP = Builder.saveIP();
10115 Builder.restoreIP(AllocaIP);
10116 Info.DevicePtrInfoMap[BPVal] = {BP, Builder.CreateAlloca(PtrTy)};
10117 Builder.restoreIP(CodeGenIP);
10118 if (DeviceAddrCB)
10119 DeviceAddrCB(I, Info.DevicePtrInfoMap[BPVal].second);
10120 } else if (CombinedInfo.DevicePointers[I] == DeviceInfoTy::Address) {
10121 Info.DevicePtrInfoMap[BPVal] = {BP, BP};
10122 if (DeviceAddrCB)
10123 DeviceAddrCB(I, BP);
10124 }
10125 }
10126
10127 Value *PVal = CombinedInfo.Pointers[I];
10128 Value *P = Builder.CreateConstInBoundsGEP2_32(
10129 ArrayType::get(PtrTy, Info.NumberOfPtrs), Info.RTArgs.PointersArray, 0,
10130 I);
10131 // TODO: Check alignment correct.
10132 Builder.CreateAlignedStore(PVal, P,
10133 M.getDataLayout().getPrefTypeAlign(PtrTy));
10134
10135 if (RuntimeSizes.test(I)) {
10136 Value *S = Builder.CreateConstInBoundsGEP2_32(
10137 ArrayType::get(Int64Ty, Info.NumberOfPtrs), Info.RTArgs.SizesArray,
10138 /*Idx0=*/0,
10139 /*Idx1=*/I);
10140 Builder.CreateAlignedStore(Builder.CreateIntCast(CombinedInfo.Sizes[I],
10141 Int64Ty,
10142 /*isSigned=*/true),
10143 S, M.getDataLayout().getPrefTypeAlign(PtrTy));
10144 }
10145 // Fill up the mapper array.
10146 unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0);
10147 Value *MFunc = ConstantPointerNull::get(PtrTy);
10148
10149 auto CustomMFunc = CustomMapperCB(I);
10150 if (!CustomMFunc)
10151 return CustomMFunc.takeError();
10152 if (*CustomMFunc)
10153 MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy);
10154
10155 Value *MAddr = Builder.CreateInBoundsGEP(
10156 PointerArrayType, MappersArray,
10157 {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)});
10158 Builder.CreateAlignedStore(
10159 MFunc, MAddr, M.getDataLayout().getPrefTypeAlign(MAddr->getType()));
10160 }
10161
10162 if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() ||
10163 Info.NumberOfPtrs == 0)
10164 return Error::success();
10165 emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info);
10166 return Error::success();
10167}
10168
10170 BasicBlock *CurBB = Builder.GetInsertBlock();
10171
10172 if (!CurBB || CurBB->hasTerminator()) {
10173 // If there is no insert point or the previous block is already
10174 // terminated, don't touch it.
10175 } else {
10176 // Otherwise, create a fall-through branch.
10177 Builder.CreateBr(Target);
10178 }
10179
10180 Builder.ClearInsertionPoint();
10181}
10182
10184 bool IsFinished) {
10185 BasicBlock *CurBB = Builder.GetInsertBlock();
10186
10187 // Fall out of the current block (if necessary).
10188 emitBranch(BB);
10189
10190 if (IsFinished && BB->use_empty()) {
10191 BB->eraseFromParent();
10192 return;
10193 }
10194
10195 // Place the block after the current block, if possible, or else at
10196 // the end of the function.
10197 if (CurBB && CurBB->getParent())
10198 CurFn->insert(std::next(CurBB->getIterator()), BB);
10199 else
10200 CurFn->insert(CurFn->end(), BB);
10201 Builder.SetInsertPoint(BB);
10202}
10203
10205 BodyGenCallbackTy ElseGen,
10206 InsertPointTy AllocaIP) {
10207 // If the condition constant folds and can be elided, try to avoid emitting
10208 // the condition and the dead arm of the if/else.
10209 if (auto *CI = dyn_cast<ConstantInt>(Cond)) {
10210 auto CondConstant = CI->getSExtValue();
10211 if (CondConstant)
10212 return ThenGen(AllocaIP, Builder.saveIP());
10213
10214 return ElseGen(AllocaIP, Builder.saveIP());
10215 }
10216
10217 Function *CurFn = Builder.GetInsertBlock()->getParent();
10218
10219 // Otherwise, the condition did not fold, or we couldn't elide it. Just
10220 // emit the conditional branch.
10221 BasicBlock *ThenBlock = BasicBlock::Create(M.getContext(), "omp_if.then");
10222 BasicBlock *ElseBlock = BasicBlock::Create(M.getContext(), "omp_if.else");
10223 BasicBlock *ContBlock = BasicBlock::Create(M.getContext(), "omp_if.end");
10224 Builder.CreateCondBr(Cond, ThenBlock, ElseBlock);
10225 // Emit the 'then' code.
10226 emitBlock(ThenBlock, CurFn);
10227 if (Error Err = ThenGen(AllocaIP, Builder.saveIP()))
10228 return Err;
10229 emitBranch(ContBlock);
10230 // Emit the 'else' code if present.
10231 // There is no need to emit line number for unconditional branch.
10232 emitBlock(ElseBlock, CurFn);
10233 if (Error Err = ElseGen(AllocaIP, Builder.saveIP()))
10234 return Err;
10235 // There is no need to emit line number for unconditional branch.
10236 emitBranch(ContBlock);
10237 // Emit the continuation block for code after the if.
10238 emitBlock(ContBlock, CurFn, /*IsFinished=*/true);
10239 return Error::success();
10240}
10241
10242bool OpenMPIRBuilder::checkAndEmitFlushAfterAtomic(
10243 const LocationDescription &Loc, llvm::AtomicOrdering AO, AtomicKind AK) {
10246 "Unexpected Atomic Ordering.");
10247
10248 bool Flush = false;
10250
10251 switch (AK) {
10252 case Read:
10255 FlushAO = AtomicOrdering::Acquire;
10256 Flush = true;
10257 }
10258 break;
10259 case Write:
10260 case Compare:
10261 case Update:
10264 FlushAO = AtomicOrdering::Release;
10265 Flush = true;
10266 }
10267 break;
10268 case Capture:
10269 switch (AO) {
10271 FlushAO = AtomicOrdering::Acquire;
10272 Flush = true;
10273 break;
10275 FlushAO = AtomicOrdering::Release;
10276 Flush = true;
10277 break;
10281 Flush = true;
10282 break;
10283 default:
10284 // do nothing - leave silently.
10285 break;
10286 }
10287 }
10288
10289 if (Flush) {
10290 // Currently Flush RT call still doesn't take memory_ordering, so for when
10291 // that happens, this tries to do the resolution of which atomic ordering
10292 // to use with but issue the flush call
10293 // TODO: pass `FlushAO` after memory ordering support is added
10294 (void)FlushAO;
10295 emitFlush(Loc);
10296 }
10297
10298 // for AO == AtomicOrdering::Monotonic and all other case combinations
10299 // do nothing
10300 return Flush;
10301}
10302
10306 AtomicOrdering AO, InsertPointTy AllocaIP) {
10307 if (!updateToLocation(Loc))
10308 return Loc.IP;
10309
10310 assert(X.Var->getType()->isPointerTy() &&
10311 "OMP Atomic expects a pointer to target memory");
10312 Type *XElemTy = X.ElemTy;
10313 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10314 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10315 "OMP atomic read expected a scalar type");
10316
10317 Value *XRead = nullptr;
10318
10319 if (XElemTy->isIntegerTy()) {
10320 LoadInst *XLD =
10321 Builder.CreateLoad(XElemTy, X.Var, X.IsVolatile, "omp.atomic.read");
10322 XLD->setAtomic(AO);
10323 XRead = cast<Value>(XLD);
10324 } else if (XElemTy->isStructTy()) {
10325 // FIXME: Add checks to ensure __atomic_load is emitted iff the
10326 // target does not support `atomicrmw` of the size of the struct
10327 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10328 OldVal->setAtomic(AO);
10329 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10330 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10331 OpenMPIRBuilder::AtomicInfo atomicInfo(
10332 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10333 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10334 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10335 XRead = AtomicLoadRes.first;
10336 OldVal->eraseFromParent();
10337 } else {
10338 // We need to perform atomic op as integer
10339 IntegerType *IntCastTy =
10340 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10341 LoadInst *XLoad =
10342 Builder.CreateLoad(IntCastTy, X.Var, X.IsVolatile, "omp.atomic.load");
10343 XLoad->setAtomic(AO);
10344 if (XElemTy->isFloatingPointTy()) {
10345 XRead = Builder.CreateBitCast(XLoad, XElemTy, "atomic.flt.cast");
10346 } else {
10347 XRead = Builder.CreateIntToPtr(XLoad, XElemTy, "atomic.ptr.cast");
10348 }
10349 }
10350 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Read);
10351 Builder.CreateStore(XRead, V.Var, V.IsVolatile);
10352 return Builder.saveIP();
10353}
10354
10357 AtomicOpValue &X, Value *Expr,
10358 AtomicOrdering AO, InsertPointTy AllocaIP) {
10359 if (!updateToLocation(Loc))
10360 return Loc.IP;
10361
10362 assert(X.Var->getType()->isPointerTy() &&
10363 "OMP Atomic expects a pointer to target memory");
10364 Type *XElemTy = X.ElemTy;
10365 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10366 XElemTy->isPointerTy() || XElemTy->isStructTy()) &&
10367 "OMP atomic write expected a scalar type");
10368
10369 if (XElemTy->isIntegerTy()) {
10370 StoreInst *XSt = Builder.CreateStore(Expr, X.Var, X.IsVolatile);
10371 XSt->setAtomic(AO);
10372 } else if (XElemTy->isStructTy()) {
10373 LoadInst *OldVal = Builder.CreateLoad(XElemTy, X.Var, "omp.atomic.read");
10374 const DataLayout &DL = OldVal->getModule()->getDataLayout();
10375 unsigned LoadSize = DL.getTypeStoreSize(XElemTy);
10376 OpenMPIRBuilder::AtomicInfo atomicInfo(
10377 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10378 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X.Var);
10379 atomicInfo.EmitAtomicStoreLibcall(AO, Expr);
10380 OldVal->eraseFromParent();
10381 } else {
10382 // We need to bitcast and perform atomic op as integers
10383 IntegerType *IntCastTy =
10384 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10385 Value *ExprCast =
10386 Builder.CreateBitCast(Expr, IntCastTy, "atomic.src.int.cast");
10387 StoreInst *XSt = Builder.CreateStore(ExprCast, X.Var, X.IsVolatile);
10388 XSt->setAtomic(AO);
10389 }
10390
10391 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Write);
10392 return Builder.saveIP();
10393}
10394
10397 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
10398 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
10399 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10400 assert(!isConflictIP(Loc.IP, AllocaIP) && "IPs must not be ambiguous");
10401 if (!updateToLocation(Loc))
10402 return Loc.IP;
10403
10404 LLVM_DEBUG({
10405 Type *XTy = X.Var->getType();
10406 assert(XTy->isPointerTy() &&
10407 "OMP Atomic expects a pointer to target memory");
10408 Type *XElemTy = X.ElemTy;
10409 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10410 XElemTy->isPointerTy()) &&
10411 "OMP atomic update expected a scalar type");
10412 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10413 (RMWOp != AtomicRMWInst::UMax) && (RMWOp != AtomicRMWInst::UMin) &&
10414 "OpenMP atomic does not support LT or GT operations");
10415 });
10416
10417 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10418 AllocaIP, X.Var, X.ElemTy, Expr, AO, RMWOp, UpdateOp, X.IsVolatile,
10419 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10420 if (!AtomicResult)
10421 return AtomicResult.takeError();
10422 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Update);
10423 return Builder.saveIP();
10424}
10425
10426// FIXME: Duplicating AtomicExpand
10427Value *OpenMPIRBuilder::emitRMWOpAsInstruction(Value *Src1, Value *Src2,
10428 AtomicRMWInst::BinOp RMWOp) {
10429 switch (RMWOp) {
10430 case AtomicRMWInst::Add:
10431 return Builder.CreateAdd(Src1, Src2);
10432 case AtomicRMWInst::Sub:
10433 return Builder.CreateSub(Src1, Src2);
10434 case AtomicRMWInst::And:
10435 return Builder.CreateAnd(Src1, Src2);
10437 return Builder.CreateNeg(Builder.CreateAnd(Src1, Src2));
10438 case AtomicRMWInst::Or:
10439 return Builder.CreateOr(Src1, Src2);
10440 case AtomicRMWInst::Xor:
10441 return Builder.CreateXor(Src1, Src2);
10446 case AtomicRMWInst::Max:
10447 case AtomicRMWInst::Min:
10460 llvm_unreachable("Unsupported atomic update operation");
10461 }
10462 llvm_unreachable("Unsupported atomic update operation");
10463}
10464
10465Expected<std::pair<Value *, Value *>> OpenMPIRBuilder::emitAtomicUpdate(
10466 InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
10468 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX, bool IsXBinopExpr,
10469 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10470 // TODO: handle the case where XElemTy is not byte-sized or not a power of 2
10471 // or a complex datatype.
10472 bool emitRMWOp = false;
10473 switch (RMWOp) {
10474 case AtomicRMWInst::Add:
10475 case AtomicRMWInst::And:
10477 case AtomicRMWInst::Or:
10478 case AtomicRMWInst::Xor:
10480 emitRMWOp = XElemTy;
10481 break;
10482 case AtomicRMWInst::Sub:
10483 emitRMWOp = (IsXBinopExpr && XElemTy);
10484 break;
10485 default:
10486 emitRMWOp = false;
10487 }
10488 emitRMWOp &= XElemTy->isIntegerTy();
10489
10490 std::pair<Value *, Value *> Res;
10491 if (emitRMWOp) {
10492 AtomicRMWInst *RMWInst =
10493 Builder.CreateAtomicRMW(RMWOp, X, Expr, llvm::MaybeAlign(), AO);
10494 if (T.isAMDGPU()) {
10495 if (IsIgnoreDenormalMode)
10496 RMWInst->setMetadata("amdgpu.ignore.denormal.mode",
10497 llvm::MDNode::get(Builder.getContext(), {}));
10498 if (!IsFineGrainedMemory)
10499 RMWInst->setMetadata("amdgpu.no.fine.grained.memory",
10500 llvm::MDNode::get(Builder.getContext(), {}));
10501 if (!IsRemoteMemory)
10502 RMWInst->setMetadata("amdgpu.no.remote.memory",
10503 llvm::MDNode::get(Builder.getContext(), {}));
10504 }
10505 Res.first = RMWInst;
10506 // not needed except in case of postfix captures. Generate anyway for
10507 // consistency with the else part. Will be removed with any DCE pass.
10508 // AtomicRMWInst::Xchg does not have a coressponding instruction.
10509 if (RMWOp == AtomicRMWInst::Xchg)
10510 Res.second = Res.first;
10511 else
10512 Res.second = emitRMWOpAsInstruction(Res.first, Expr, RMWOp);
10513 } else if (RMWOp == llvm::AtomicRMWInst::BinOp::BAD_BINOP &&
10514 XElemTy->isStructTy()) {
10515 LoadInst *OldVal =
10516 Builder.CreateLoad(XElemTy, X, X->getName() + ".atomic.load");
10517 OldVal->setAtomic(AO);
10518 const DataLayout &LoadDL = OldVal->getModule()->getDataLayout();
10519 unsigned LoadSize =
10520 LoadDL.getTypeStoreSize(OldVal->getPointerOperand()->getType());
10521
10522 OpenMPIRBuilder::AtomicInfo atomicInfo(
10523 &Builder, XElemTy, LoadSize * 8, LoadSize * 8, OldVal->getAlign(),
10524 OldVal->getAlign(), true /* UseLibcall */, AllocaIP, X);
10525 auto AtomicLoadRes = atomicInfo.EmitAtomicLoadLibcall(AO);
10526 BasicBlock *CurBB = Builder.GetInsertBlock();
10527 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10528 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10529 BasicBlock *ExitBB =
10530 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10531 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10532 X->getName() + ".atomic.cont");
10533 ContBB->getTerminator()->eraseFromParent();
10534 Builder.restoreIP(AllocaIP);
10535 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10536 NewAtomicAddr->setName(X->getName() + "x.new.val");
10537 Builder.SetInsertPoint(ContBB);
10538 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10539 PHI->addIncoming(AtomicLoadRes.first, CurBB);
10540 Value *OldExprVal = PHI;
10541 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10542 if (!CBResult)
10543 return CBResult.takeError();
10544 Value *Upd = *CBResult;
10545 Builder.CreateStore(Upd, NewAtomicAddr);
10548 auto Result = atomicInfo.EmitAtomicCompareExchangeLibcall(
10549 AtomicLoadRes.second, NewAtomicAddr, AO, Failure);
10550 LoadInst *PHILoad = Builder.CreateLoad(XElemTy, Result.first);
10551 PHI->addIncoming(PHILoad, Builder.GetInsertBlock());
10552 Builder.CreateCondBr(Result.second, ExitBB, ContBB);
10553 OldVal->eraseFromParent();
10554 Res.first = OldExprVal;
10555 Res.second = Upd;
10556
10557 if (UnreachableInst *ExitTI =
10559 CurBBTI->eraseFromParent();
10560 Builder.SetInsertPoint(ExitBB);
10561 } else {
10562 Builder.SetInsertPoint(ExitTI);
10563 }
10564 } else {
10565 IntegerType *IntCastTy =
10566 IntegerType::get(M.getContext(), XElemTy->getScalarSizeInBits());
10567 LoadInst *OldVal =
10568 Builder.CreateLoad(IntCastTy, X, X->getName() + ".atomic.load");
10569 OldVal->setAtomic(AO);
10570 // CurBB
10571 // | /---\
10572 // ContBB |
10573 // | \---/
10574 // ExitBB
10575 BasicBlock *CurBB = Builder.GetInsertBlock();
10576 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10577 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10578 BasicBlock *ExitBB =
10579 CurBB->splitBasicBlock(CurBBTI, X->getName() + ".atomic.exit");
10580 BasicBlock *ContBB = CurBB->splitBasicBlock(CurBB->getTerminator(),
10581 X->getName() + ".atomic.cont");
10582 ContBB->getTerminator()->eraseFromParent();
10583 Builder.restoreIP(AllocaIP);
10584 AllocaInst *NewAtomicAddr = Builder.CreateAlloca(XElemTy);
10585 NewAtomicAddr->setName(X->getName() + "x.new.val");
10586 Builder.SetInsertPoint(ContBB);
10587 llvm::PHINode *PHI = Builder.CreatePHI(OldVal->getType(), 2);
10588 PHI->addIncoming(OldVal, CurBB);
10589 bool IsIntTy = XElemTy->isIntegerTy();
10590 Value *OldExprVal = PHI;
10591 if (!IsIntTy) {
10592 if (XElemTy->isFloatingPointTy()) {
10593 OldExprVal = Builder.CreateBitCast(PHI, XElemTy,
10594 X->getName() + ".atomic.fltCast");
10595 } else {
10596 OldExprVal = Builder.CreateIntToPtr(PHI, XElemTy,
10597 X->getName() + ".atomic.ptrCast");
10598 }
10599 }
10600
10601 Expected<Value *> CBResult = UpdateOp(OldExprVal, Builder);
10602 if (!CBResult)
10603 return CBResult.takeError();
10604 Value *Upd = *CBResult;
10605 Builder.CreateStore(Upd, NewAtomicAddr);
10606 LoadInst *DesiredVal = Builder.CreateLoad(IntCastTy, NewAtomicAddr);
10609 AtomicCmpXchgInst *Result = Builder.CreateAtomicCmpXchg(
10610 X, PHI, DesiredVal, llvm::MaybeAlign(), AO, Failure);
10611 Result->setVolatile(VolatileX);
10612 Value *PreviousVal = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10613 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10614 PHI->addIncoming(PreviousVal, Builder.GetInsertBlock());
10615 Builder.CreateCondBr(SuccessFailureVal, ExitBB, ContBB);
10616
10617 Res.first = OldExprVal;
10618 Res.second = Upd;
10619
10620 // set Insertion point in exit block
10621 if (UnreachableInst *ExitTI =
10623 CurBBTI->eraseFromParent();
10624 Builder.SetInsertPoint(ExitBB);
10625 } else {
10626 Builder.SetInsertPoint(ExitTI);
10627 }
10628 }
10629
10630 return Res;
10631}
10632
10635 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
10636 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
10637 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
10638 bool IsIgnoreDenormalMode, bool IsFineGrainedMemory, bool IsRemoteMemory) {
10639 if (!updateToLocation(Loc))
10640 return Loc.IP;
10641
10642 LLVM_DEBUG({
10643 Type *XTy = X.Var->getType();
10644 assert(XTy->isPointerTy() &&
10645 "OMP Atomic expects a pointer to target memory");
10646 Type *XElemTy = X.ElemTy;
10647 assert((XElemTy->isFloatingPointTy() || XElemTy->isIntegerTy() ||
10648 XElemTy->isPointerTy()) &&
10649 "OMP atomic capture expected a scalar type");
10650 assert((RMWOp != AtomicRMWInst::Max) && (RMWOp != AtomicRMWInst::Min) &&
10651 "OpenMP atomic does not support LT or GT operations");
10652 });
10653
10654 // If UpdateExpr is 'x' updated with some `expr` not based on 'x',
10655 // 'x' is simply atomically rewritten with 'expr'.
10656 AtomicRMWInst::BinOp AtomicOp = (UpdateExpr ? RMWOp : AtomicRMWInst::Xchg);
10657 Expected<std::pair<Value *, Value *>> AtomicResult = emitAtomicUpdate(
10658 AllocaIP, X.Var, X.ElemTy, Expr, AO, AtomicOp, UpdateOp, X.IsVolatile,
10659 IsXBinopExpr, IsIgnoreDenormalMode, IsFineGrainedMemory, IsRemoteMemory);
10660 if (!AtomicResult)
10661 return AtomicResult.takeError();
10662 Value *CapturedVal =
10663 (IsPostfixUpdate ? AtomicResult->first : AtomicResult->second);
10664 Builder.CreateStore(CapturedVal, V.Var, V.IsVolatile);
10665
10666 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Capture);
10667 return Builder.saveIP();
10668}
10669
10673 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10674 bool IsFailOnly) {
10675
10677 return createAtomicCompare(Loc, X, V, R, E, D, AO, Op, IsXBinopExpr,
10678 IsPostfixUpdate, IsFailOnly, Failure);
10679}
10680
10684 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
10685 bool IsFailOnly, AtomicOrdering Failure) {
10686
10687 if (!updateToLocation(Loc))
10688 return Loc.IP;
10689
10690 assert(X.Var->getType()->isPointerTy() &&
10691 "OMP atomic expects a pointer to target memory");
10692 // compare capture
10693 if (V.Var) {
10694 assert(V.Var->getType()->isPointerTy() && "v.var must be of pointer type");
10695 assert(V.ElemTy == X.ElemTy && "x and v must be of same type");
10696 }
10697
10698 bool IsInteger = E->getType()->isIntegerTy();
10699
10700 if (Op == OMPAtomicCompareOp::EQ) {
10701 AtomicCmpXchgInst *Result = nullptr;
10702 if (!IsInteger) {
10703 IntegerType *IntCastTy =
10704 IntegerType::get(M.getContext(), X.ElemTy->getScalarSizeInBits());
10705 Value *EBCast = Builder.CreateBitCast(E, IntCastTy);
10706 Value *DBCast = Builder.CreateBitCast(D, IntCastTy);
10707 Result = Builder.CreateAtomicCmpXchg(X.Var, EBCast, DBCast, MaybeAlign(),
10708 AO, Failure);
10709 } else {
10710 Result =
10711 Builder.CreateAtomicCmpXchg(X.Var, E, D, MaybeAlign(), AO, Failure);
10712 }
10713
10714 if (V.Var) {
10715 Value *OldValue = Builder.CreateExtractValue(Result, /*Idxs=*/0);
10716 if (!IsInteger)
10717 OldValue = Builder.CreateBitCast(OldValue, X.ElemTy);
10718 assert(OldValue->getType() == V.ElemTy &&
10719 "OldValue and V must be of same type");
10720 if (IsPostfixUpdate) {
10721 Builder.CreateStore(OldValue, V.Var, V.IsVolatile);
10722 } else {
10723 Value *SuccessOrFail = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10724 if (IsFailOnly) {
10725 // CurBB----
10726 // | |
10727 // v |
10728 // ContBB |
10729 // | |
10730 // v |
10731 // ExitBB <-
10732 //
10733 // where ContBB only contains the store of old value to 'v'.
10734 BasicBlock *CurBB = Builder.GetInsertBlock();
10735 Instruction *CurBBTI = CurBB->getTerminatorOrNull();
10736 CurBBTI = CurBBTI ? CurBBTI : Builder.CreateUnreachable();
10737 BasicBlock *ExitBB = CurBB->splitBasicBlock(
10738 CurBBTI, X.Var->getName() + ".atomic.exit");
10739 BasicBlock *ContBB = CurBB->splitBasicBlock(
10740 CurBB->getTerminator(), X.Var->getName() + ".atomic.cont");
10741 ContBB->getTerminator()->eraseFromParent();
10742 CurBB->getTerminator()->eraseFromParent();
10743
10744 Builder.CreateCondBr(SuccessOrFail, ExitBB, ContBB);
10745
10746 Builder.SetInsertPoint(ContBB);
10747 Builder.CreateStore(OldValue, V.Var);
10748 Builder.CreateBr(ExitBB);
10749
10750 if (UnreachableInst *ExitTI =
10752 CurBBTI->eraseFromParent();
10753 Builder.SetInsertPoint(ExitBB);
10754 } else {
10755 Builder.SetInsertPoint(ExitTI);
10756 }
10757 } else {
10758 Value *CapturedValue =
10759 Builder.CreateSelect(SuccessOrFail, E, OldValue);
10760 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10761 }
10762 }
10763 }
10764 // The comparison result has to be stored.
10765 if (R.Var) {
10766 assert(R.Var->getType()->isPointerTy() &&
10767 "r.var must be of pointer type");
10768 assert(R.ElemTy->isIntegerTy() && "r must be of integral type");
10769
10770 Value *SuccessFailureVal = Builder.CreateExtractValue(Result, /*Idxs=*/1);
10771 Value *ResultCast = R.IsSigned
10772 ? Builder.CreateSExt(SuccessFailureVal, R.ElemTy)
10773 : Builder.CreateZExt(SuccessFailureVal, R.ElemTy);
10774 Builder.CreateStore(ResultCast, R.Var, R.IsVolatile);
10775 }
10776 } else {
10777 assert((Op == OMPAtomicCompareOp::MAX || Op == OMPAtomicCompareOp::MIN) &&
10778 "Op should be either max or min at this point");
10779 assert(!IsFailOnly && "IsFailOnly is only valid when the comparison is ==");
10780
10781 // Reverse the ordop as the OpenMP forms are different from LLVM forms.
10782 // Let's take max as example.
10783 // OpenMP form:
10784 // x = x > expr ? expr : x;
10785 // LLVM form:
10786 // *ptr = *ptr > val ? *ptr : val;
10787 // We need to transform to LLVM form.
10788 // x = x <= expr ? x : expr;
10790 if (IsXBinopExpr) {
10791 if (IsInteger) {
10792 if (X.IsSigned)
10793 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Min
10795 else
10796 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMin
10798 } else {
10799 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMin
10801 }
10802 } else {
10803 if (IsInteger) {
10804 if (X.IsSigned)
10805 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::Max
10807 else
10808 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::UMax
10810 } else {
10811 NewOp = Op == OMPAtomicCompareOp::MAX ? AtomicRMWInst::FMax
10813 }
10814 }
10815
10816 AtomicRMWInst *OldValue =
10817 Builder.CreateAtomicRMW(NewOp, X.Var, E, MaybeAlign(), AO);
10818 if (V.Var) {
10819 Value *CapturedValue = nullptr;
10820 if (IsPostfixUpdate) {
10821 CapturedValue = OldValue;
10822 } else {
10823 CmpInst::Predicate Pred;
10824 switch (NewOp) {
10825 case AtomicRMWInst::Max:
10826 Pred = CmpInst::ICMP_SGT;
10827 break;
10829 Pred = CmpInst::ICMP_UGT;
10830 break;
10832 Pred = CmpInst::FCMP_OGT;
10833 break;
10834 case AtomicRMWInst::Min:
10835 Pred = CmpInst::ICMP_SLT;
10836 break;
10838 Pred = CmpInst::ICMP_ULT;
10839 break;
10841 Pred = CmpInst::FCMP_OLT;
10842 break;
10843 default:
10844 llvm_unreachable("unexpected comparison op");
10845 }
10846 Value *NonAtomicCmp = Builder.CreateCmp(Pred, OldValue, E);
10847 CapturedValue = Builder.CreateSelect(NonAtomicCmp, E, OldValue);
10848 }
10849 Builder.CreateStore(CapturedValue, V.Var, V.IsVolatile);
10850 }
10851 }
10852
10853 checkAndEmitFlushAfterAtomic(Loc, AO, AtomicKind::Compare);
10854
10855 return Builder.saveIP();
10856}
10857
// OpenMPIRBuilder::createTeams (the opening signature lines, original lines
// 10858-10859, are missing from this extraction — restore from upstream).
// Emits an OpenMP `teams` region: splits the insertion block into
// entry/alloca/body/exit blocks, on the host pushes num_teams/thread_limit via
// __kmpc_push_num_teams_51, runs BodyGenCB to fill the body, and records an
// OutlineInfo so the region is later outlined and invoked through
// __kmpc_fork_teams.
// NOTE(review): several interior lines were dropped by the extraction as well
// (e.g. the ToBeDeleted vector declaration before 10962, the createFakeIntVal
// calls at 10963/10965, and the Builder.CreateCall heads at 10944 and
// 10996-10997).
10860 BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower,
10861 Value *NumTeamsUpper, Value *ThreadLimit,
10862 Value *IfExpr) {
10863 if (!updateToLocation(Loc))
10864 return InsertPointTy();
10865
// Source-location ident needed by all libomp entry points below.
10866 uint32_t SrcLocStrSize;
10867 Constant *SrcLocStr = getOrCreateSrcLocStr(Loc, SrcLocStrSize);
10868 Value *Ident = getOrCreateIdent(SrcLocStr, SrcLocStrSize);
10869 Function *CurrentFunction = Builder.GetInsertBlock()->getParent();
10870
10871 // Outer allocation basicblock is the entry block of the current function.
10872 BasicBlock &OuterAllocaBB = CurrentFunction->getEntryBlock();
10873 if (&OuterAllocaBB == Builder.GetInsertBlock()) {
10874 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.entry");
10875 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
10876 }
10877
10878 // The current basic block is split into four basic blocks. After outlining,
10879 // they will be mapped as follows:
10880 // ```
10881 // def current_fn() {
10882 // current_basic_block:
10883 // br label %teams.exit
10884 // teams.exit:
10885 // ; instructions after teams
10886 // }
10887 //
10888 // def outlined_fn() {
10889 // teams.alloca:
10890 // br label %teams.body
10891 // teams.body:
10892 // ; instructions within teams body
10893 // }
10894 // ```
10895 BasicBlock *ExitBB = splitBB(Builder, /*CreateBranch=*/true, "teams.exit");
10896 BasicBlock *BodyBB = splitBB(Builder, /*CreateBranch=*/true, "teams.body");
10897 BasicBlock *AllocaBB =
10898 splitBB(Builder, /*CreateBranch=*/true, "teams.alloca");
10899
10900 bool SubClausesPresent =
10901 (NumTeamsLower || NumTeamsUpper || ThreadLimit || IfExpr);
10902 // Push num_teams
// Only the host pushes the clause values; on the device the runtime entry is
// not emitted (Config.isTargetDevice() guard).
10903 if (!Config.isTargetDevice() && SubClausesPresent) {
10904 assert((NumTeamsLower == nullptr || NumTeamsUpper != nullptr) &&
10905 "if lowerbound is non-null, then upperbound must also be non-null "
10906 "for bounds on num_teams");
10907
// 0 tells the runtime "no explicit bound was given".
10908 if (NumTeamsUpper == nullptr)
10909 NumTeamsUpper = Builder.getInt32(0);
10910
10911 if (NumTeamsLower == nullptr)
10912 NumTeamsLower = NumTeamsUpper;
10913
10914 if (IfExpr) {
10915 assert(IfExpr->getType()->isIntegerTy() &&
10916 "argument to if clause must be an integer value");
10917
10918 // upper = ifexpr ? upper : 1
10919 if (IfExpr->getType() != Int1)
10920 IfExpr = Builder.CreateICmpNE(IfExpr,
10921 ConstantInt::get(IfExpr->getType(), 0));
10922 NumTeamsUpper = Builder.CreateSelect(
10923 IfExpr, NumTeamsUpper, Builder.getInt32(1), "numTeamsUpper");
10924
10925 // lower = ifexpr ? lower : 1
10926 NumTeamsLower = Builder.CreateSelect(
10927 IfExpr, NumTeamsLower, Builder.getInt32(1), "numTeamsLower");
10928 }
10929
10930 if (ThreadLimit == nullptr)
10931 ThreadLimit = Builder.getInt32(0);
10932
10933 // The __kmpc_push_num_teams_51 function expects int32 as the arguments. So,
10934 // truncate or sign extend the passed values to match the int32 parameters.
10935 Value *NumTeamsLowerInt32 =
10936 Builder.CreateSExtOrTrunc(NumTeamsLower, Builder.getInt32Ty());
10937 Value *NumTeamsUpperInt32 =
10938 Builder.CreateSExtOrTrunc(NumTeamsUpper, Builder.getInt32Ty());
10939 Value *ThreadLimitInt32 =
10940 Builder.CreateSExtOrTrunc(ThreadLimit, Builder.getInt32Ty());
10941
10942 Value *ThreadNum = getOrCreateThreadID(Ident);
10943
// NOTE(review): the Builder.CreateCall( head (original line 10944) is missing
// from this extraction; these are its arguments.
10945 getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_teams_51),
10946 {Ident, ThreadNum, NumTeamsLowerInt32, NumTeamsUpperInt32,
10947 ThreadLimitInt32});
10948 }
10949 // Generate the body of teams.
10950 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
10951 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
10952 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
10953 return Err;
10954
10955 OutlineInfo OI;
10956 OI.EntryBB = AllocaBB;
10957 OI.ExitBB = ExitBB;
10958 OI.OuterAllocaBB = &OuterAllocaBB;
10959
10960 // Insert fake values for global tid and bound tid.
// NOTE(review): the ToBeDeleted declaration and the createFakeIntVal call
// heads (original lines 10961/10963/10965) are missing from this extraction.
10962 InsertPointTy OuterAllocaIP(&OuterAllocaBB, OuterAllocaBB.begin());
10964 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "gid", true));
10966 Builder, OuterAllocaIP, ToBeDeleted, AllocaIP, "tid", true));
10967
// Host-side post-outline step: replace the placeholder call to the outlined
// function with a call to __kmpc_fork_teams and drop the fake tid values.
10968 auto HostPostOutlineCB = [this, Ident,
10969 ToBeDeleted](Function &OutlinedFn) mutable {
10970 // The stale call instruction will be replaced with a new call instruction
10971 // for runtime call with the outlined function.
10972
10973 assert(OutlinedFn.hasOneUse() &&
10974 "there must be a single user for the outlined function");
10975 CallInst *StaleCI = cast<CallInst>(OutlinedFn.user_back());
10976 ToBeDeleted.push_back(StaleCI);
10977
10978 assert((OutlinedFn.arg_size() == 2 || OutlinedFn.arg_size() == 3) &&
10979 "Outlined function must have two or three arguments only");
10980
10981 bool HasShared = OutlinedFn.arg_size() == 3;
10982
10983 OutlinedFn.getArg(0)->setName("global.tid.ptr");
10984 OutlinedFn.getArg(1)->setName("bound.tid.ptr");
10985 if (HasShared)
10986 OutlinedFn.getArg(2)->setName("data");
10987
10988 // Call to the runtime function for teams in the current function.
10989 assert(StaleCI && "Error while outlining - no CallInst user found for the "
10990 "outlined function.")
10991 Builder.SetInsertPoint(StaleCI);
10992 SmallVector<Value *> Args = {
10993 Ident, Builder.getInt32(StaleCI->arg_size() - 2), &OutlinedFn};
10994 if (HasShared)
10995 Args.push_back(StaleCI->getArgOperand(2));
// NOTE(review): the Builder.CreateCall(getOrCreateRuntimeFunctionPtr( head
// (original lines 10996-10997) is missing from this extraction.
10998 omp::RuntimeFunction::OMPRTL___kmpc_fork_teams),
10999 Args);
11000
// Erase in reverse creation order so uses die before their definitions.
11001 for (Instruction *I : llvm::reverse(ToBeDeleted))
11002 I->eraseFromParent();
11003 };
11004
11005 if (!Config.isTargetDevice())
11006 OI.PostOutlineCB = HostPostOutlineCB;
11007
11008 addOutlineInfo(std::move(OI));
11009
11010 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11011
11012 return Builder.saveIP();
11013}
11014
// OpenMPIRBuilder::createDistribute (the opening signature lines, original
// lines 11015-11016, are missing from this extraction). Emits an OpenMP
// `distribute` region: splits the insertion point into
// entry/alloca/body/exit blocks, runs BodyGenCB, and — only when compiling
// for a target device — records an OutlineInfo for later outlining.
11017 InsertPointTy OuterAllocaIP,
11018 BodyGenCallbackTy BodyGenCB) {
11019 if (!updateToLocation(Loc))
11020 return InsertPointTy();
11021
11022 BasicBlock *OuterAllocaBB = OuterAllocaIP.getBlock();
11023
// Never emit region code straight into the outer alloca block; split first.
11024 if (OuterAllocaBB == Builder.GetInsertBlock()) {
11025 BasicBlock *BodyBB =
11026 splitBB(Builder, /*CreateBranch=*/true, "distribute.entry");
11027 Builder.SetInsertPoint(BodyBB, BodyBB->begin());
11028 }
11029 BasicBlock *ExitBB =
11030 splitBB(Builder, /*CreateBranch=*/true, "distribute.exit");
11031 BasicBlock *BodyBB =
11032 splitBB(Builder, /*CreateBranch=*/true, "distribute.body");
11033 BasicBlock *AllocaBB =
11034 splitBB(Builder, /*CreateBranch=*/true, "distribute.alloca");
11035
11036 // Generate the body of distribute clause
11037 InsertPointTy AllocaIP(AllocaBB, AllocaBB->begin());
11038 InsertPointTy CodeGenIP(BodyBB, BodyBB->begin());
11039 if (Error Err = BodyGenCB(AllocaIP, CodeGenIP))
11040 return Err;
11041
11042 // When using target we use different runtime functions which require a
11043 // callback.
11044 if (Config.isTargetDevice()) {
11045 OutlineInfo OI;
11046 OI.OuterAllocaBB = OuterAllocaIP.getBlock();
11047 OI.EntryBB = AllocaBB;
11048 OI.ExitBB = ExitBB;
11049
11050 addOutlineInfo(std::move(OI));
11051 }
11052 Builder.SetInsertPoint(ExitBB, ExitBB->begin());
11053
11054 return Builder.saveIP();
11055}
11056
// OpenMPIRBuilder::createOffloadMapnames (signature lines 11057-11058 and the
// ArrayType::get head at 11061 are missing from this extraction). Packs the
// given map-name constants into a private constant global array `VarName`
// used by the offloading runtime for diagnostics.
11059 std::string VarName) {
11060 llvm::Constant *MapNamesArrayInit = llvm::ConstantArray::get(
11062 Names.size()),
11063 Names);
11064 auto *MapNamesArrayGlobal = new llvm::GlobalVariable(
11065 M, MapNamesArrayInit->getType(),
11066 /*isConstant=*/true, llvm::GlobalValue::PrivateLinkage, MapNamesArrayInit,
11067 VarName);
11068 return MapNamesArrayGlobal;
11069}
11070
11071// Create all simple and struct types exposed by the runtime and remember
11072// the llvm::PointerTypes of them for easy access later.
// X-macro pattern: the OMP_*_TYPE macros below are expanded once per entry of
// OMPKinds.def, assigning into the OpenMPIRBuilder type members. Function
// pointers live in the program address space; data pointers in the default
// target address space.
11073void OpenMPIRBuilder::initializeTypes(Module &M) {
11074 LLVMContext &Ctx = M.getContext();
11075 StructType *T;
11076 unsigned DefaultTargetAS = Config.getDefaultTargetAS();
11077 unsigned ProgramAS = M.getDataLayout().getProgramAddressSpace();
11078#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
11079#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
11080 VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
11081 VarName##PtrTy = PointerType::get(Ctx, DefaultTargetAS);
11082#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
11083 VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
11084 VarName##Ptr = PointerType::get(Ctx, ProgramAS);
11085#define OMP_STRUCT_TYPE(VarName, StructName, Packed, ...) \
11086 T = StructType::getTypeByName(Ctx, StructName); \
11087 if (!T) \
11088 T = StructType::create(Ctx, {__VA_ARGS__}, StructName, Packed); \
11089 VarName = T; \
11090 VarName##Ptr = PointerType::get(Ctx, DefaultTargetAS);
11091#include "llvm/Frontend/OpenMP/OMPKinds.def"
11092}
11093
// OpenMPIRBuilder::collectBlocks (signature lines 11094-11095 and the
// Worklist declaration at 11097 are missing from this extraction). Collects,
// via forward CFG worklist traversal from EntryBB, every block reachable
// without leaving the region bounded by EntryBB/ExitBB into BlockVector.
// Pre-seeding ExitBB into BlockSet stops the walk at the region exit.
11096 SmallVectorImpl<BasicBlock *> &BlockVector) {
11098 BlockSet.insert(EntryBB);
11099 BlockSet.insert(ExitBB);
11100
11101 Worklist.push_back(EntryBB);
11102 while (!Worklist.empty()) {
11103 BasicBlock *BB = Worklist.pop_back_val();
11104 BlockVector.push_back(BB);
// insert().second is true only for first visit, so each block is pushed once.
11105 for (BasicBlock *SuccBB : successors(BB))
11106 if (BlockSet.insert(SuccBB).second)
11107 Worklist.push_back(SuccBB);
11108 }
11109}
11110
// OpenMPIRBuilder::createOffloadEntry (signature head at original line 11111
// and the emitOffloadingEntry call head at 11116-11117 are missing from this
// extraction). On the host, emits an offloading entry record; on GPU targets,
// marks kernel functions with the attributes the backend expects.
11112 uint64_t Size, int32_t Flags,
11114 StringRef Name) {
11115 if (!Config.isGPU()) {
11118 Name.empty() ? Addr->getName() : Name, Size, Flags, /*Data=*/0);
11119 return;
11120 }
11121 // TODO: Add support for global variables on the device after declare target
11122 // support.
11123 Function *Fn = dyn_cast<Function>(Addr);
11124 if (!Fn)
11125 return;
11126
11127 // Add a function attribute for the kernel.
11128 Fn->addFnAttr("kernel");
// AMDGPU-only hint: work-group size is uniform across the dispatch.
11129 if (T.isAMDGCN())
11130 Fn->addFnAttr("uniform-work-group-size");
11131 Fn->addFnAttr(Attribute::MustProgress);
11132}
11133
11134// We only generate metadata for function that contain target regions.
// OpenMPIRBuilder::createOffloadEntriesAndInfoMetadata (the signature lines,
// original 11135-11136, are missing from this extraction, as are several
// interior lines — mostly dyn_cast<...> heads and enum case labels). Emits
// the "omp_offload.info" named metadata for every target region and declare
// target global, then creates the corresponding offload entries, reporting
// problems through ErrorFn.
11137
11138 // If there are no entries, we don't need to do anything.
11139 if (OffloadInfoManager.empty())
11140 return;
11141
11142 LLVMContext &C = M.getContext();
11145 16>
11146 OrderedEntries(OffloadInfoManager.size());
11147
11148 // Auxiliary methods to create metadata values and strings.
11149 auto &&GetMDInt = [this](unsigned V) {
11150 return ConstantAsMetadata::get(ConstantInt::get(Builder.getInt32Ty(), V));
11151 };
11152
11153 auto &&GetMDString = [&C](StringRef V) { return MDString::get(C, V); };
11154
11155 // Create the offloading info metadata node.
11156 NamedMDNode *MD = M.getOrInsertNamedMetadata("omp_offload.info");
11157 auto &&TargetRegionMetadataEmitter =
11158 [&C, MD, &OrderedEntries, &GetMDInt, &GetMDString](
11159 const TargetRegionEntryInfo &EntryInfo,
11161 // Generate metadata for target regions. Each entry of this metadata
11162 // contains:
11163 // - Entry 0 -> Kind of this type of metadata (0).
11164 // - Entry 1 -> Device ID of the file where the entry was identified.
11165 // - Entry 2 -> File ID of the file where the entry was identified.
11166 // - Entry 3 -> Mangled name of the function where the entry was
11167 // identified.
11168 // - Entry 4 -> Line in the file where the entry was identified.
11169 // - Entry 5 -> Count of regions at this DeviceID/FilesID/Line.
11170 // - Entry 6 -> Order the entry was created.
11171 // The first element of the metadata node is the kind.
11172 Metadata *Ops[] = {
11173 GetMDInt(E.getKind()), GetMDInt(EntryInfo.DeviceID),
11174 GetMDInt(EntryInfo.FileID), GetMDString(EntryInfo.ParentName),
11175 GetMDInt(EntryInfo.Line), GetMDInt(EntryInfo.Count),
11176 GetMDInt(E.getOrder())};
11177
11178 // Save this entry in the right position of the ordered entries array.
11179 OrderedEntries[E.getOrder()] = std::make_pair(&E, EntryInfo);
11180
11181 // Add metadata to the named metadata node.
11182 MD->addOperand(MDNode::get(C, Ops));
11183 };
11184
11185 OffloadInfoManager.actOnTargetRegionEntriesInfo(TargetRegionMetadataEmitter);
11186
11187 // Create function that emits metadata for each device global variable entry.
11188 auto &&DeviceGlobalVarMetadataEmitter =
11189 [&C, &OrderedEntries, &GetMDInt, &GetMDString, MD](
11190 StringRef MangledName,
11192 // Generate metadata for global variables. Each entry of this metadata
11193 // contains:
11194 // - Entry 0 -> Kind of this type of metadata (1).
11195 // - Entry 1 -> Mangled name of the variable.
11196 // - Entry 2 -> Declare target kind.
11197 // - Entry 3 -> Order the entry was created.
11198 // The first element of the metadata node is the kind.
11199 Metadata *Ops[] = {GetMDInt(E.getKind()), GetMDString(MangledName),
11200 GetMDInt(E.getFlags()), GetMDInt(E.getOrder())};
11201
11202 // Save this entry in the right position of the ordered entries array.
11203 TargetRegionEntryInfo varInfo(MangledName, 0, 0, 0);
11204 OrderedEntries[E.getOrder()] = std::make_pair(&E, varInfo);
11205
11206 // Add metadata to the named metadata node.
11207 MD->addOperand(MDNode::get(C, Ops));
11208 };
11209
11210 OffloadInfoManager.actOnDeviceGlobalVarEntriesInfo(
11211 DeviceGlobalVarMetadataEmitter);
11212
// Second pass: walk the entries in creation order and materialize the actual
// offload entries, validating that addresses/IDs were emitted.
11213 for (const auto &E : OrderedEntries) {
11214 assert(E.first && "All ordered entries must exist!");
11215 if (const auto *CE =
11217 E.first)) {
11218 if (!CE->getID() || !CE->getAddress()) {
11219 // Do not blame the entry if the parent function is not emitted.
11220 TargetRegionEntryInfo EntryInfo = E.second;
11221 StringRef FnName = EntryInfo.ParentName;
11222 if (!M.getNamedValue(FnName))
11223 continue;
11224 ErrorFn(EMIT_MD_TARGET_REGION_ERROR, EntryInfo);
11225 continue;
11226 }
11227 createOffloadEntry(CE->getID(), CE->getAddress(),
11228 /*Size=*/0, CE->getFlags(),
11230 } else if (const auto *CE = dyn_cast<
11232 E.first)) {
11235 CE->getFlags());
// NOTE(review): the case labels of this switch (original lines 11237-11238,
// 11249, 11260-11261) are missing from this extraction.
11236 switch (Flags) {
11239 if (Config.isTargetDevice() && Config.hasRequiresUnifiedSharedMemory())
11240 continue;
11241 if (!CE->getAddress()) {
11242 ErrorFn(EMIT_MD_DECLARE_TARGET_ERROR, E.second);
11243 continue;
11244 }
11245 // The variable has no definition - no need to add the entry.
11246 if (CE->getVarSize() == 0)
11247 continue;
11248 break;
11250 assert(((Config.isTargetDevice() && !CE->getAddress()) ||
11251 (!Config.isTargetDevice() && CE->getAddress())) &&
11252 "Declare target link address is set.");
11253 if (Config.isTargetDevice())
11254 continue;
11255 if (!CE->getAddress()) {
11257 continue;
11258 }
11259 break;
11262 if (!CE->getAddress()) {
11263 ErrorFn(EMIT_MD_GLOBAL_VAR_INDIRECT_ERROR, E.second);
11264 continue;
11265 }
11266 break;
11267 default:
11268 break;
11269 }
11270
11271 // Hidden or internal symbols on the device are not externally visible.
11272 // We should not attempt to register them by creating an offloading
11273 // entry. Indirect variables are handled separately on the device.
11274 if (auto *GV = dyn_cast<GlobalValue>(CE->getAddress()))
11275 if ((GV->hasLocalLinkage() || GV->hasHiddenVisibility()) &&
11276 (Flags !=
11278 Flags != OffloadEntriesInfoManager::
11279 OMPTargetGlobalVarEntryIndirectVTable))
11280 continue;
11281
11282 // Indirect globals need to use a special name that doesn't match the name
11283 // of the associated host global.
11285 Flags ==
11287 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11288 Flags, CE->getLinkage(), CE->getVarName());
11289 else
11290 createOffloadEntry(CE->getAddress(), CE->getAddress(), CE->getVarSize(),
11291 Flags, CE->getLinkage());
11292
11293 } else {
11294 llvm_unreachable("Unsupported entry kind.");
11295 }
11296 }
11297
11298 // Emit requires directive globals to a special entry so the runtime can
11299 // register them when the device image is loaded.
11300 // TODO: This reduces the offloading entries to a 32-bit integer. Offloading
11301 // entries should be redesigned to better suit this use-case.
11302 if (Config.hasRequiresFlags() && !Config.isTargetDevice())
11306 ".requires", /*Size=*/0,
11308 Config.getRequiresFlags());
11309}
11310
// TargetRegionEntryInfo::getTargetRegionEntryFnName (signature lines
// 11311-11312 are missing from this extraction). Builds the mangled kernel
// name "<prefix><DeviceID:hex>_<FileID:hex>_<ParentName>_l<Line>[_<Count>]";
// the count suffix is only appended for non-zero counts.
11313 unsigned FileID, unsigned Line, unsigned Count) {
11314 raw_svector_ostream OS(Name);
11315 OS << KernelNamePrefix << llvm::format("%x", DeviceID)
11316 << llvm::format("_%x_", FileID) << ParentName << "_l" << Line;
11317 if (Count)
11318 OS << "_" << Count;
11319}
11320
// Convenience overload (signature head at original line 11321 and the
// forwarded call head at 11324 are missing from this extraction): computes
// the next per-location region count and delegates to the field-wise
// getTargetRegionEntryFnName above.
11322 SmallVectorImpl<char> &Name, const TargetRegionEntryInfo &EntryInfo) {
11323 unsigned NewCount = getTargetRegionEntryInfoCount(EntryInfo);
11325 Name, EntryInfo.ParentName, EntryInfo.DeviceID, EntryInfo.FileID,
11326 EntryInfo.Line, NewCount);
11327}
11328
// OpenMPIRBuilder::getTargetEntryUniqueInfo (signature lines 11329-11330 are
// missing from this extraction). Produces a TargetRegionEntryInfo whose
// device/file IDs come from the filesystem's unique ID for the file named by
// the callback; falls back to a hash of the file name (with the 0xdeadf17e
// sentinel device ID) when the file cannot be stat'ed.
11331 vfs::FileSystem &VFS,
11332 StringRef ParentName) {
11333 sys::fs::UniqueID ID(0xdeadf17e, 0);
11334 auto FileIDInfo = CallBack();
11335 uint64_t FileID = 0;
11336 if (ErrorOr<vfs::Status> Status = VFS.status(std::get<0>(FileIDInfo))) {
11337 ID = Status->getUniqueID();
11338 FileID = Status->getUniqueID().getFile();
11339 } else {
11340 // If the inode ID could not be determined, create a hash value from
11341 // the current file name and use that as an ID.
11342 FileID = hash_value(std::get<0>(FileIDInfo));
11343 }
11344
11345 return TargetRegionEntryInfo(ParentName, ID.getDevice(), FileID,
11346 std::get<1>(FileIDInfo));
11347}
11348
// OpenMPIRBuilder::getFlagMemberOffset (signature at original line 11349 and
// the flag operand at 11353 are missing from this extraction). Returns the
// bit position of the lowest set bit of the MEMBER_OF mask by shifting right
// until bit 0 is set — i.e. how far a member index must be shifted to land in
// the MEMBER_OF field.
11350 unsigned Offset = 0;
11351 for (uint64_t Remain =
11352 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11354 !(Remain & 1); Remain = Remain >> 1)
11355 Offset++;
11356 return Offset;
11357}
11358
// OpenMPIRBuilder::getMemberOfFlag (signature lines 11359-11360 are missing
// from this extraction). Encodes a zero-based member position as a
// MEMBER_OF mapping flag; the +1 keeps position 0 distinguishable from
// "no member-of" (an all-zero field).
11361 // Rotate by getFlagMemberOffset() bits.
11362 return static_cast<omp::OpenMPOffloadMappingFlags>(((uint64_t)Position + 1)
11363 << getFlagMemberOffset());
11364}
11365
// OpenMPIRBuilder::setCorrectMemberOfFlag (signature lines 11366-11367 and
// the flag operands of both conditions, original lines 11373/11375-11376 and
// 11382, are missing from this extraction). Replaces the MEMBER_OF
// placeholder in Flags with the concrete MemberOfFlag, leaving entries that
// should not become members untouched.
11368 omp::OpenMPOffloadMappingFlags MemberOfFlag) {
11369 // If the entry is PTR_AND_OBJ but has not been marked with the special
11370 // placeholder value 0xFFFF in the MEMBER_OF field, then it should not be
11371 // marked as MEMBER_OF.
11372 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11374 static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11377 return;
11378
11379 // Entries with ATTACH are not members-of anything. They are handled
11380 // separately by the runtime after other maps have been handled.
11381 if (static_cast<std::underlying_type_t<omp::OpenMPOffloadMappingFlags>>(
11383 return;
11384
11385 // Reset the placeholder value to prepare the flag for the assignment of the
11386 // proper MEMBER_OF value.
11387 Flags &= ~omp::OpenMPOffloadMappingFlags::OMP_MAP_MEMBER_OF;
11388 Flags |= MemberOfFlag;
11389}
11390
// OpenMPIRBuilder::getAddrOfDeclareTargetVar (signature lines 11391-11393 and
// several condition heads — e.g. the capture-clause comparison at 11405-11406
// and the registerTargetGlobalVariable call head at 11435 — are missing from
// this extraction). For declare-target "link" variables (or "to" under
// unified shared memory) it creates/returns the "<name>[_<fileid>]
// _decl_tgt_ref_ptr" indirection pointer; otherwise returns nullptr.
11394 bool IsDeclaration, bool IsExternallyVisible,
11395 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11396 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11397 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
11398 std::function<Constant *()> GlobalInitializer,
11399 std::function<GlobalValue::LinkageTypes()> VariableLinkage) {
11400 // TODO: convert this to utilise the IRBuilder Config rather than
11401 // a passed down argument.
11402 if (OpenMPSIMD)
11403 return nullptr;
11404
11407 CaptureClause ==
11409 Config.hasRequiresUnifiedSharedMemory()) {
// Internal symbols get the file ID mixed into the name to avoid clashes
// between translation units.
11410 SmallString<64> PtrName;
11411 {
11412 raw_svector_ostream OS(PtrName);
11413 OS << MangledName;
11414 if (!IsExternallyVisible)
11415 OS << format("_%x", EntryInfo.FileID);
11416 OS << "_decl_tgt_ref_ptr";
11417 }
11418
11419 Value *Ptr = M.getNamedValue(PtrName);
11420
11421 if (!Ptr) {
11422 GlobalValue *GlobalValue = M.getNamedValue(MangledName);
11423 Ptr = getOrCreateInternalVariable(LlvmPtrTy, PtrName);
11424
11425 auto *GV = cast<GlobalVariable>(Ptr);
11426 GV->setLinkage(GlobalValue::WeakAnyLinkage);
11427
// Only the host initializes the reference pointer; the device side is
// patched by the runtime.
11428 if (!Config.isTargetDevice()) {
11429 if (GlobalInitializer)
11430 GV->setInitializer(GlobalInitializer());
11431 else
11432 GV->setInitializer(GlobalValue);
11433 }
11434
11436 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11437 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11438 GlobalInitializer, VariableLinkage, LlvmPtrTy, cast<Constant>(Ptr));
11439 }
11440
11441 return cast<Constant>(Ptr);
11442 }
11443
11444 return nullptr;
11445}
11446
// OpenMPIRBuilder::registerTargetGlobalVariable (signature lines 11447-11449
// and several statement heads — e.g. the early-return condition at 11457, the
// Flags/Linkage declarations around 11461-11464, the capture-clause test at
// 11466/11468, the flag assignments at 11470 and 11503-11506, and the
// getAddrOfDeclareTargetVar call head at 11512 — are missing from this
// extraction). Computes the name, size, linkage and mapping flags for a
// declare-target global and records it with the OffloadInfoManager.
11450 bool IsDeclaration, bool IsExternallyVisible,
11451 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
11452 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
11453 std::vector<Triple> TargetTriple,
11454 std::function<Constant *()> GlobalInitializer,
11455 std::function<GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy,
11456 Constant *Addr) {
11458 (TargetTriple.empty() && !Config.isTargetDevice()))
11459 return;
11460
11462 StringRef VarName;
11463 int64_t VarSize;
11465
11467 CaptureClause ==
11469 !Config.hasRequiresUnifiedSharedMemory()) {
11471 VarName = MangledName;
11472 GlobalValue *LlvmVal = M.getNamedValue(VarName);
11473
// Size in bytes (rounded up) for definitions; declarations report size 0.
11474 if (!IsDeclaration)
11475 VarSize = divideCeil(
11476 M.getDataLayout().getTypeSizeInBits(LlvmVal->getValueType()), 8);
11477 else
11478 VarSize = 0;
11479 Linkage = (VariableLinkage) ? VariableLinkage() : LlvmVal->getLinkage();
11480
11481 // This is a workaround carried over from Clang which prevents undesired
11482 // optimisation of internal variables.
11483 if (Config.isTargetDevice() &&
11484 (!IsExternallyVisible || Linkage == GlobalValue::LinkOnceODRLinkage)) {
11485 // Do not create a "ref-variable" if the original is not also available
11486 // on the host.
11487 if (!OffloadInfoManager.hasDeviceGlobalVarEntryInfo(VarName))
11488 return;
11489
11490 std::string RefName = createPlatformSpecificName({VarName, "ref"});
11491
// Keep the variable alive via an internal constant holding its address.
11492 if (!M.getNamedValue(RefName)) {
11493 Constant *AddrRef =
11494 getOrCreateInternalVariable(Addr->getType(), RefName);
11495 auto *GvAddrRef = cast<GlobalVariable>(AddrRef);
11496 GvAddrRef->setConstant(true);
11497 GvAddrRef->setLinkage(GlobalValue::InternalLinkage);
11498 GvAddrRef->setInitializer(Addr);
11499 GeneratedRefs.push_back(GvAddrRef);
11500 }
11501 }
11502 } else {
11505 else
11507
// "link" (or "to" + unified shared memory) entries register the indirection
// pointer instead of the variable itself; the device side registers by name
// only.
11508 if (Config.isTargetDevice()) {
11509 VarName = (Addr) ? Addr->getName() : "";
11510 Addr = nullptr;
11511 } else {
11513 CaptureClause, DeviceClause, IsDeclaration, IsExternallyVisible,
11514 EntryInfo, MangledName, GeneratedRefs, OpenMPSIMD, TargetTriple,
11515 LlvmPtrTy, GlobalInitializer, VariableLinkage);
11516 VarName = (Addr) ? Addr->getName() : "";
11517 }
11518 VarSize = M.getDataLayout().getPointerSize();
11520 }
11521
11522 OffloadInfoManager.registerDeviceGlobalVarEntryInfo(VarName, Addr, VarSize,
11523 Flags, Linkage);
11524}
11525
11526/// Loads all the offload entries information from the host IR
11527/// metadata.
// OpenMPIRBuilder::loadOffloadInfoMetadata(Module &) — the signature line
// (original 11528) and the flag-cast head inside the global-var case
// (original 11566) are missing from this extraction. Parses each operand of
// the named metadata back into the OffloadInfoManager; the field indices must
// stay in sync with createOffloadEntriesAndInfoMetadata.
11529 // If we are in target mode, load the metadata from the host IR. This code has
11530 // to match the metadata creation in createOffloadEntriesAndInfoMetadata().
11531
11532 NamedMDNode *MD = M.getNamedMetadata(ompOffloadInfoName);
11533 if (!MD)
11534 return;
11535
11536 for (MDNode *MN : MD->operands()) {
11537 auto &&GetMDInt = [MN](unsigned Idx) {
11538 auto *V = cast<ConstantAsMetadata>(MN->getOperand(Idx));
11539 return cast<ConstantInt>(V->getValue())->getZExtValue();
11540 };
11541
11542 auto &&GetMDString = [MN](unsigned Idx) {
11543 auto *V = cast<MDString>(MN->getOperand(Idx));
11544 return V->getString();
11545 };
11546
// Operand 0 is the entry kind (target region vs. device global variable).
11547 switch (GetMDInt(0)) {
11548 default:
11549 llvm_unreachable("Unexpected metadata!");
11550 break;
11551 case OffloadEntriesInfoManager::OffloadEntryInfo::
11552 OffloadingEntryInfoTargetRegion: {
11553 TargetRegionEntryInfo EntryInfo(/*ParentName=*/GetMDString(3),
11554 /*DeviceID=*/GetMDInt(1),
11555 /*FileID=*/GetMDInt(2),
11556 /*Line=*/GetMDInt(4),
11557 /*Count=*/GetMDInt(5));
11558 OffloadInfoManager.initializeTargetRegionEntryInfo(EntryInfo,
11559 /*Order=*/GetMDInt(6));
11560 break;
11561 }
11562 case OffloadEntriesInfoManager::OffloadEntryInfo::
11563 OffloadingEntryInfoDeviceGlobalVar:
11564 OffloadInfoManager.initializeDeviceGlobalVarEntryInfo(
11565 /*MangledName=*/GetMDString(1),
11567 /*Flags=*/GetMDInt(2)),
11568 /*Order=*/GetMDInt(3));
11569 break;
11570 }
11571 }
11572}
11573
// OpenMPIRBuilder::loadOffloadInfoMetadata(StringRef) — the signature head
// (original line 11574), the Expected<std::unique_ptr<Module>> declaration at
// 11588, and the report_fatal_error head at 11591 are missing from this
// extraction. Reads the host bitcode file through the VFS, parses it into a
// throwaway module/context, and forwards to the Module& overload above.
// Both failure paths are fatal by design: a missing/corrupt host file makes
// device compilation meaningless.
11575 StringRef HostFilePath) {
11576 if (HostFilePath.empty())
11577 return;
11578
11579 auto Buf = VFS.getBufferForFile(HostFilePath);
11580 if (std::error_code Err = Buf.getError()) {
11581 report_fatal_error(("error opening host file from host file path inside of "
11582 "OpenMPIRBuilder: " +
11583 Err.message())
11584 .c_str());
11585 }
11586
11587 LLVMContext Ctx;
11589 Ctx, parseBitcodeFile(Buf.get()->getMemBufferRef(), Ctx));
11590 if (std::error_code Err = M.getError()) {
11592 ("error parsing host file inside of OpenMPIRBuilder: " + Err.message())
11593 .c_str());
11594 }
11595
11596 loadOffloadInfoMetadata(*M.get());
11597}
11598
// Iterator-loop builder (the signature lines, original 11599-11600, are
// missing from this extraction, so the exact method name and full parameter
// list cannot be confirmed from here — visible parameters include a TripCount,
// a BodyGen callback and Name). Splits off an "omp.it.cont" continuation
// block, builds a canonical loop skeleton for TripCount iterations, lets
// BodyGen populate the body with the induction variable, validates/fixes the
// body terminator, and resumes at the continuation block.
11601 llvm::StringRef Name) {
11602 Builder.restoreIP(Loc.IP);
11603
11604 BasicBlock *CurBB = Builder.GetInsertBlock();
11605 assert(CurBB &&
11606 "expected a valid insertion block for creating an iterator loop");
11607 Function *F = CurBB->getParent();
11608
// If the IP is at the block end but a terminator exists, split before the
// terminator so it stays with the continuation block.
11609 InsertPointTy SplitIP = Builder.saveIP();
11610 if (SplitIP.getPoint() == CurBB->end())
11611 if (Instruction *Terminator = CurBB->getTerminatorOrNull())
11612 SplitIP = InsertPointTy(CurBB, Terminator->getIterator());
11613
11614 BasicBlock *ContBB =
11615 splitBB(SplitIP, /*CreateBranch=*/false,
11616 Builder.getCurrentDebugLocation(), "omp.it.cont");
11617
11618 CanonicalLoopInfo *CLI =
11619 createLoopSkeleton(Builder.getCurrentDebugLocation(), TripCount, F,
11620 /*PreInsertBefore=*/ContBB,
11621 /*PostInsertBefore=*/ContBB, Name);
11622
11623 // Enter loop from original block.
11624 redirectTo(CurBB, CLI->getPreheader(), Builder.getCurrentDebugLocation());
11625
11626 // Remove the unconditional branch inserted by createLoopSkeleton in the body
11627 if (Instruction *T = CLI->getBody()->getTerminatorOrNull())
11628 T->eraseFromParent();
11629
11630 InsertPointTy BodyIP = CLI->getBodyIP();
11631 if (llvm::Error Err = BodyGen(BodyIP, CLI->getIndVar()))
11632 return Err;
11633
11634 // Body must either fallthrough to the latch or branch directly to it.
// NOTE(review): the error-construction lines around 11638/11641 are missing
// from this extraction; only the message text survives.
11635 if (Instruction *BodyTerminator = CLI->getBody()->getTerminatorOrNull()) {
11636 auto *BodyBr = dyn_cast<UncondBrInst>(BodyTerminator);
11637 if (!BodyBr || BodyBr->getSuccessor() != CLI->getLatch()) {
11639 "iterator bodygen must terminate the canonical body with an "
11640 "unconditional branch to the loop latch",
11642 }
11643 } else {
11644 // Ensure we end the loop body by jumping to the latch.
11645 Builder.SetInsertPoint(CLI->getBody());
11646 Builder.CreateBr(CLI->getLatch());
11647 }
11648
11649 // Link After -> ContBB
11650 Builder.SetInsertPoint(CLI->getAfter(), CLI->getAfter()->begin());
11651 if (!CLI->getAfter()->hasTerminator())
11652 Builder.CreateBr(ContBB);
11653
11654 return InsertPointTy{ContBB, ContBB->begin()};
11655}
11656
11657/// Mangle the parameter part of the vector function name according to
11658/// their OpenMP classification. The mangling function is defined in
11659/// section 4.5 of the AAVFABI(2021Q1).
// NOTE(review): the parameter declaration (original line 11661), all the case
// labels of the switch (11666, 11669, 11672, 11675, 11678, 11681), and the
// linear-kind comparisons at 11688/11690/11692/11694 are missing from this
// extraction; only the emitted mangling characters survive.
11660static std::string mangleVectorParameters(
11662 SmallString<256> Buffer;
11663 llvm::raw_svector_ostream Out(Buffer);
11664 for (const auto &ParamAttr : ParamAttrs) {
// One classification letter per parameter (e.g. 'l' linear, 'u' uniform,
// 'v' vector), per AAVFABI section 4.5.
11665 switch (ParamAttr.Kind) {
11667 Out << 'l';
11668 break;
11670 Out << 'R';
11671 break;
11673 Out << 'U';
11674 break;
11676 Out << 'L';
11677 break;
11679 Out << 'u';
11680 break;
11682 Out << 'v';
11683 break;
11684 }
// Variable strides are mangled "s<arg>"; constant linear steps are emitted
// only when not 1, with 'n' marking a negative step.
11685 if (ParamAttr.HasVarStride)
11686 Out << "s" << ParamAttr.StrideOrArg;
11687 else if (ParamAttr.Kind ==
11689 ParamAttr.Kind ==
11691 ParamAttr.Kind ==
11693 ParamAttr.Kind ==
11695 // Don't print the step value if it is not present or if it is
11696 // equal to 1.
11697 if (ParamAttr.StrideOrArg < 0)
11698 Out << 'n' << -ParamAttr.StrideOrArg;
11699 else if (ParamAttr.StrideOrArg != 1)
11700 Out << ParamAttr.StrideOrArg;
11701 }
11702
11703 if (!!ParamAttr.Alignment)
11704 Out << 'a' << ParamAttr.Alignment;
11705 }
11706
11707 return std::string(Out.str());
11708}
11709
// X86 `declare simd` mangler (the signature head at original line 11710, the
// ParamAttrs/Branch parameter line at 11712, the Masked container declaration
// at 11723, the branch-state case labels at 11725/11729/11732, and the
// SmallString declaration at 11738 are missing from this extraction). For
// each mask variant and each x86 ISA level, attaches a "_ZGV..." vector
// variant attribute to Fn per the x86 vector function ABI.
11711 llvm::Function *Fn, unsigned NumElts, const llvm::APSInt &VLENVal,
11713 struct ISADataTy {
11714 char ISA;
11715 unsigned VecRegSize;
11716 };
// ISA letter and vector register width in bits, used to derive the VLEN when
// no explicit simdlen was given.
11717 ISADataTy ISAData[] = {
11718 {'b', 128}, // SSE
11719 {'c', 256}, // AVX
11720 {'d', 256}, // AVX2
11721 {'e', 512}, // AVX512
11722 };
// 'N' = unmasked variant, 'M' = masked variant; which ones are emitted
// depends on the branch state.
11724 switch (Branch) {
11726 Masked.push_back('N');
11727 Masked.push_back('M');
11728 break;
11730 Masked.push_back('N');
11731 break;
11733 Masked.push_back('M');
11734 break;
11735 }
11736 for (char Mask : Masked) {
11737 for (const ISADataTy &Data : ISAData) {
11739 llvm::raw_svector_ostream Out(Buffer);
11740 Out << "_ZGV" << Data.ISA << Mask;
11741 if (!VLENVal) {
11742 assert(NumElts && "Non-zero simdlen/cdtsize expected");
// No simdlen clause: VLEN is register width divided by element count.
11743 Out << llvm::APSInt::getUnsigned(Data.VecRegSize / NumElts);
11744 } else {
11745 Out << VLENVal;
11746 }
11747 Out << mangleVectorParameters(ParamAttrs);
11748 Out << '_' << Fn->getName();
11749 Fn->addFnAttr(Out.str());
11750 }
11751 }
11752}
11753
11754// Function used to add the attribute. The parameter `VLEN` is templated to
11755// allow the use of `x` when targeting scalable functions for SVE.
11756template <typename T>
11757static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix,
11758 char ISA, StringRef ParSeq,
11759 StringRef MangledName, bool OutputBecomesInput,
11760 llvm::Function *Fn) {
11761 SmallString<256> Buffer;
11762 llvm::raw_svector_ostream Out(Buffer);
11763 Out << Prefix << ISA << LMask << VLEN;
11764 if (OutputBecomesInput)
11765 Out << 'v';
11766 Out << ParSeq << '_' << MangledName;
11767 Fn->addFnAttr(Out.str());
11768}
11769
11770// Helper function to generate the Advanced SIMD names depending on the value
11771// of the NDS when simdlen is not present.
11772static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask,
11773 StringRef Prefix, char ISA,
11774 StringRef ParSeq, StringRef MangledName,
11775 bool OutputBecomesInput,
11776 llvm::Function *Fn) {
11777 switch (NDS) {
11778 case 8:
11779 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11780 OutputBecomesInput, Fn);
11781 addAArch64VectorName(16, Mask, Prefix, ISA, ParSeq, MangledName,
11782 OutputBecomesInput, Fn);
11783 break;
11784 case 16:
11785 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11786 OutputBecomesInput, Fn);
11787 addAArch64VectorName(8, Mask, Prefix, ISA, ParSeq, MangledName,
11788 OutputBecomesInput, Fn);
11789 break;
11790 case 32:
11791 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11792 OutputBecomesInput, Fn);
11793 addAArch64VectorName(4, Mask, Prefix, ISA, ParSeq, MangledName,
11794 OutputBecomesInput, Fn);
11795 break;
11796 case 64:
11797 case 128:
11798 addAArch64VectorName(2, Mask, Prefix, ISA, ParSeq, MangledName,
11799 OutputBecomesInput, Fn);
11800 break;
11801 default:
11802 llvm_unreachable("Scalar type is too wide.");
11803 }
11804}
11805
/// Emit vector function attributes for AArch64, as defined in the AAVFABI.
// NOTE(review): this extracted listing is incomplete — the line carrying the
// function's name, a parameter line (embedded numbers jump 11806->11808 and
// 11808->11810), and the `case` labels inside both `switch (Branch)`
// statements below were lost. Presumably the labels distinguish the
// inbranch / notinbranch / unspecified branch kinds — restore from upstream
// before compiling.
11808 llvm::Function *Fn, unsigned UserVLEN,
11810 char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput) {
11811 assert((ISA == 'n' || ISA == 's') && "Expected ISA either 's' or 'n'.");
11812
11813 // Sort out parameter sequence.
11814 const std::string ParSeq = mangleVectorParameters(ParamAttrs);
11815 StringRef Prefix = "_ZGV";
11816 StringRef MangledName = Fn->getName();
11817
11818 // Generate simdlen from user input (if any).
11819 if (UserVLEN) {
11820 if (ISA == 's') {
11821 // SVE generates only a masked function.
11822 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11823 OutputBecomesInput, Fn);
11824 return;
11825 }
11826
// Advanced SIMD with an explicit user simdlen: which of the masked ("M")
// and unmasked ("N") variants get emitted depends on the branch kind.
// NOTE(review): the case labels for this switch were lost in extraction.
11827 switch (Branch) {
11829 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11830 OutputBecomesInput, Fn);
11831 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11832 OutputBecomesInput, Fn);
11833 break;
11835 addAArch64VectorName(UserVLEN, "M", Prefix, ISA, ParSeq, MangledName,
11836 OutputBecomesInput, Fn);
11837 break;
11839 addAArch64VectorName(UserVLEN, "N", Prefix, ISA, ParSeq, MangledName,
11840 OutputBecomesInput, Fn);
11841 break;
11842 }
11843 return;
11844 }
11845
// No user simdlen: SVE uses the scalable-length marker "x" and emits only
// the masked variant.
11846 if (ISA == 's') {
11847 // SVE, section 3.4.1, item 1.
11848 addAArch64VectorName("x", "M", Prefix, ISA, ParSeq, MangledName,
11849 OutputBecomesInput, Fn);
11850 return;
11851 }
11852
// Advanced SIMD without simdlen: derive vector lengths from the narrowest
// data size via addAArch64AdvSIMDNDSNames.
// NOTE(review): the case labels for this switch were also lost in extraction.
11853 switch (Branch) {
11855 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11856 MangledName, OutputBecomesInput, Fn);
11857 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11858 MangledName, OutputBecomesInput, Fn);
11859 break;
11861 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "M", Prefix, ISA, ParSeq,
11862 MangledName, OutputBecomesInput, Fn);
11863 break;
11865 addAArch64AdvSIMDNDSNames(NarrowestDataSize, "N", Prefix, ISA, ParSeq,
11866 MangledName, OutputBecomesInput, Fn);
11867 break;
11868 }
11869}
11870
11871//===----------------------------------------------------------------------===//
11872// OffloadEntriesInfoManager
11873//===----------------------------------------------------------------------===//
11874
// OffloadEntriesInfoManager::empty — true iff neither target-region entries
// nor device-global-variable entries have been recorded.
// NOTE(review): the signature line (embedded number 11875) was lost in
// extraction; restore from upstream.
11876 return OffloadEntriesTargetRegion.empty() &&
11877 OffloadEntriesDeviceGlobalVar.empty();
11878}
11879
11880unsigned OffloadEntriesInfoManager::getTargetRegionEntryInfoCount(
11881 const TargetRegionEntryInfo &EntryInfo) const {
11882 auto It = OffloadEntriesTargetRegionCount.find(
11883 getTargetRegionEntryCountKey(EntryInfo));
11884 if (It == OffloadEntriesTargetRegionCount.end())
11885 return 0;
11886 return It->second;
11887}
11888
11889void OffloadEntriesInfoManager::incrementTargetRegionEntryInfoCount(
11890 const TargetRegionEntryInfo &EntryInfo) {
11891 OffloadEntriesTargetRegionCount[getTargetRegionEntryCountKey(EntryInfo)] =
11892 EntryInfo.Count + 1;
11893}
11894
/// Initialize target region entry.
// Creates a placeholder entry (null address/ID) at the given Order and bumps
// the global entry counter.
// NOTE(review): the signature line (11896) and the trailing flags argument of
// the OffloadEntryInfoTargetRegion constructor (11900) were lost in
// extraction; restore from upstream.
11897 const TargetRegionEntryInfo &EntryInfo, unsigned Order) {
11898 OffloadEntriesTargetRegion[EntryInfo] =
11899 OffloadEntryInfoTargetRegion(Order, /*Addr=*/nullptr, /*ID=*/nullptr,
11901 ++OffloadingEntriesNum;
11902}
11903
// Register address, ID and flags for a target region entry. On the device
// side the entry must already exist (it only gets its fields filled in); on
// the host side a brand-new entry is inserted and the global counter bumped.
// Either way the per-location count is advanced at the end.
// NOTE(review): extraction lost the signature line (11904), the flags
// parameter line (11906), and the first half of the host-side `if` condition
// (11924); restore from upstream.
11905 TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID,
11907 assert(EntryInfo.Count == 0 && "expected default EntryInfo");
11908
11909 // Update the EntryInfo with the next available count for this location.
11910 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11911
11912 // If we are emitting code for a target, the entry is already initialized,
11913 // only has to be registered.
11914 if (OMPBuilder->Config.isTargetDevice()) {
11915 // This could happen if the device compilation is invoked standalone.
11916 if (!hasTargetRegionEntryInfo(EntryInfo)) {
11917 return;
11918 }
11919 auto &Entry = OffloadEntriesTargetRegion[EntryInfo];
11920 Entry.setAddress(Addr);
11921 Entry.setID(ID);
11922 Entry.setFlags(Flags);
11923 } else {
// NOTE(review): the condition introducing the next line was lost (11924).
11925 hasTargetRegionEntryInfo(EntryInfo, /*IgnoreAddressId*/ true))
11926 return;
11927 assert(!hasTargetRegionEntryInfo(EntryInfo) &&
11928 "Target region entry already registered!");
11929 OffloadEntryInfoTargetRegion Entry(OffloadingEntriesNum, Addr, ID, Flags);
11930 OffloadEntriesTargetRegion[EntryInfo] = Entry;
11931 ++OffloadingEntriesNum;
11932 }
11933 incrementTargetRegionEntryInfoCount(EntryInfo);
11934}
11935
// Query whether an entry exists for this location. Returns false when no
// entry is found, or — unless IgnoreAddressId is set — when the entry already
// carries an address or ID (i.e. has been fully registered).
// NOTE(review): the signature line (11936) was lost in extraction; restore
// from upstream.
11937 TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId) const {
11938
11939 // Update the EntryInfo with the next available count for this location.
11940 EntryInfo.Count = getTargetRegionEntryInfoCount(EntryInfo);
11941
11942 auto It = OffloadEntriesTargetRegion.find(EntryInfo);
11943 if (It == OffloadEntriesTargetRegion.end()) {
11944 return false;
11945 }
11946 // Fail if this entry is already registered.
11947 if (!IgnoreAddressId && (It->second.getAddress() || It->second.getID()))
11948 return false;
11949 return true;
11950}
11951
// Invoke \p Action on every recorded (entry-info, entry) pair.
// NOTE(review): the signature line (11952) was lost in extraction; restore
// from upstream.
11953 const OffloadTargetRegionEntryInfoActTy &Action) {
11954 // Scan all target region entries and perform the provided action.
11955 for (const auto &It : OffloadEntriesTargetRegion) {
11956 Action(It.first, It.second);
11957 }
11958}
11959
// Create a placeholder device-global-variable entry keyed by name and bump
// the global entry counter.
// NOTE(review): the signature line (11960) was lost in extraction; restore
// from upstream.
11961 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order) {
11962 OffloadEntriesDeviceGlobalVar.try_emplace(Name, Order, Flags);
11963 ++OffloadingEntriesNum;
11964}
11965
// Register a device global variable. Device side: fill in size/linkage (and
// address if not yet set) on an existing entry. Host side: update an existing
// entry's size/linkage, or insert a new one and bump the counter; the entry's
// stored name depends on a flags comparison whose first half was lost below.
// NOTE(review): extraction lost the signature line (11966), the line carrying
// the Flags/Linkage parameters (11968), and the `if (` opener plus the
// right-hand enum value of the flags comparison (11995/11997); restore from
// upstream.
11967 StringRef VarName, Constant *Addr, int64_t VarSize,
11969 if (OMPBuilder->Config.isTargetDevice()) {
11970 // This could happen if the device compilation is invoked standalone.
11971 if (!hasDeviceGlobalVarEntryInfo(VarName))
11972 return;
11973 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11974 if (Entry.getAddress() && hasDeviceGlobalVarEntryInfo(VarName)) {
11975 if (Entry.getVarSize() == 0) {
11976 Entry.setVarSize(VarSize);
11977 Entry.setLinkage(Linkage);
11978 }
11979 return;
11980 }
11981 Entry.setVarSize(VarSize);
11982 Entry.setLinkage(Linkage);
11983 Entry.setAddress(Addr);
11984 } else {
11985 if (hasDeviceGlobalVarEntryInfo(VarName)) {
11986 auto &Entry = OffloadEntriesDeviceGlobalVar[VarName];
11987 assert(Entry.isValid() && Entry.getFlags() == Flags &&
11988 "Entry not initialized!");
11989 if (Entry.getVarSize() == 0) {
11990 Entry.setVarSize(VarSize);
11991 Entry.setLinkage(Linkage);
11992 }
11993 return;
11994 }
// NOTE(review): this comparison's opening `if (` and right-hand side were
// lost in extraction.
11996 Flags ==
11998 OffloadEntriesDeviceGlobalVar.try_emplace(VarName, OffloadingEntriesNum,
11999 Addr, VarSize, Flags, Linkage,
12000 VarName.str());
12001 else
12002 OffloadEntriesDeviceGlobalVar.try_emplace(
12003 VarName, OffloadingEntriesNum, Addr, VarSize, Flags, Linkage, "");
12004 ++OffloadingEntriesNum;
12005 }
12006}
12007
// Invoke the provided action on every (name, entry) pair of the device
// global variable table.
// NOTE(review): the function's signature lines (12008-12009) were lost in
// extraction; restore from upstream.
12010 // Scan all target region entries and perform the provided action.
12011 for (const auto &E : OffloadEntriesDeviceGlobalVar)
12012 Action(E.getKey(), E.getValue());
12013}
12014
12015//===----------------------------------------------------------------------===//
12016// CanonicalLoopInfo
12017//===----------------------------------------------------------------------===//
12018
12019void CanonicalLoopInfo::collectControlBlocks(
// NOTE(review): the parameter line (12020) was lost in extraction —
// presumably a SmallVectorImpl of BasicBlock pointers; restore from upstream.
12021 // We only count those BBs as control block for which we do not need to
12022 // reverse the CFG, i.e. not the loop body which can contain arbitrary control
12023 // flow. For consistency, this also means we do not add the Body block, which
12024 // is just the entry to the body code.
// Appends the six fixed control blocks of the canonical loop to BBs.
12025 BBs.reserve(BBs.size() + 6);
12026 BBs.append({getPreheader(), Header, Cond, Latch, Exit, getAfter()});
12027}
12028
// CanonicalLoopInfo::getPreheader — the preheader is the unique predecessor
// of the header other than the latch; reaching the end of the loop without
// finding one is a structural violation.
// NOTE(review): the signature line (12029) was lost in extraction; restore
// from upstream.
12030 assert(isValid() && "Requires a valid canonical loop");
12031 for (BasicBlock *Pred : predecessors(Header)) {
12032 if (Pred != Latch)
12033 return Pred;
12034 }
12035 llvm_unreachable("Missing preheader");
12036}
12037
12038void CanonicalLoopInfo::setTripCount(Value *TripCount) {
12039 assert(isValid() && "Requires a valid canonical loop");
12040
12041 Instruction *CmpI = &getCond()->front();
12042 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
12043 CmpI->setOperand(1, TripCount);
12044
12045#ifndef NDEBUG
12046 assertOK();
12047#endif
12048}
12049
12050void CanonicalLoopInfo::mapIndVar(
12051 llvm::function_ref<Value *(Instruction *)> Updater) {
12052 assert(isValid() && "Requires a valid canonical loop");
12053
12054 Instruction *OldIV = getIndVar();
12055
12056 // Record all uses excluding those introduced by the updater. Uses by the
12057 // CanonicalLoopInfo itself to keep track of the number of iterations are
12058 // excluded.
12059 SmallVector<Use *> ReplacableUses;
12060 for (Use &U : OldIV->uses()) {
12061 auto *User = dyn_cast<Instruction>(U.getUser());
12062 if (!User)
12063 continue;
12064 if (User->getParent() == getCond())
12065 continue;
12066 if (User->getParent() == getLatch())
12067 continue;
12068 ReplacableUses.push_back(&U);
12069 }
12070
12071 // Run the updater that may introduce new uses
12072 Value *NewIV = Updater(OldIV);
12073
12074 // Replace the old uses with the value returned by the updater.
12075 for (Use *U : ReplacableUses)
12076 U->set(NewIV);
12077
12078#ifndef NDEBUG
12079 assertOK();
12080#endif
12081}
12082
// CanonicalLoopInfo::assertOK — debug-only structural verification of the
// canonical loop: preheader -> header -> cond -> {body -> latch -> header,
// exit -> after}, plus the shape of the induction-variable PHI (starts at 0,
// incremented by 1 in the latch) and the exit compare (IV u< TripCount).
// NOTE(review): the function's signature line (12083) was lost in
// extraction; restore from upstream.
12084#ifndef NDEBUG
12085 // No constraints if this object currently does not describe a loop.
12086 if (!isValid())
12087 return;
12088
12089 BasicBlock *Preheader = getPreheader();
12090 BasicBlock *Body = getBody();
12091 BasicBlock *After = getAfter();
12092
12093 // Verify standard control-flow we use for OpenMP loops.
12094 assert(Preheader);
12095 assert(isa<UncondBrInst>(Preheader->getTerminator()) &&
12096 "Preheader must terminate with unconditional branch");
12097 assert(Preheader->getSingleSuccessor() == Header &&
12098 "Preheader must jump to header");
12099
12100 assert(Header);
12101 assert(isa<UncondBrInst>(Header->getTerminator()) &&
12102 "Header must terminate with unconditional branch");
12103 assert(Header->getSingleSuccessor() == Cond &&
12104 "Header must jump to exiting block");
12105
12106 assert(Cond);
12107 assert(Cond->getSinglePredecessor() == Header &&
12108 "Exiting block only reachable from header");
12109
12110 assert(isa<CondBrInst>(Cond->getTerminator()) &&
12111 "Exiting block must terminate with conditional branch");
12112 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(0) == Body &&
12113 "Exiting block's first successor jump to the body");
12114 assert(cast<CondBrInst>(Cond->getTerminator())->getSuccessor(1) == Exit &&
12115 "Exiting block's second successor must exit the loop");
12116
12117 assert(Body);
12118 assert(Body->getSinglePredecessor() == Cond &&
12119 "Body only reachable from exiting block");
12120 assert(!isa<PHINode>(Body->front()));
12121
12122 assert(Latch);
12123 assert(isa<UncondBrInst>(Latch->getTerminator()) &&
12124 "Latch must terminate with unconditional branch");
12125 assert(Latch->getSingleSuccessor() == Header && "Latch must jump to header");
12126 // TODO: To support simple redirecting of the end of the body code that has
12127 // multiple; introduce another auxiliary basic block like preheader and after.
12128 assert(Latch->getSinglePredecessor() != nullptr);
12129 assert(!isa<PHINode>(Latch->front()));
12130
12131 assert(Exit);
12132 assert(isa<UncondBrInst>(Exit->getTerminator()) &&
12133 "Exit block must terminate with unconditional branch");
12134 assert(Exit->getSingleSuccessor() == After &&
12135 "Exit block must jump to after block");
12136
12137 assert(After);
12138 assert(After->getSinglePredecessor() == Exit &&
12139 "After block only reachable from exit block");
12140 assert(After->empty() || !isa<PHINode>(After->front()));
12141
// The induction variable is a header PHI: incoming value 0 from the
// preheader is the constant zero, incoming value 1 from the latch is an
// add-of-one of the PHI itself.
12142 Instruction *IndVar = getIndVar();
12143 assert(IndVar && "Canonical induction variable not found?");
12144 assert(isa<IntegerType>(IndVar->getType()) &&
12145 "Induction variable must be an integer");
12146 assert(cast<PHINode>(IndVar)->getParent() == Header &&
12147 "Induction variable must be a PHI in the loop header");
12148 assert(cast<PHINode>(IndVar)->getIncomingBlock(0) == Preheader);
12149 assert(
12150 cast<ConstantInt>(cast<PHINode>(IndVar)->getIncomingValue(0))->isZero());
12151 assert(cast<PHINode>(IndVar)->getIncomingBlock(1) == Latch);
12152
12153 auto *NextIndVar = cast<PHINode>(IndVar)->getIncomingValue(1);
12154 assert(cast<Instruction>(NextIndVar)->getParent() == Latch);
12155 assert(cast<BinaryOperator>(NextIndVar)->getOpcode() == BinaryOperator::Add);
12156 assert(cast<BinaryOperator>(NextIndVar)->getOperand(0) == IndVar);
12157 assert(cast<ConstantInt>(cast<BinaryOperator>(NextIndVar)->getOperand(1))
12158 ->isOne());
12159
12160 Value *TripCount = getTripCount();
12161 assert(TripCount && "Loop trip count not found?");
12162 assert(IndVar->getType() == TripCount->getType() &&
12163 "Trip count and induction variable must have the same type");
12164
12165 auto *CmpI = cast<CmpInst>(&Cond->front());
// NOTE(review): the checked predicate is ICMP_ULT (unsigned less-than) but
// the assert message says "signed" — the message text looks stale.
12166 assert(CmpI->getPredicate() == CmpInst::ICMP_ULT &&
12167 "Exit condition must be a signed less-than comparison");
12168 assert(CmpI->getOperand(0) == IndVar &&
12169 "Exit condition must compare the induction variable");
12170 assert(CmpI->getOperand(1) == TripCount &&
12171 "Exit condition must compare with the trip count");
12172#endif
12173}
12174
// CanonicalLoopInfo::invalidate — clear the recorded control blocks so that
// isValid() no longer reports a loop.
// NOTE(review): the signature line (12175) was lost in extraction; restore
// from upstream.
12176 Header = nullptr;
12177 Cond = nullptr;
12178 Latch = nullptr;
12179 Exit = nullptr;
12180}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Expand Atomic instructions
@ ParamAttr
This file contains the simple types necessary to represent the attributes associated with functions a...
static const Function * getParent(const Value *V)
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Hexagon Common GEP
Hexagon Hardware Loops
This file provides various utilities for inspecting and working with the control flow graph in LLVM I...
This header defines various interfaces for pass management in LLVM.
iv Induction Variable Users
Definition IVUsers.cpp:48
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
static bool isZero(Value *V, const DataLayout &DL, DominatorTree *DT, AssumptionCache *AC)
Definition Lint.cpp:539
static cl::opt< unsigned > TileSize("fuse-matrix-tile-size", cl::init(4), cl::Hidden, cl::desc("Tile size for matrix instruction fusion using square-shaped tiles."))
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
This file contains the declarations for metadata subclasses.
#define T
uint64_t IntrinsicInst * II
#define OMP_KERNEL_ARG_VERSION
Provides definitions for Target specific Grid Values.
static Value * removeASCastIfPresent(Value *V)
static void createTargetLoopWorkshareCall(OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType, BasicBlock *InsertBlock, Value *Ident, Value *LoopBodyArg, Value *TripCount, Function &LoopBodyFn, bool NoLoop)
Value * createFakeIntVal(IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy OuterAllocaIP, llvm::SmallVectorImpl< Instruction * > &ToBeDeleted, OpenMPIRBuilder::InsertPointTy InnerAllocaIP, const Twine &Name="", bool AsPtr=true, bool Is64Bit=false)
static void redirectTo(BasicBlock *Source, BasicBlock *Target, DebugLoc DL)
Make Source branch to Target.
static FunctionCallee getKmpcDistForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void applyParallelAccessesMetadata(CanonicalLoopInfo *CLI, LLVMContext &Ctx, Loop *Loop, LoopInfo &LoopInfo, SmallVector< Metadata * > &LoopMDList)
static void addAArch64VectorName(T VLEN, StringRef LMask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static FunctionCallee getKmpcForDynamicFiniForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for finalizing the dynamic loop using depending on type.
static Expected< Function * > createOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, StringRef FuncName, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void FixupDebugInfoForOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, Function *Func, DenseMap< Value *, std::tuple< Value *, unsigned > > &ValueReplacementMap)
static OMPScheduleType getOpenMPOrderingScheduleType(OMPScheduleType BaseScheduleType, bool HasOrderedClause)
Adds ordering modifier flags to schedule type.
static OMPScheduleType getOpenMPMonotonicityScheduleType(OMPScheduleType ScheduleType, bool HasSimdModifier, bool HasMonotonic, bool HasNonmonotonic, bool HasOrderedClause)
Adds monotonicity modifier flags to schedule type.
static std::string mangleVectorParameters(ArrayRef< llvm::OpenMPIRBuilder::DeclareSimdAttrTy > ParamAttrs)
Mangle the parameter part of the vector function name according to their OpenMP classification.
static void workshareLoopTargetCallback(OpenMPIRBuilder *OMPIRBuilder, CanonicalLoopInfo *CLI, Value *Ident, Function &OutlinedFn, const SmallVector< Instruction *, 4 > &ToBeDeleted, WorksharingLoopType LoopType, bool NoLoop)
static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType)
static llvm::CallInst * emitNoUnwindRuntimeCall(IRBuilder<> &Builder, llvm::FunctionCallee Callee, ArrayRef< llvm::Value * > Args, const llvm::Twine &Name)
static Error populateReductionFunction(Function *ReductionFunc, ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, IRBuilder<> &Builder, ArrayRef< bool > IsByRef, bool IsGPU)
static Function * getFreshReductionFunc(Module &M)
static void raiseUserConstantDataAllocasToEntryBlock(IRBuilderBase &Builder, Function *Function)
static FunctionCallee getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for updating the next loop using OpenMP dynamic scheduling depending...
static bool isConflictIP(IRBuilder<>::InsertPoint IP1, IRBuilder<>::InsertPoint IP2)
Return whether IP1 and IP2 are ambiguous, i.e.
static void checkReductionInfos(ArrayRef< OpenMPIRBuilder::ReductionInfo > ReductionInfos, bool IsGPU)
static Type * getOffloadingArrayType(Value *V)
static OMPScheduleType getOpenMPBaseScheduleType(llvm::omp::ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasDistScheduleChunks)
Determine which scheduling algorithm to use, determined from schedule clause arguments.
static OMPScheduleType computeOpenMPScheduleType(ScheduleKind ClauseKind, bool HasChunks, bool HasSimdModifier, bool HasMonotonicModifier, bool HasNonmonotonicModifier, bool HasOrderedClause, bool HasDistScheduleChunks)
Determine the schedule type using schedule and ordering clause arguments.
static FunctionCallee getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
Returns an LLVM function to call for initializing loop bounds using OpenMP dynamic scheduling dependi...
static StructType * createTaskWithPrivatesTy(OpenMPIRBuilder &OMPIRBuilder, ArrayRef< Value * > OffloadingArraysToPrivatize)
static cl::opt< double > UnrollThresholdFactor("openmp-ir-builder-unroll-threshold-factor", cl::Hidden, cl::desc("Factor for the unroll threshold to account for code " "simplifications still taking place"), cl::init(1.5))
static int32_t computeHeuristicUnrollFactor(CanonicalLoopInfo *CLI)
Heuristically determine the best-performant unroll factor for CLI.
static Value * emitTaskDependencies(OpenMPIRBuilder &OMPBuilder, const SmallVectorImpl< OpenMPIRBuilder::DependData > &Dependencies)
static Error emitTargetOutlinedFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, bool IsOffloadEntry, TargetRegionEntryInfo &EntryInfo, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, Function *&OutlinedFn, Constant *&OutlinedFnID, SmallVectorImpl< Value * > &Inputs, OpenMPIRBuilder::TargetBodyGenCallbackTy &CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy &ArgAccessorFuncCB)
static void updateNVPTXAttr(Function &Kernel, StringRef Name, int32_t Value, bool Min)
static OpenMPIRBuilder::InsertPointTy getInsertPointAfterInstr(Instruction *I)
static void redirectAllPredecessorsTo(BasicBlock *OldTarget, BasicBlock *NewTarget, DebugLoc DL)
Redirect all edges that branch to OldTarget to NewTarget.
static void hoistNonEntryAllocasToEntryBlock(llvm::BasicBlock &Block)
static std::unique_ptr< TargetMachine > createTargetMachine(Function *F, CodeGenOptLevel OptLevel)
Create the TargetMachine object to query the backend for optimization preferences.
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder)
static void addAccessGroupMetadata(BasicBlock *Block, MDNode *AccessGroup, LoopInfo &LI)
Attach llvm.access.group metadata to the memref instructions of Block.
static void addBasicBlockMetadata(BasicBlock *BB, ArrayRef< Metadata * > Properties)
Attach metadata Properties to the basic block described by BB.
static void restoreIPandDebugLoc(llvm::IRBuilderBase &Builder, llvm::IRBuilderBase::InsertPoint IP)
This is wrapper over IRBuilderBase::restoreIP that also restores the current debug location to the la...
static LoadInst * loadSharedDataFromTaskDescriptor(OpenMPIRBuilder &OMPIRBuilder, IRBuilderBase &Builder, Value *TaskWithPrivates, Type *TaskWithPrivatesTy)
Given a task descriptor, TaskWithPrivates, return the pointer to the block of pointers containing sha...
static cl::opt< bool > OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden, cl::desc("Use optimistic attributes describing " "'as-if' properties of runtime calls."), cl::init(false))
static bool hasGridValue(const Triple &T)
static FunctionCallee getKmpcForStaticLoopForType(Type *Ty, OpenMPIRBuilder *OMPBuilder, WorksharingLoopType LoopType)
static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl< Value * > &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait, Value *DynCGroupMem, OMPDynGroupprivateFallbackType DynCGroupMemFallback)
static const omp::GV & getGridValue(const Triple &T, Function *Kernel)
static void addAArch64AdvSIMDNDSNames(unsigned NDS, StringRef Mask, StringRef Prefix, char ISA, StringRef ParSeq, StringRef MangledName, bool OutputBecomesInput, llvm::Function *Fn)
static Function * emitTargetTaskProxyFunction(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, CallInst *StaleCI, StructType *PrivatesTy, StructType *TaskWithPrivatesTy, const size_t NumOffloadingArrays, const int SharedArgsOperandNo)
Create an entry point for a target task with the following.
static void addLoopMetadata(CanonicalLoopInfo *Loop, ArrayRef< Metadata * > Properties)
Attach loop metadata Properties to the loop described by Loop.
static void removeUnusedBlocksFromParent(ArrayRef< BasicBlock * > BBs)
Determine which blocks in BBs are reachable from outside and remove the ones that are not reachable f...
static void targetParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, BasicBlock *OuterAllocaBB, Value *Ident, Value *IfCondition, Value *NumThreads, Instruction *PrivTID, AllocaInst *PrivTIDAddr, Value *ThreadID, const SmallVector< Instruction *, 4 > &ToBeDeleted)
static void hostParallelCallback(OpenMPIRBuilder *OMPIRBuilder, Function &OutlinedFn, Function *OuterFn, Value *Ident, Value *IfCondition, Instruction *PrivTID, AllocaInst *PrivTIDAddr, const SmallVector< Instruction *, 4 > &ToBeDeleted)
#define P(N)
FunctionAnalysisManager FAM
Function * Fun
This file defines the Pass Instrumentation classes that provide instrumentation points into the pass ...
const SmallVectorImpl< MachineOperand > & Cond
Remove Loads Into Fake Uses
static bool isValid(const char C)
Returns true if C is a valid mangled character: <0-9a-zA-Z_>.
std::unordered_set< BasicBlock * > BlockSet
This file implements the SmallBitVector class.
This file defines the SmallSet class.
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:114
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Defines the virtual file system interface vfs::FileSystem.
Value * RHS
Value * LHS
static cl::opt< unsigned > MaxThreads("xcore-max-threads", cl::Optional, cl::desc("Maximum number of threads (for emulation thread-local storage)"), cl::Hidden, cl::value_desc("number"), cl::init(8))
static const uint32_t IV[8]
Definition blake3_impl.h:83
The Input class is used to parse a yaml document into in-memory structs and vectors.
Class for arbitrary precision integers.
Definition APInt.h:78
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
static APSInt getUnsigned(uint64_t X)
Definition APSInt.h:349
This class represents a conversion between pointers from one address space to another.
an instruction to allocate memory on the stack
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
PointerType * getType() const
Overload to return most specific pointer type.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
unsigned getAddressSpace() const
Return the address space for the allocation.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
LLVM_ABI bool isArrayAllocation() const
Return true if there is an allocation size parameter to the allocation instruction that is not 1.
void setAlignment(Align Align)
const Value * getArraySize() const
Get the number of elements allocated.
bool registerPass(PassBuilderT &&PassBuilder)
Register an analysis pass with the manager.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
unsigned getArgNo() const
Return the index of this formal argument in its containing function.
Definition Argument.h:50
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Class to represent array types.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
A function analysis which provides an AssumptionCache.
LLVM_ABI AssumptionCache run(Function &F, FunctionAnalysisManager &)
A cache of @llvm.assume calls within a function.
An instruction that atomically checks whether a specified value is in a memory location,...
static AtomicOrdering getStrongestFailureOrdering(AtomicOrdering SuccessOrdering)
Returns the strongest permitted ordering on failure, given the desired ordering on success.
LLVM_ABI std::pair< LoadInst *, AllocaInst * > EmitAtomicLoadLibcall(AtomicOrdering AO)
Definition Atomic.cpp:107
LLVM_ABI void EmitAtomicStoreLibcall(AtomicOrdering AO, Value *Source)
Definition Atomic.cpp:148
an instruction that atomically reads a memory location, combines it with another value,...
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ FMinimum
*p = minimum(old, v) minimum matches the behavior of llvm.minimum.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FMaximum
*p = maximum(old, v) maximum matches the behavior of llvm.maximum.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMaximumNum
*p = maximumnum(old, v) maximumnum matches the behavior of llvm.maximumnum.
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ FMinimumNum
*p = minimumnum(old, v) minimumnum matches the behavior of llvm.minimumnum.
@ Nand
*p = ~(old & v)
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const
Add attributes to the attribute set.
LLVM_ABI AttributeSet addAttribute(LLVMContext &C, Attribute::AttrKind Kind) const
Add an argument attribute.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
LLVM_ABI void replaceSuccessorsPhiUsesWith(BasicBlock *Old, BasicBlock *New)
Update all phi nodes in this basic block's successors to refer to basic block New instead of basic bl...
iterator end()
Definition BasicBlock.h:474
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
reverse_iterator rbegin()
Definition BasicBlock.h:477
bool hasTerminator() const LLVM_READONLY
Returns whether the block has a terminator.
Definition BasicBlock.h:232
bool empty() const
Definition BasicBlock.h:483
const Instruction & back() const
Definition BasicBlock.h:486
LLVM_ABI BasicBlock * splitBasicBlockBefore(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction and insert the new basic blo...
LLVM_ABI InstListType::const_iterator getFirstNonPHIIt() const
Returns an iterator to the first instruction in this block that is not a PHINode instruction.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
LLVM_ABI InstListType::const_iterator getFirstNonPHIOrDbg(bool SkipPseudoOp=true) const
Returns a pointer to the first instruction in this block that is not a PHINode or a debug intrinsic,...
LLVM_ABI const BasicBlock * getUniqueSuccessor() const
Return the successor of this block if it has a unique successor.
LLVM_ABI const BasicBlock * getSinglePredecessor() const
Return the predecessor of this block if it has a single predecessor block.
const Instruction & front() const
Definition BasicBlock.h:484
InstListType::reverse_iterator reverse_iterator
Definition BasicBlock.h:172
LLVM_ABI const BasicBlock * getUniquePredecessor() const
Return the predecessor of this block if it has a unique predecessor block.
const Instruction * getTerminatorOrNull() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
Definition BasicBlock.h:248
LLVM_ABI const BasicBlock * getSingleSuccessor() const
Return the successor of this block if it has a single successor.
LLVM_ABI SymbolTableList< BasicBlock >::iterator eraseFromParent()
Unlink 'this' from the containing function and delete it.
reverse_iterator rend()
Definition BasicBlock.h:479
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
LLVM_ABI LLVMContext & getContext() const
Get the context in which this basic block lives.
void moveBefore(BasicBlock *MovePos)
Unlink this basic block from its current function and insert it into the function that MovePos lives ...
Definition BasicBlock.h:388
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
void splice(BasicBlock::iterator ToIt, BasicBlock *FromBB)
Transfer all instructions from FromBB to this basic block at ToIt.
Definition BasicBlock.h:659
LLVM_ABI void removePredecessor(BasicBlock *Pred, bool KeepOneInputPHIs=false)
Update PHI nodes in this BasicBlock before removal of predecessor Pred.
void setDoesNotThrow()
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
User::op_iterator arg_begin()
Return the iterator pointing to the beginning of the argument list.
Value * getArgOperand(unsigned i) const
User::op_iterator arg_end()
Return the iterator pointing to the end of the argument list.
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Class to represent the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
LLVM_ABI void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
void setLastIter(Value *IterVar)
Sets the last iteration variable for this loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
LLVM_ABI void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
LLVM_ABI BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:706
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:699
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ ICMP_ULE
unsigned less or equal
Definition InstrTypes.h:702
A cache for the CodeExtractor analysis.
Utility class for extracting code into a new function.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:859
static LLVM_ABI Constant * getString(LLVMContext &Context, StringRef Initializer, bool AddNull=true, bool ByteString=false)
This method constructs a CDS and initializes it with a text string.
static LLVM_ABI Constant * getPointerCast(Constant *C, Type *Ty)
Create a BitCast, AddrSpaceCast, or a PtrToInt cast constant expression.
static LLVM_ABI Constant * getTruncOrBitCast(Constant *C, Type *Ty)
static LLVM_ABI Constant * getPointerBitCastOrAddrSpaceCast(Constant *C, Type *Ty)
Create a BitCast or AddrSpaceCast for a pointer type depending on the address space.
static LLVM_ABI Constant * getSizeOf(Type *Ty)
getSizeOf constant expr - computes the (alloc) size of a type (in address-units, not bits) in a targe...
static LLVM_ABI Constant * getAddrSpaceCast(Constant *C, Type *Ty, bool OnlyIfReduced=false)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static ConstantInt * getSigned(IntegerType *Ty, int64_t V, bool ImplicitTrunc=false)
Return a ConstantInt with the specified value for the specified type.
Definition Constants.h:135
static LLVM_ABI ConstantPointerNull * get(PointerType *T)
Static factory methods - Return objects of the specified value.
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
DILocalScope * getScope() const
Get the local scope for this variable.
DINodeArray getAnnotations() const
DIFile * getFile() const
Subprogram description. Uses SubclassData1.
Base class for types.
uint32_t getAlignInBits() const
DIFile * getFile() const
DIType * getType() const
unsigned getLine() const
StringRef getName() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
Record of a variable value-assignment, aka a non instruction representation of the dbg....
A debug info location.
Definition DebugLoc.h:123
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
LLVM_ABI DominatorTree run(Function &F, FunctionAnalysisManager &)
Run the analysis pass over a function and produce a dominator tree.
bool properlyDominates(const DomTreeNodeBase< NodeT > *A, const DomTreeNodeBase< NodeT > *B) const
properlyDominates - Returns true iff A dominates B and A != B.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Represents either an error or a value T.
Definition ErrorOr.h:56
Lightweight error class with error context and mandatory checking.
Definition Error.h:159
static ErrorSuccess success()
Create a success value.
Definition Error.h:336
Tagged union holding either a T or a Error.
Definition Error.h:485
Error takeError()
Take ownership of the stored error.
Definition Error.h:612
reference get()
Returns a reference to the stored T value.
Definition Error.h:582
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:638
static Function * Create(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace, const Twine &N="", Module *M=nullptr)
Definition Function.h:168
const BasicBlock & getEntryBlock() const
Definition Function.h:809
Argument * arg_iterator
Definition Function.h:73
bool empty() const
Definition Function.h:859
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
void removeFromParent()
removeFromParent - This method unlinks 'this' from the containing module, but does not delete it.
Definition Function.cpp:445
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
AttributeList getAttributes() const
Return the attribute list for this Function.
Definition Function.h:354
const Function & getFunction() const
Definition Function.h:166
iterator begin()
Definition Function.h:853
arg_iterator arg_begin()
Definition Function.h:868
void setAttributes(AttributeList Attrs)
Set the attribute list for this Function.
Definition Function.h:357
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
adds the attribute to the list of attributes for the given arg.
Definition Function.cpp:666
Function::iterator insert(Function::iterator Position, BasicBlock *BB)
Insert BB in the basic block list at Position.
Definition Function.h:755
size_t arg_size() const
Definition Function.h:901
Type * getReturnType() const
Returns the type of the ret val.
Definition Function.h:216
iterator end()
Definition Function.h:855
void setCallingConv(CallingConv::ID CC)
Definition Function.h:276
Argument * getArg(unsigned i) const
Definition Function.h:886
bool hasMetadata() const
Return true if this value has any metadata attached to it.
Definition Value.h:608
LLVM_ABI void addMetadata(unsigned KindID, MDNode &MD)
Add a metadata attachment.
LinkageTypes getLinkage() const
void setLinkage(LinkageTypes LT)
Module * getParent()
Get the module that this global value is contained inside of...
void setDSOLocal(bool Local)
PointerType * getType() const
Global values are always pointers.
@ HiddenVisibility
The GV is hidden.
Definition GlobalValue.h:69
@ ProtectedVisibility
The GV is protected.
Definition GlobalValue.h:70
void setVisibility(VisibilityTypes V)
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition GlobalValue.h:52
@ PrivateLinkage
Like Internal, but omit from symbol table.
Definition GlobalValue.h:61
@ CommonLinkage
Tentative definitions.
Definition GlobalValue.h:63
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ WeakODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:58
@ WeakAnyLinkage
Keep one copy of named function when linking (weak)
Definition GlobalValue.h:57
@ AppendingLinkage
Special purpose, only applies to global arrays.
Definition GlobalValue.h:59
@ LinkOnceODRLinkage
Same, but only replaced by something equivalent.
Definition GlobalValue.h:56
Type * getValueType() const
InsertPoint - A saved insertion point.
Definition IRBuilder.h:291
BasicBlock * getBlock() const
Definition IRBuilder.h:306
bool isSet() const
Returns true if this insert point is set.
Definition IRBuilder.h:304
BasicBlock::iterator getPoint() const
Definition IRBuilder.h:307
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
InsertPoint saveIP() const
Returns the current insert point.
Definition IRBuilder.h:311
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition IRBuilder.h:323
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
LLVM_ABI const DebugLoc & getStableDebugLoc() const
Fetch the debug location for this node, unless this is a debug intrinsic, in which case fetch the deb...
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
LLVM_ABI unsigned getNumSuccessors() const LLVM_READONLY
Return the number of successors that this instruction has.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
LLVM_ABI void moveBefore(InstListType::iterator InsertPos)
Unlink this instruction from its current basic block and insert it into the basic block that MovePos ...
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI BasicBlock * getSuccessor(unsigned Idx) const LLVM_READONLY
Return the specified successor. This instruction must be a terminator.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void moveBeforePreserving(InstListType::iterator MovePos)
Perform a moveBefore operation, while signalling that the caller intends to preserve the original ord...
void setDebugLoc(DebugLoc Loc)
Set the debug location information for this instruction.
LLVM_ABI void insertAfter(Instruction *InsertPos)
Insert an unlinked instruction into a basic block immediately after the specified instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
Align getAlign() const
Return the alignment of the access that is being performed.
Analysis pass that exposes the LoopInfo for a function.
Definition LoopInfo.h:569
LLVM_ABI LoopInfo run(Function &F, FunctionAnalysisManager &AM)
Definition LoopInfo.cpp:990
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
LoopT * getLoopFor(const BlockT *BB) const
Return the inner most loop that BB lives in.
This class represents a loop nest and can be used to query its properties.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
LLVM_ABI MDNode * createCallbackEncoding(unsigned CalleeArgNo, ArrayRef< int > Arguments, bool VarArgsArePassed)
Return metadata describing a callback (see llvm::AbstractCallSite).
Metadata node.
Definition Metadata.h:1080
LLVM_ABI void replaceOperandWith(unsigned I, Metadata *New)
Replace a specific operand.
static MDTuple * getDistinct(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1580
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1442
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI MDString * get(LLVMContext &Context, StringRef Str)
Definition Metadata.cpp:614
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:36
size_type size() const
Definition MapVector.h:56
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:278
A tuple of MDNodes.
Definition Metadata.h:1760
iterator_range< op_iterator > operands()
Definition Metadata.h:1856
LLVM_ABI void addOperand(MDNode *M)
Class that manages information about offload code regions and data.
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
LLVM_ABI void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
LLVM_ABI void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
LLVM_ABI void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
LLVM_ABI void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry.
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
@ OMPTargetGlobalVarEntryLink
Mark the entry as a 'declare target link' variable.
@ OMPTargetGlobalVarEntryTo
Mark the entry as a 'declare target to' variable.
@ OMPTargetGlobalVarEntryIndirectVTable
Mark the entry as a declare target indirect vtable.
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
Applies action Action on all registered entries.
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
LLVM_ABI bool empty() const
Return true if a there are no entries defined.
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
LLVM_ABI int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
LLVM_ABI void setHasRequiresReverseOffload(bool Value)
LLVM_ABI bool hasRequiresUnifiedSharedMemory() const
LLVM_ABI void setHasRequiresUnifiedSharedMemory(bool Value)
unsigned getDefaultTargetAS() const
LLVM_ABI bool hasRequiresDynamicAllocators() const
LLVM_ABI void setHasRequiresUnifiedAddress(bool Value)
LLVM_ABI void setHasRequiresDynamicAllocators(bool Value)
LLVM_ABI bool hasRequiresReverseOffload() const
LLVM_ABI bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
An interface to create LLVM-IR for OpenMP directives.
LLVM_ABI InsertPointOrErrorTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for 'omp ordered [threads | simd]'.
LLVM_ABI void emitAArch64DeclareSimdFunction(llvm::Function *Fn, unsigned VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch, char ISA, unsigned NarrowestDataSize, bool OutputBecomesInput)
Emit AArch64 vector-function ABI attributes for a declare simd function.
LLVM_ABI Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
LLVM_ABI FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for 'omp cancel'.
std::function< Expected< Function * >(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
ReductionGenCBKind
Enum class for the ReductionGen callback type to be used.
LLVM_ABI CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
LLVM_ABI void createTaskyield(const LocationDescription &Loc)
Generator for 'omp taskyield'.
std::function< Error(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
LLVM_ABI void emitBranch(BasicBlock *Target)
LLVM_ABI Error emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective)
Generate control flow and cleanup for cancellation.
static LLVM_ABI void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
LLVM_ABI Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attributes of the function. Returns the FunctionID.
LLVM_ABI GlobalVariable * emitKernelExecutionMode(StringRef KernelName, omp::OMPTgtExecModeFlags Mode)
Emit the kernel execution mode.
LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for #omp distribute
LLVM_ABI void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
LLVM_ABI void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
LLVM_ABI InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic write for : X = Expr — Only Scalar data types.
LLVM_ABI void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
function_ref< MapInfosTy &(InsertPointTy CodeGenIP)> GenMapInfoCallbackTy
Callback type for creating the map infos for the kernel parameters.
LLVM_ABI Error emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
function_ref< Error(InsertPointTy CodeGenIP, Value *IndVar)> LoopBodyGenCallbackTy
Callback type for loop body code generation.
LLVM_ABI InsertPointOrErrorTy emitScanReduction(const LocationDescription &Loc, ArrayRef< llvm::OpenMPIRBuilder::ReductionInfo > ReductionInfos, ScanInfo *ScanRedInfo)
This function performs the scan reduction of the values updated in the input phase.
LLVM_ABI void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
static LLVM_ABI std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
LLVM_ABI CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
LLVM_ABI Error emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
function_ref< InsertPointOrErrorTy( Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetGenArgAccessorsCallbackTy
LLVM_ABI void emitUsed(StringRef Name, ArrayRef< llvm::WeakTrackingVH > List)
Emit the llvm.used metadata.
LLVM_ABI InsertPointOrErrorTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for 'omp single'.
LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
LLVM_ABI void createTaskwait(const LocationDescription &Loc)
Generator for 'omp taskwait'.
LLVM_ABI llvm::StructType * getKmpTaskAffinityInfoTy()
Return the LLVM struct type matching runtime kmp_task_affinity_info_t.
LLVM_ABI CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
LLVM_ABI std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Create a name using the platform-specific separators.
LLVM_ABI FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static LLVM_ABI void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
function_ref< Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
LLVM_ABI InsertPointOrErrorTy createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for 'omp parallel'.
LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
LLVM_ABI void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
LLVM_ABI Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
LLVM_ABI Error emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
LLVM_ABI InsertPointOrErrorTy createIteratorLoop(LocationDescription Loc, llvm::Value *TripCount, IteratorBodyGenTy BodyGen, llvm::StringRef Name="iterator")
Create a canonical iterator loop at the current insertion point.
LLVM_ABI Expected< SmallVector< llvm::CanonicalLoopInfo * > > createCanonicalScanLoops(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, InsertPointTy ComputeIP, const Twine &Name, ScanInfo *ScanRedInfo)
Generator for the control flow structure of an OpenMP canonical loops if the parent directive has an ...
LLVM_ABI FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
function_ref< Error(Value *DeviceID, Value *RTLoc, IRBuilderBase::InsertPoint TargetTaskAllocaIP)> TargetTaskBodyCallbackTy
Callback type for generating the bodies of device directives that require outer target tasks (e....
Expected< MapInfosTy & > MapInfosOrErrorTy
LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
LLVM_ABI InsertPointOrErrorTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={}, AffinityData Affinities={}, bool Mergeable=false, Value *EventHandle=nullptr, Value *Priority=nullptr)
Generator for #omp task
LLVM_ABI void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
function_ref< Expected< Function * >(unsigned int)> CustomMapperCallbackTy
LLVM_ABI InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?
LLVM_ABI InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for 'omp ordered depend (source | sink)'.
LLVM_ABI InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
LLVM_ABI bool isFinalized()
Check whether the finalize function has already run.
function_ref< InsertPointOrErrorTy( InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> TargetBodyGenCallbackTy
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
LLVM_ABI std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
LLVM_ABI CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
LLVM_ABI void finalize(Function *Fn=nullptr)
Finalize the underlying module, e.g., by outlining regions.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
LLVM_ABI Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
LLVM_ABI InsertPointTy createTargetInit(const LocationDescription &Loc, const llvm::OpenMPIRBuilder::TargetKernelDefaultAttrs &Attrs)
The omp target interface.
LLVM_ABI InsertPointOrErrorTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false)
Generator for 'omp reduction'.
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
LLVM_ABI CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
LLVM_ABI FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
LLVM_ABI void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp section'.
function_ref< InsertPointOrErrorTy(InsertPointTy)> EmitFallbackCallbackTy
Callback function type for functions emitting the host fallback code that is executed when the kernel...
static LLVM_ABI TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, vfs::FileSystem &VFS, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
LLVM_ABI Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for 'omp master'.
LLVM_ABI InsertPointOrErrorTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointOrErrorTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for 'omp target data'.
CallInst * createRuntimeFunctionCall(FunctionCallee Callee, ArrayRef< Value * > Args, StringRef Name="")
LLVM_ABI InsertPointOrErrorTy emitKernelLaunch(const LocationDescription &Loc, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
LLVM_ABI InsertPointOrErrorTy createCancellationPoint(const LocationDescription &Loc, omp::Directive CanceledDirective)
Generator for 'omp cancellation point'.
LLVM_ABI FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< llvm::Value * > ScanVars, ArrayRef< llvm::Type * > ScanVarsType, bool IsInclusive, ScanInfo *ScanRedInfo)
This directive split and directs the control flow to input phase blocks or scan phase blocks based on...
LLVM_ABI CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
LLVM_ABI GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, std::optional< unsigned > AddressSpace={})
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
LLVM_ABI GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
std::forward_list< ScanInfo > ScanInfos
Collection of owned ScanInfo objects that eventually need to be free'd.
static LLVM_ABI void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
LLVM_ABI Value * calculateCanonicalLoopTripCount(const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop, const Twine &Name="loop")
Calculate the trip count of a canonical loop.
LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
LLVM_ABI void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
LLVM_ABI Error emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
LLVM_ABI Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for 'omp critical'.
LLVM_ABI void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static LLVM_ABI unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
LLVM_ABI unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop, bool NoLoop=false, bool HasDistSchedule=false, Value *DistScheduleChunkSize=nullptr)
Modifies the canonical loop to be a workshare loop.
LLVM_ABI InsertPointOrErrorTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
LLVM_ABI void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
LLVM_ABI void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
LLVM_ABI InsertPointOrErrorTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr, bool IsIgnoreDenormalMode=false, bool IsFineGrainedMemory=false, bool IsRemoteMemory=false)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
LLVM_ABI GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
LLVM_ABI CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
LLVM_ABI InsertPointOrErrorTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, CustomMapperCallbackTy CustomMapperCB, const SmallVector< DependData > &Dependencies, bool HasNowait=false, Value *DynCGroupMem=nullptr, omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback=omp::OMPDynGroupprivateFallbackType::Abort)
Generator for 'omp target'.
LLVM_ABI GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
LLVM_ABI void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for 'omp masked'.
LLVM_ABI Expected< CanonicalLoopInfo * > createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< Expected< InsertPointTy >( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr, Value *SrcPtr)> TaskDupCallbackTy
Callback type for task duplication function code generation.
LLVM_ABI Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
llvm::function_ref< llvm::Error( InsertPointTy BodyIP, llvm::Value *LinearIV)> IteratorBodyGenTy
LLVM_ABI InsertPointOrErrorTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false, bool IsTeamsReduction=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
LLVM_ABI Expected< Function * > emitUserDefinedMapper(function_ref< MapInfosOrErrorTy(InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)> PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB)
Emit the user-defined mapper function.
LLVM_ABI FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
LLVM_ABI void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
BodyGenTy
Type of BodyGen to use for region codegen.
LLVM_ABI CanonicalLoopInfo * fuseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops)
Fuse a sequence of loops.
LLVM_ABI void emitX86DeclareSimdFunction(llvm::Function *Fn, unsigned NumElements, const llvm::APSInt &VLENVal, llvm::ArrayRef< DeclareSimdAttrTy > ParamAttrs, DeclareSimdBranch Branch)
Emit x86 vector-function ABI attributes for a declare simd function.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static LLVM_ABI std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
Expected< InsertPointTy > InsertPointOrErrorTy
Type used to represent an insertion point or an error value.
LLVM_ABI InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
LLVM_ABI InsertPointOrErrorTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for 'omp sections'.
LLVM_ABI InsertPointOrErrorTy emitTargetTask(TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc, OpenMPIRBuilder::InsertPointTy AllocaIP, const SmallVector< llvm::OpenMPIRBuilder::DependData > &Dependencies, const TargetDataRTArgs &RTArgs, bool HasNoWait)
Generate a target-task for the target construct.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
LLVM_ABI Expected< ScanInfo * > scanInfoInitialize()
Creates a ScanInfo object, allocates and returns the pointer.
LLVM_ABI InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO, InsertPointTy AllocaIP)
Emit atomic Read for : V = X — Only Scalar data types.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
LLVM_ABI void createFlush(const LocationDescription &Loc)
Generator for 'omp flush'.
LLVM_ABI Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
unsigned getOpcode() const
Return the opcode for this Instruction or ConstantExpr.
Definition Operator.h:43
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
Pseudo-analysis pass that exposes the PassInstrumentation to pass managers.
Class to represent pointers.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
PostDominatorTree Class - Concrete subclass of DominatorTree that is used to compute the post-dominat...
Analysis pass that exposes the ScalarEvolution for a function.
LLVM_ABI ScalarEvolution run(Function &F, FunctionAnalysisManager &AM)
The main scalar evolution driver.
ScanInfo holds the information to assist in lowering of Scan reduction.
llvm::SmallDenseMap< llvm::Value *, llvm::Value * > * ScanBuffPtrs
Maps the private reduction variable to the pointer of the temporary buffer.
llvm::BasicBlock * OMPScanLoopExit
Exit block of loop body.
llvm::Value * IV
Keeps track of value of iteration variable for input/scan loop to be used for Scan directive lowering...
llvm::BasicBlock * OMPAfterScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanInit
Block before loop body where scan initializations are done.
llvm::BasicBlock * OMPBeforeScanBlock
Dominates the body of the loop before scan directive.
llvm::BasicBlock * OMPScanFinish
Block after loop body where scan finalizations are done.
llvm::Value * Span
Stores the span of canonical loop being lowered to be used for temporary buffer allocation or Finaliz...
bool OMPFirstScanLoop
If true, it indicates Input phase is lowered; else it indicates ScanPhase is lowered.
llvm::BasicBlock * OMPScanDispatch
Controls the flow to before or after scan blocks.
A vector that has set insertion semantics.
Definition SetVector.h:57
bool remove_if(UnaryPredicate P)
Remove items from the set vector based on a predicate function.
Definition SetVector.h:230
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
SmallBitVector & set()
bool test(unsigned Idx) const
bool all() const
Returns true if all bits are set.
bool any() const
Returns true if any bit is set.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
iterator end() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
iterator begin() const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:134
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
void append(StringRef RHS)
Append from a StringRef.
Definition SmallString.h:68
StringRef str() const
Explicit conversion to StringRef.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAlignment(Align Align)
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition StringMap.h:133
ValueTy lookup(StringRef Key) const
lookup - Return the entry for the specified key, or a default constructed value if no such entry exis...
Definition StringMap.h:260
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::string str() const
str - Get the contents as an std::string.
Definition StringRef.h:222
constexpr bool empty() const
empty - Check if the string is empty.
Definition StringRef.h:140
constexpr size_t size() const
size - Get the string size.
Definition StringRef.h:143
size_t count(char C) const
Return the number of occurrences of C in the string.
Definition StringRef.h:471
bool ends_with(StringRef Suffix) const
Check if this string ends with the given Suffix.
Definition StringRef.h:270
StringRef drop_back(size_t N=1) const
Return a StringRef equal to 'this' but with the last N elements dropped.
Definition StringRef.h:636
Class to represent struct types.
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:483
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:689
Type * getElementType(unsigned N) const
Multiway switch.
LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest)
Add an entry to the switch instruction.
Analysis pass providing the TargetTransformInfo.
LLVM_ABI Result run(const Function &F, FunctionAnalysisManager &)
Analysis pass providing the TargetLibraryInfo.
Target - Wrapper for Target specific information.
TargetMachine * createTargetMachine(const Triple &TT, StringRef CPU, StringRef Features, const TargetOptions &Options, std::optional< Reloc::Model > RM, std::optional< CodeModel::Model > CM=std::nullopt, CodeGenOptLevel OL=CodeGenOptLevel::Default, bool JIT=false) const
createTargetMachine - Create a target specific machine implementation for the specified Triple.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
bool isPPC() const
Tests whether the target is PowerPC (32- or 64-bit LE or BE).
Definition Triple.h:1099
bool isX86() const
Tests whether the target is x86 (32- or 64-bit).
Definition Triple.h:1161
bool isWasm() const
Tests whether the target is wasm (32- and 64-bit).
Definition Triple.h:1177
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
LLVM_ABI std::string str() const
Return the twine contents as a std::string.
Definition Twine.cpp:17
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
LLVM_ABI Type * getStructElementType(unsigned N) const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:284
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:286
bool isStructTy() const
True if this is an instance of StructType.
Definition Type.h:278
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:310
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
Unconditional Branch instruction.
static UncondBrInst * Create(BasicBlock *Target, InsertPosition InsertBefore=nullptr)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
This function has undefined behavior.
Produce an estimate of the unrolled cost of the specified loop.
Definition UnrollLoop.h:150
LLVM_ABI bool canUnroll() const
Whether it is legal to unroll this loop.
uint64_t getRolledLoopSize() const
Definition UnrollLoop.h:166
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
LLVM_ABI void setName(const Twine &Name)
Change the name of the value.
Definition Value.cpp:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
User * user_back()
Definition Value.h:413
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:967
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:150
LLVM_ABI User * getUniqueUndroppableUser()
Return true if there is exactly one unique user of this value that cannot be dropped (that user can h...
Definition Value.cpp:188
LLVM_ABI const Value * stripPointerCasts() const
Strip off pointer casts, all-zero GEPs and address space casts.
Definition Value.cpp:713
bool use_empty() const
Definition Value.h:347
user_iterator user_end()
Definition Value.h:411
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
iterator_range< use_iterator > uses()
Definition Value.h:381
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
An efficient, type-erasing, non-owning reference to a callable.
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
A raw_ostream that writes to an SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
The virtual file system interface.
llvm::ErrorOr< std::unique_ptr< llvm::MemoryBuffer > > getBufferForFile(const Twine &Name, int64_t FileSize=-1, bool RequiresNullTerminator=true, bool IsVolatile=false, bool IsText=true)
This is a convenience method that opens a file, gets its content and then closes the file.
virtual llvm::ErrorOr< Status > status(const Twine &Path)=0
Get the status of the entry at Path, if one exists.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ PTX_Kernel
Call to a PTX kernel. Passes all arguments in parameter space.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
constexpr StringLiteral MaxNTID("nvvm.maxntid")
constexpr StringLiteral MaxClusterRank("nvvm.maxclusterrank")
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
LLVM_ABI GlobalVariable * emitOffloadingEntry(Module &M, object::OffloadKind Kind, Constant *Addr, StringRef Name, uint64_t Size, uint32_t Flags, uint64_t Data, Constant *AuxAddr=nullptr, StringRef SectionName="llvm_offload_entries")
Create an offloading section struct used to register this global at runtime.
Definition Utility.cpp:87
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
@ OMP_MAP_PTR_AND_OBJ
The element being mapped is a pointer-pointee pair; both the pointer and the pointee should be mapped...
@ OMP_MAP_MEMBER_OF
The 16 MSBs of the flags indicate whether the entry is member of some struct/class.
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
constexpr const GV & getAMDGPUGridValues()
static constexpr GV SPIRVGridValues
For generic SPIR-V GPUs.
OMPDynGroupprivateFallbackType
The fallback types for the dyn_groupprivate clause.
static constexpr GV NVPTXGridValues
For Nvidia GPUs.
Function * Kernel
Summary of a kernel (=entry point for target offloading).
Definition OpenMPOpt.h:21
WorksharingLoopType
A type of worksharing loop construct.
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
friend class Instruction
Iterator for Instructions in a `BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
LLVM_ABI BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition DWP.cpp:532
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iteratable types.
Definition STLExtras.h:831
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
hash_code hash_value(const FixedPointSemantics &Val)
LLVM_ABI Expected< std::unique_ptr< Module > > parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context, ParserCallbacks Callbacks={})
Read the specified bitcode file, returning the module.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:841
LLVM_ABI BasicBlock * CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix="", Function *F=nullptr, ClonedCodeInfo *CodeInfo=nullptr, bool MapAtoms=true)
Return a copy of the specified basic block, but without embedding the block into a particular functio...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
unsigned getPointerAddressSpace(const Type *T)
Definition SPIRVUtils.h:374
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
FunctionAddr VTableAddr uintptr_t uintptr_t Int32Ty
Definition InstrProf.h:328
auto successors(const MachineBasicBlock *BB)
LLVM_ABI std::error_code inconvertibleErrorCode()
The value returned by this function can be returned from convertToErrorCode for Error values where no...
Definition Error.cpp:94
testing::Matcher< const detail::ErrorHolder & > Failed()
Definition Error.h:198
constexpr from_range_t from_range
auto dyn_cast_if_present(const Y &Val)
dyn_cast_if_present<X> - Functionally identical to dyn_cast, except that a null (or none in the case ...
Definition Casting.h:732
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
LLVM_ABI BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, DebugLoc DL, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
std::string utostr(uint64_t X, bool isNeg=false)
void * PointerTy
ErrorOr< T > expectedToErrorOrAndEmitErrors(LLVMContext &Ctx, Expected< T > Val)
bool isa_and_nonnull(const Y &Val)
Definition Casting.h:676
LLVM_ABI bool convertUsersOfConstantsToInstructions(ArrayRef< Constant * > Consts, Function *RestrictToFunc=nullptr, bool RemoveDeadConstants=true, bool IncludeSelf=false)
Replace constant expressions users of the given constants with instructions.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
FunctionAddr VTableAddr uintptr_t uintptr_t Version
Definition InstrProf.h:334
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
TargetTransformInfo::PeelingPreferences gatherPeelingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, std::optional< bool > UserAllowPeeling, std::optional< bool > UserAllowProfileBasedPeeling, bool UnrollingSpecficValues=false)
LLVM_ABI void SplitBlockAndInsertIfThenElse(Value *Cond, BasicBlock::iterator SplitBefore, Instruction **ThenTerm, Instruction **ElseTerm, MDNode *BranchWeights=nullptr, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr)
SplitBlockAndInsertIfThenElse is similar to SplitBlockAndInsertIfThen, but also creates the ElseBlock...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
format_object< Ts... > format(const char *Fmt, const Ts &... Vals)
These are helper functions used to produce formatted output.
Definition Format.h:129
Error make_error(ArgTs &&... Args)
Make a Error instance representing failure using the given error info type.
Definition Error.h:340
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
TargetTransformInfo TTI
void cantFail(Error Err, const char *Msg=nullptr)
Report a fatal error if Err is a failure value.
Definition Error.h:769
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
LLVM_ABI bool MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU=nullptr, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, MemoryDependenceResults *MemDep=nullptr, bool PredecessorWithTwoSuccessors=false, DominatorTree *DT=nullptr)
Attempts to merge a block into its predecessor, if possible.
@ Mul
Product of integers.
@ Add
Sum of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
LLVM_ABI void remapInstructionsInBlocks(ArrayRef< BasicBlock * > Blocks, ValueToValueMapTy &VMap)
Remaps instructions in Blocks using the mapping in VMap.
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, llvm::OptimizationRemarkEmitter &ORE, int OptLevel, std::optional< unsigned > UserThreshold, std::optional< unsigned > UserCount, std::optional< bool > UserAllowPartial, std::optional< bool > UserRuntime, std::optional< bool > UserUpperBound, std::optional< unsigned > UserFullUnrollMaxCount)
Gather the various unrolling parameters based on the defaults, compiler flags, TTI overrides and user...
ValueMap< const Value *, WeakTrackingVH > ValueToValueMapTy
LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch, DebugLoc DL)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto predecessors(const MachineBasicBlock *BB)
PointerUnion< const Value *, const PseudoSourceValue * > ValueType
LLVM_ABI Constant * ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val, ArrayRef< unsigned > Idxs)
Attempt to constant fold an insertvalue instruction with the specified operands and indices.
@ Continue
Definition DWP.h:22
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void DeleteDeadBlocks(ArrayRef< BasicBlock * > BBs, DomTreeUpdater *DTU=nullptr, bool KeepOneInputPHIs=false)
Delete the specified blocks from BB.
bool to_integer(StringRef S, N &Num, unsigned Base=0)
Convert the string S to an integer of the specified type using the radix Base. If Base is 0,...
static auto filterDbgVars(iterator_range< simple_ilist< DbgRecord >::iterator > R)
Filter the DbgRecord range to DbgVariableRecord types only and downcast.
LLVM_ABI void computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, AssumptionCache *AC, ScalarEvolution &SE, const SmallPtrSetImpl< const Value * > &EphValues, OptimizationRemarkEmitter *ORE, unsigned TripCount, unsigned MaxTripCount, bool MaxOrZero, unsigned TripMultiple, const UnrollCostEstimator &UCE, TargetTransformInfo::UnrollingPreferences &UP, TargetTransformInfo::PeelingPreferences &PP)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static LLVM_ABI void collectEphemeralValues(const Loop *L, AssumptionCache *AC, SmallPtrSetImpl< const Value * > &EphValues)
Collect a loop's ephemeral values (those used only by an assume or similar intrinsics in the loop).
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
A struct to pack the relevant information for an OpenMP affinity clause.
a struct to pack relevant information while generating atomic Ops
Error mergeFiniBB(IRBuilderBase &Builder, BasicBlock *ExistingFiniBB)
For cases where there is an unavoidable existing finalization block (e.g.
Expected< BasicBlock * > getFiniBB(IRBuilderBase &Builder)
The basic block to which control should be transferred to implement the FiniCB.
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
This structure contains combined information generated for mappable clauses, including base pointers,...
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
LLVM_ABI void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
ReductionGenDataPtrPtrCBTy DataPtrPtrGen
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
ArrayRef< Value * > NumThreads
The number of threads.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
Value * NumIterations
The number of iterations.
Value * DynCGroupMem
The size of the dynamic shared memory.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback
The fallback mechanism for the shared memory.
Container to pass the default attributes with which a kernel must be launched, used to set kernel att...
Container to pass LLVM IR runtime values or constants related to the number of teams and threads with...
Value * DeviceID
Device ID value used in the kernel launch.
Value * MaxThreads
'parallel' construct 'num_threads' clause value, if present and it is an SPMD kernel.
Value * LoopTripCount
Total number of iterations of the SPMD or Generic-SPMD kernel or null if it is a generic kernel.
Data structure to contain the information needed to uniquely identify a target entry.
static LLVM_ABI void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
static const Target * lookupTarget(StringRef TripleStr, std::string &Error)
lookupTarget - Lookup a target based on a target triple.
Parameters that control the generic loop unrolling transformation.
unsigned Count
A forced unrolling factor (the number of concatenated bodies of the original loop in the unrolled loo...
unsigned Threshold
The cost threshold for the unrolled loop.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...