LLVM 20.0.0git
OMPIRBuilder.h
Go to the documentation of this file.
1//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the OpenMPIRBuilder class and helpers used as a convenient
10// way to create LLVM instructions for OpenMP directives.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16
20#include "llvm/IR/DebugLoc.h"
21#include "llvm/IR/IRBuilder.h"
22#include "llvm/IR/Module.h"
25#include <forward_list>
26#include <map>
27#include <optional>
28
29namespace llvm {
30class CanonicalLoopInfo;
31struct TargetRegionEntryInfo;
32class OffloadEntriesInfoManager;
33class OpenMPIRBuilder;
34
35/// Move the instruction after an InsertPoint to the beginning of another
36/// BasicBlock.
37///
38/// The instructions after \p IP are moved to the beginning of \p New which must
39/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
40/// \p New will be added such that there is no semantic change. Otherwise, the
41/// \p IP insert block remains degenerate and it is up to the caller to insert a
42/// terminator.
43void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
44 bool CreateBranch);
45
46/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
47/// insert location will stick to after the instruction before the insertion
48/// point (instead of moving with the instruction the InsertPoint stores
49/// internally).
50void spliceBB(IRBuilder<> &Builder, BasicBlock *New, bool CreateBranch);
51
52/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
53/// (missing the terminator).
54///
55/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
56/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
57/// is true, a branch to the new successor will new created such that
58/// semantically there is no change; otherwise the block of the insertion point
59/// remains degenerate and it is the caller's responsibility to insert a
60/// terminator. Returns the new successor block.
61BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
62 llvm::Twine Name = {});
63
64/// Split a BasicBlock at \p Builder's insertion point, even if the block is
65/// degenerate (missing the terminator). Its new insert location will stick to
66/// after the instruction before the insertion point (instead of moving with the
67/// instruction the InsertPoint stores internally).
68BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
69 llvm::Twine Name = {});
70
71/// Split a BasicBlock at \p Builder's insertion point, even if the block is
72/// degenerate (missing the terminator). Its new insert location will stick to
73/// after the instruction before the insertion point (instead of moving with the
74/// instruction the InsertPoint stores internally).
75BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch, llvm::Twine Name);
76
77/// Like splitBB, but reuses the current block's name for the new name.
78BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch,
79 llvm::Twine Suffix = ".split");
80
81/// Captures attributes that affect generating LLVM-IR using the
82/// OpenMPIRBuilder and related classes. Note that not all attributes are
83/// required for all classes or functions. In some use cases the configuration
84/// is not necessary at all, because because the only functions that are called
85/// are ones that are not dependent on the configuration.
87public:
88 /// Flag to define whether to generate code for the role of the OpenMP host
89 /// (if set to false) or device (if set to true) in an offloading context. It
90 /// is set when the -fopenmp-is-target-device compiler frontend option is
91 /// specified.
92 std::optional<bool> IsTargetDevice;
93
94 /// Flag for specifying if the compilation is done for an accelerator. It is
95 /// set according to the architecture of the target triple and currently only
96 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
97 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
98 /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
99 /// like target with the ability to work as the OpenMP host is added, or if
100 /// the capabilities of the currently supported GPU architectures are
101 /// expanded.
102 std::optional<bool> IsGPU;
103
104 /// Flag for specifying if LLVMUsed information should be emitted.
105 std::optional<bool> EmitLLVMUsedMetaInfo;
106
107 /// Flag for specifying if offloading is mandatory.
108 std::optional<bool> OpenMPOffloadMandatory;
109
110 /// First separator used between the initial two parts of a name.
111 std::optional<StringRef> FirstSeparator;
112 /// Separator used between all of the rest consecutive parts of s name
113 std::optional<StringRef> Separator;
114
115 // Grid Value for the GPU target
116 std::optional<omp::GV> GridValue;
117
118 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
119 /// false`), this contains the list of offloading triples associated, if any.
121
125 bool HasRequiresReverseOffload,
126 bool HasRequiresUnifiedAddress,
127 bool HasRequiresUnifiedSharedMemory,
128 bool HasRequiresDynamicAllocators);
129
130 // Getters functions that assert if the required values are not present.
131 bool isTargetDevice() const {
132 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
133 return *IsTargetDevice;
134 }
135
136 bool isGPU() const {
137 assert(IsGPU.has_value() && "IsGPU is not set");
138 return *IsGPU;
139 }
140
142 assert(OpenMPOffloadMandatory.has_value() &&
143 "OpenMPOffloadMandatory is not set");
145 }
146
148 assert(GridValue.has_value() && "GridValue is not set");
149 return *GridValue;
150 }
151
152 bool hasRequiresFlags() const { return RequiresFlags; }
153 bool hasRequiresReverseOffload() const;
154 bool hasRequiresUnifiedAddress() const;
156 bool hasRequiresDynamicAllocators() const;
157
158 /// Returns requires directive clauses as flags compatible with those expected
159 /// by libomptarget.
160 int64_t getRequiresFlags() const;
161
162 // Returns the FirstSeparator if set, otherwise use the default separator
163 // depending on isGPU
165 if (FirstSeparator.has_value())
166 return *FirstSeparator;
167 if (isGPU())
168 return "_";
169 return ".";
170 }
171
172 // Returns the Separator if set, otherwise use the default separator depending
173 // on isGPU
175 if (Separator.has_value())
176 return *Separator;
177 if (isGPU())
178 return "$";
179 return ".";
180 }
181
183 void setIsGPU(bool Value) { IsGPU = Value; }
189
194
195private:
196 /// Flags for specifying which requires directive clauses are present.
197 int64_t RequiresFlags;
198};
199
200/// Data structure to contain the information needed to uniquely identify
201/// a target entry.
203 /// The prefix used for kernel names.
204 static constexpr const char *KernelNamePrefix = "__omp_offloading_";
205
206 std::string ParentName;
207 unsigned DeviceID;
208 unsigned FileID;
209 unsigned Line;
210 unsigned Count;
211
214 unsigned FileID, unsigned Line, unsigned Count = 0)
216 Count(Count) {}
217
220 unsigned DeviceID, unsigned FileID,
221 unsigned Line, unsigned Count);
222
224 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
225 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
226 RHS.Count);
227 }
228};
229
230/// Class that manages information about offload code regions and data
232 /// Number of entries registered so far.
233 OpenMPIRBuilder *OMPBuilder;
234 unsigned OffloadingEntriesNum = 0;
235
236public:
237 /// Base class of the entries info.
239 public:
240 /// Kind of a given entry.
241 enum OffloadingEntryInfoKinds : unsigned {
242 /// Entry is a target region.
244 /// Entry is a declare target variable.
246 /// Invalid entry info.
248 };
249
250 protected:
252 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
253 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
254 uint32_t Flags)
255 : Flags(Flags), Order(Order), Kind(Kind) {}
256 ~OffloadEntryInfo() = default;
257
258 public:
259 bool isValid() const { return Order != ~0u; }
260 unsigned getOrder() const { return Order; }
261 OffloadingEntryInfoKinds getKind() const { return Kind; }
262 uint32_t getFlags() const { return Flags; }
263 void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
264 Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
266 assert(!Addr.pointsToAliveValue() && "Address has been set before!");
267 Addr = V;
268 }
269 static bool classof(const OffloadEntryInfo *Info) { return true; }
270
271 private:
272 /// Address of the entity that has to be mapped for offloading.
273 WeakTrackingVH Addr;
274
275 /// Flags associated with the device global.
276 uint32_t Flags = 0u;
277
278 /// Order this entry was emitted.
279 unsigned Order = ~0u;
280
282 };
283
284 /// Return true if a there are no entries defined.
285 bool empty() const;
286 /// Return number of entries defined so far.
287 unsigned size() const { return OffloadingEntriesNum; }
288
289 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
290
291 //
292 // Target region entries related.
293 //
294
295 /// Kind of the target registry entry.
297 /// Mark the entry as target region.
299 };
300
301 /// Target region entries info.
303 /// Address that can be used as the ID of the entry.
304 Constant *ID = nullptr;
305
306 public:
309 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
310 Constant *ID,
313 ID(ID) {
315 }
316
317 Constant *getID() const { return ID; }
318 void setID(Constant *V) {
319 assert(!ID && "ID has been set before!");
320 ID = V;
321 }
322 static bool classof(const OffloadEntryInfo *Info) {
323 return Info->getKind() == OffloadingEntryInfoTargetRegion;
324 }
325 };
326
327 /// Initialize target region entry.
328 /// This is ONLY needed for DEVICE compilation.
330 unsigned Order);
331 /// Register target region entry.
335 /// Return true if a target region entry with the provided information
336 /// exists.
338 bool IgnoreAddressId = false) const;
339
340 // Return the Name based on \a EntryInfo using the next available Count.
342 const TargetRegionEntryInfo &EntryInfo);
343
344 /// brief Applies action \a Action on all registered entries.
345 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
346 const OffloadEntryInfoTargetRegion &)>
348 void
350
351 //
352 // Device global variable entries related.
353 //
354
355 /// Kind of the global variable entry..
357 /// Mark the entry as a to declare target.
359 /// Mark the entry as a to declare target link.
361 /// Mark the entry as a declare target enter.
363 /// Mark the entry as having no declare target entry kind.
365 /// Mark the entry as a declare target indirect global.
367 /// Mark the entry as a register requires global.
369 };
370
371 /// Kind of device clause for declare target variables
372 /// and functions
373 /// NOTE: Currently not used as a part of a variable entry
374 /// used for Flang and Clang to interface with the variable
375 /// related registration functions
377 /// The target is marked for all devices
379 /// The target is marked for non-host devices
381 /// The target is marked for host devices
383 /// The target is marked as having no clause
385 };
386
387 /// Device global variable entries info.
389 /// Type of the global variable.
390 int64_t VarSize;
392 const std::string VarName;
393
394 public:
397 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order,
400 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
401 int64_t VarSize,
404 const std::string &VarName)
406 VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
408 }
409
410 int64_t getVarSize() const { return VarSize; }
411 StringRef getVarName() const { return VarName; }
412 void setVarSize(int64_t Size) { VarSize = Size; }
413 GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
414 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
415 static bool classof(const OffloadEntryInfo *Info) {
416 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
417 }
418 };
419
420 /// Initialize device global variable entry.
421 /// This is ONLY used for DEVICE compilation.
424 unsigned Order);
425
426 /// Register device global variable entry.
428 int64_t VarSize,
431 /// Checks if the variable with the given name has been registered already.
433 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
434 }
435 /// Applies action \a Action on all registered entries.
436 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
440
441private:
442 /// Return the count of entries at a particular source location.
443 unsigned
444 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
445
446 /// Update the count of entries at a particular source location.
447 void
448 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
449
451 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
452 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
453 EntryInfo.FileID, EntryInfo.Line, 0);
454 }
455
456 // Count of entries at a location.
457 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
458
459 // Storage for target region entries kind.
460 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
461 OffloadEntriesTargetRegionTy;
462 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
463 /// Storage for device global variable entries kind. The storage is to be
464 /// indexed by mangled name.
466 OffloadEntriesDeviceGlobalVarTy;
467 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
468};
469
470/// An interface to create LLVM-IR for OpenMP directives.
471///
472/// Each OpenMP directive has a corresponding public generator method.
474public:
475 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
476 /// not have an effect on \p M (see initialize)
478 : M(M), Builder(M.getContext()), OffloadInfoManager(this),
479 T(Triple(M.getTargetTriple())) {}
481
482 /// Initialize the internal state, this will put structures types and
483 /// potentially other helpers into the underlying module. Must be called
484 /// before any other method and only once! This internal state includes types
485 /// used in the OpenMPIRBuilder generated from OMPKinds.def.
486 void initialize();
487
489
490 /// Finalize the underlying module, e.g., by outlining regions.
491 /// \param Fn The function to be finalized. If not used,
492 /// all functions are finalized.
493 void finalize(Function *Fn = nullptr);
494
495 /// Add attributes known for \p FnID to \p Fn.
497
498 /// Type used throughout for insertion points.
500
501 /// Get the create a name using the platform specific separators.
502 /// \param Parts parts of the final name that needs separation
503 /// The created name has a first separator between the first and second part
504 /// and a second separator between all other parts.
505 /// E.g. with FirstSeparator "$" and Separator "." and
506 /// parts: "p1", "p2", "p3", "p4"
507 /// The resulting name is "p1$p2.p3.p4"
508 /// The separators are retrieved from the OpenMPIRBuilderConfig.
509 std::string createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
510
511 /// Callback type for variable finalization (think destructors).
512 ///
513 /// \param CodeGenIP is the insertion point at which the finalization code
514 /// should be placed.
515 ///
516 /// A finalize callback knows about all objects that need finalization, e.g.
517 /// destruction, when the scope of the currently generated construct is left
518 /// at the time, and location, the callback is invoked.
519 using FinalizeCallbackTy = std::function<void(InsertPointTy CodeGenIP)>;
520
522 /// The finalization callback provided by the last in-flight invocation of
523 /// createXXXX for the directive of kind DK.
525
526 /// The directive kind of the innermost directive that has an associated
527 /// region which might require finalization when it is left.
528 omp::Directive DK;
529
530 /// Flag to indicate if the directive is cancellable.
532 };
533
534 /// Push a finalization callback on the finalization stack.
535 ///
536 /// NOTE: Temporary solution until Clang CG is gone.
538 FinalizationStack.push_back(FI);
539 }
540
541 /// Pop the last finalization callback from the finalization stack.
542 ///
543 /// NOTE: Temporary solution until Clang CG is gone.
545
546 /// Callback type for body (=inner region) code generation
547 ///
548 /// The callback takes code locations as arguments, each describing a
549 /// location where additional instructions can be inserted.
550 ///
551 /// The CodeGenIP may be in the middle of a basic block or point to the end of
552 /// it. The basic block may have a terminator or be degenerate. The callback
553 /// function may just insert instructions at that position, but also split the
554 /// block (without the Before argument of BasicBlock::splitBasicBlock such
555 /// that the identify of the split predecessor block is preserved) and insert
556 /// additional control flow, including branches that do not lead back to what
557 /// follows the CodeGenIP. Note that since the callback is allowed to split
558 /// the block, callers must assume that InsertPoints to positions in the
559 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
560 /// such InsertPoints need to be preserved, it can split the block itself
561 /// before calling the callback.
562 ///
563 /// AllocaIP and CodeGenIP must not point to the same position.
564 ///
565 /// \param AllocaIP is the insertion point at which new alloca instructions
566 /// should be placed. The BasicBlock it is pointing to must
567 /// not be split.
568 /// \param CodeGenIP is the insertion point at which the body code should be
569 /// placed.
571 function_ref<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
572
573 // This is created primarily for sections construct as llvm::function_ref
574 // (BodyGenCallbackTy) is not storable (as described in the comments of
575 // function_ref class - function_ref contains non-ownable reference
576 // to the callable.
578 std::function<void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
579
580 /// Callback type for loop body code generation.
581 ///
582 /// \param CodeGenIP is the insertion point where the loop's body code must be
583 /// placed. This will be a dedicated BasicBlock with a
584 /// conditional branch from the loop condition check and
585 /// terminated with an unconditional branch to the loop
586 /// latch.
587 /// \param IndVar is the induction variable usable at the insertion point.
589 function_ref<void(InsertPointTy CodeGenIP, Value *IndVar)>;
590
591 /// Callback type for variable privatization (think copy & default
592 /// constructor).
593 ///
594 /// \param AllocaIP is the insertion point at which new alloca instructions
595 /// should be placed.
596 /// \param CodeGenIP is the insertion point at which the privatization code
597 /// should be placed.
598 /// \param Original The value being copied/created, should not be used in the
599 /// generated IR.
600 /// \param Inner The equivalent of \p Original that should be used in the
601 /// generated IR; this is equal to \p Original if the value is
602 /// a pointer and can thus be passed directly, otherwise it is
603 /// an equivalent but different value.
604 /// \param ReplVal The replacement value, thus a copy or new created version
605 /// of \p Inner.
606 ///
607 /// \returns The new insertion point where code generation continues and
608 /// \p ReplVal the replacement value.
610 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
611 Value &Inner, Value *&ReplVal)>;
612
613 /// Description of a LLVM-IR insertion point (IP) and a debug/source location
614 /// (filename, line, column, ...).
617 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
620 : IP(IP), DL(DL) {}
623 };
624
625 /// Emitter methods for OpenMP directives.
626 ///
627 ///{
628
629 /// Generator for '#omp barrier'
630 ///
631 /// \param Loc The location where the barrier directive was encountered.
632 /// \param Kind The kind of directive that caused the barrier.
633 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
634 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
635 /// should be checked and acted upon.
636 /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
637 ///
638 /// \returns The insertion point after the barrier.
640 omp::Directive Kind, bool ForceSimpleCall = false,
641 bool CheckCancelFlag = true);
642
643 /// Generator for '#omp cancel'
644 ///
645 /// \param Loc The location where the directive was encountered.
646 /// \param IfCondition The evaluated 'if' clause expression, if any.
647 /// \param CanceledDirective The kind of directive that is cancled.
648 ///
649 /// \returns The insertion point after the barrier.
650 InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition,
651 omp::Directive CanceledDirective);
652
653 /// Generator for '#omp parallel'
654 ///
655 /// \param Loc The insert and source location description.
656 /// \param AllocaIP The insertion points to be used for alloca instructions.
657 /// \param BodyGenCB Callback that will generate the region code.
658 /// \param PrivCB Callback to copy a given variable (think copy constructor).
659 /// \param FiniCB Callback to finalize variable copies.
660 /// \param IfCondition The evaluated 'if' clause expression, if any.
661 /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
662 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
663 /// \param IsCancellable Flag to indicate a cancellable parallel region.
664 ///
665 /// \returns The insertion position *after* the parallel.
668 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
669 FinalizeCallbackTy FiniCB, Value *IfCondition,
670 Value *NumThreads, omp::ProcBindKind ProcBind,
671 bool IsCancellable);
672
673 /// Generator for the control flow structure of an OpenMP canonical loop.
674 ///
675 /// This generator operates on the logical iteration space of the loop, i.e.
676 /// the caller only has to provide a loop trip count of the loop as defined by
677 /// base language semantics. The trip count is interpreted as an unsigned
678 /// integer. The induction variable passed to \p BodyGenCB will be of the same
679 /// type and run from 0 to \p TripCount - 1. It is up to the callback to
680 /// convert the logical iteration variable to the loop counter variable in the
681 /// loop body.
682 ///
683 /// \param Loc The insert and source location description. The insert
684 /// location can be between two instructions or the end of a
685 /// degenerate block (e.g. a BB under construction).
686 /// \param BodyGenCB Callback that will generate the loop body code.
687 /// \param TripCount Number of iterations the loop body is executed.
688 /// \param Name Base name used to derive BB and instruction names.
689 ///
690 /// \returns An object representing the created control flow structure which
691 /// can be used for loop-associated directives.
693 LoopBodyGenCallbackTy BodyGenCB,
694 Value *TripCount,
695 const Twine &Name = "loop");
696
697 /// Generator for the control flow structure of an OpenMP canonical loop.
698 ///
699 /// Instead of a logical iteration space, this allows specifying user-defined
700 /// loop counter values using increment, upper- and lower bounds. To
701 /// disambiguate the terminology when counting downwards, instead of lower
702 /// bounds we use \p Start for the loop counter value in the first body
703 /// iteration.
704 ///
705 /// Consider the following limitations:
706 ///
707 /// * A loop counter space over all integer values of its bit-width cannot be
708 /// represented. E.g using uint8_t, its loop trip count of 256 cannot be
709 /// stored into an 8 bit integer):
710 ///
711 /// DO I = 0, 255, 1
712 ///
713 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
714 /// effectively counting downwards:
715 ///
716 /// for (uint8_t i = 100u; i > 0; i += 127u)
717 ///
718 ///
719 /// TODO: May need to add additional parameters to represent:
720 ///
721 /// * Allow representing downcounting with unsigned integers.
722 ///
723 /// * Sign of the step and the comparison operator might disagree:
724 ///
725 /// for (int i = 0; i < 42; i -= 1u)
726 ///
727 //
728 /// \param Loc The insert and source location description.
729 /// \param BodyGenCB Callback that will generate the loop body code.
730 /// \param Start Value of the loop counter for the first iterations.
731 /// \param Stop Loop counter values past this will stop the loop.
732 /// \param Step Loop counter increment after each iteration; negative
733 /// means counting down.
734 /// \param IsSigned Whether Start, Stop and Step are signed integers.
735 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
736 /// counter.
737 /// \param ComputeIP Insertion point for instructions computing the trip
738 /// count. Can be used to ensure the trip count is available
739 /// at the outermost loop of a loop nest. If not set,
740 /// defaults to the preheader of the generated loop.
741 /// \param Name Base name used to derive BB and instruction names.
742 ///
743 /// \returns An object representing the created control flow structure which
744 /// can be used for loop-associated directives.
746 LoopBodyGenCallbackTy BodyGenCB,
747 Value *Start, Value *Stop, Value *Step,
748 bool IsSigned, bool InclusiveStop,
749 InsertPointTy ComputeIP = {},
750 const Twine &Name = "loop");
751
752 /// Collapse a loop nest into a single loop.
753 ///
754 /// Merges loops of a loop nest into a single CanonicalLoopNest representation
755 /// that has the same number of innermost loop iterations as the origin loop
756 /// nest. The induction variables of the input loops are derived from the
757 /// collapsed loop's induction variable. This is intended to be used to
758 /// implement OpenMP's collapse clause. Before applying a directive,
759 /// collapseLoops normalizes a loop nest to contain only a single loop and the
760 /// directive's implementation does not need to handle multiple loops itself.
761 /// This does not remove the need to handle all loop nest handling by
762 /// directives, such as the ordered(<n>) clause or the simd schedule-clause
763 /// modifier of the worksharing-loop directive.
764 ///
765 /// Example:
766 /// \code
767 /// for (int i = 0; i < 7; ++i) // Canonical loop "i"
768 /// for (int j = 0; j < 9; ++j) // Canonical loop "j"
769 /// body(i, j);
770 /// \endcode
771 ///
772 /// After collapsing with Loops={i,j}, the loop is changed to
773 /// \code
774 /// for (int ij = 0; ij < 63; ++ij) {
775 /// int i = ij / 9;
776 /// int j = ij % 9;
777 /// body(i, j);
778 /// }
779 /// \endcode
780 ///
781 /// In the current implementation, the following limitations apply:
782 ///
783 /// * All input loops have an induction variable of the same type.
784 ///
785 /// * The collapsed loop will have the same trip count integer type as the
786 /// input loops. Therefore it is possible that the collapsed loop cannot
787 /// represent all iterations of the input loops. For instance, assuming a
788 /// 32 bit integer type, and two input loops both iterating 2^16 times, the
789 /// theoretical trip count of the collapsed loop would be 2^32 iteration,
790 /// which cannot be represented in an 32-bit integer. Behavior is undefined
791 /// in this case.
792 ///
793 /// * The trip counts of every input loop must be available at \p ComputeIP.
794 /// Non-rectangular loops are not yet supported.
795 ///
796 /// * At each nest level, code between a surrounding loop and its nested loop
797 /// is hoisted into the loop body, and such code will be executed more
798 /// often than before collapsing (or not at all if any inner loop iteration
799 /// has a trip count of 0). This is permitted by the OpenMP specification.
800 ///
801 /// \param DL Debug location for instructions added for collapsing,
802 /// such as instructions to compute/derive the input loop's
803 /// induction variables.
804 /// \param Loops Loops in the loop nest to collapse. Loops are specified
805 /// from outermost-to-innermost and every control flow of a
806 /// loop's body must pass through its directly nested loop.
807 /// \param ComputeIP Where additional instruction that compute the collapsed
808 /// trip count. If not set, defaults to before the generated
809 /// loop.
810 ///
811 /// \returns The CanonicalLoopInfo object representing the collapsed loop.
814 InsertPointTy ComputeIP);
815
816 /// Get the default alignment value for given target
817 ///
818 /// \param TargetTriple Target triple
819 /// \param Features StringMap which describes extra CPU features
820 static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
821 const StringMap<bool> &Features);
822
823 /// Retrieve (or create if non-existent) the address of a declare
824 /// target variable, used in conjunction with registerTargetGlobalVariable
825 /// to create declare target global variables.
826 ///
827 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
828 /// clause used in conjunction with the variable being registered (link,
829 /// to, enter).
830 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
831 /// clause used in conjunction with the variable being registered (nohost,
832 /// host, any)
833 /// \param IsDeclaration - boolean stating if the variable being registered
834 /// is a declaration-only and not a definition
835 /// \param IsExternallyVisible - boolean stating if the variable is externally
836 /// visible
837 /// \param EntryInfo - Unique entry information for the value generated
838 /// using getTargetEntryUniqueInfo, used to name generated pointer references
839 /// to the declare target variable
840 /// \param MangledName - the mangled name of the variable being registered
841 /// \param GeneratedRefs - references generated by invocations of
842 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
843 /// these are required by Clang for book keeping.
844 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
845 /// \param TargetTriple - The OpenMP device target triple we are compiling
846 /// for
847 /// \param LlvmPtrTy - The type of the variable we are generating or
848 /// retrieving an address for
849 /// \param GlobalInitializer - a lambda function which creates a constant
850 /// used for initializing a pointer reference to the variable in certain
851 /// cases. If a nullptr is passed, it will default to utilising the original
852 /// variable to initialize the pointer reference.
853 /// \param VariableLinkage - a lambda function which returns the variables
854 /// linkage type, if unspecified and a nullptr is given, it will instead
855 /// utilise the linkage stored on the existing global variable in the
856 /// LLVMModule.
860 bool IsDeclaration, bool IsExternallyVisible,
861 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
862 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
863 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
864 std::function<Constant *()> GlobalInitializer,
865 std::function<GlobalValue::LinkageTypes()> VariableLinkage);
866
867 /// Registers a target variable for device or host.
868 ///
869 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
870 /// clause used in conjunction with the variable being registered (link,
871 /// to, enter).
872 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
873 /// clause used in conjunction with the variable being registered (nohost,
874 /// host, any)
875 /// \param IsDeclaration - boolean stating if the variable being registered
876 /// is a declaration-only and not a definition
877 /// \param IsExternallyVisible - boolean stating if the variable is externally
878 /// visible
879 /// \param EntryInfo - Unique entry information for the value generated
880 /// using getTargetEntryUniqueInfo, used to name generated pointer references
881 /// to the declare target variable
882 /// \param MangledName - the mangled name of the variable being registered
883 /// \param GeneratedRefs - references generated by invocations of
884 /// registerTargetGlobalVariable these are required by Clang for book
885 /// keeping.
886 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
887 /// \param TargetTriple - The OpenMP device target triple we are compiling
888 /// for
889 /// \param GlobalInitializer - a lambda function which creates a constant
890 /// used for initializing a pointer reference to the variable in certain
891 /// cases. If a nullptr is passed, it will default to utilising the original
892 /// variable to initialize the pointer reference.
893 /// \param VariableLinkage - a lambda function which returns the variables
894 /// linkage type, if unspecified and a nullptr is given, it will instead
895 /// utilise the linkage stored on the existing global variable in the
896 /// LLVMModule.
897 /// \param LlvmPtrTy - The type of the variable we are generating or
898 /// retrieving an address for
899 /// \param Addr - the original llvm value (addr) of the variable to be
900 /// registered
904 bool IsDeclaration, bool IsExternallyVisible,
905 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
906 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
907 std::vector<Triple> TargetTriple,
908 std::function<Constant *()> GlobalInitializer,
909 std::function<GlobalValue::LinkageTypes()> VariableLinkage,
910 Type *LlvmPtrTy, Constant *Addr);
911
912 /// Get the offset of the OMP_MAP_MEMBER_OF field.
913 unsigned getFlagMemberOffset();
914
915 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
916 /// the position given.
917 /// \param Position - A value indicating the position of the parent
918 /// of the member in the kernel argument structure, often retrieved
919 /// by the parents position in the combined information vectors used
920 /// to generate the structure itself. Multiple children (member's of)
921 /// with the same parent will use the same returned member flag.
923
924 /// Given an initial flag set, this function modifies it to contain
925 /// the passed in MemberOfFlag generated from the getMemberOfFlag
926 /// function. The results are dependent on the existing flag bits
927 /// set in the original flag set.
928 /// \param Flags - The original set of flags to be modified with the
929 /// passed in MemberOfFlag.
930 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
931 /// slightly based on the getMemberOfFlag which adjusts the flag bits
932 /// based on the members position in its parent.
934 omp::OpenMPOffloadMappingFlags MemberOfFlag);
935
936private:
937 /// Modifies the canonical loop to be a statically-scheduled workshare loop
938 /// which is executed on the device
939 ///
940 /// This takes a \p CLI representing a canonical loop, such as the one
941 /// created by \see createCanonicalLoop and emits additional instructions to
942 /// turn it into a workshare loop. In particular, it calls to an OpenMP
943 /// runtime function in the preheader to call OpenMP device rtl function
944 /// which handles worksharing of loop body interations.
945 ///
946 /// \param DL Debug location for instructions added for the
947 /// workshare-loop construct itself.
948 /// \param CLI A descriptor of the canonical loop to workshare.
949 /// \param AllocaIP An insertion point for Alloca instructions usable in the
950 /// preheader of the loop.
951 /// \param LoopType Information about type of loop worksharing.
952 /// It corresponds to type of loop workshare OpenMP pragma.
953 ///
954 /// \returns Point where to insert code after the workshare construct.
955 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
956 InsertPointTy AllocaIP,
957 omp::WorksharingLoopType LoopType);
958
959 /// Modifies the canonical loop to be a statically-scheduled workshare loop.
960 ///
961 /// This takes a \p LoopInfo representing a canonical loop, such as the one
962 /// created by \p createCanonicalLoop and emits additional instructions to
963 /// turn it into a workshare loop. In particular, it calls to an OpenMP
964 /// runtime function in the preheader to obtain the loop bounds to be used in
965 /// the current thread, updates the relevant instructions in the canonical
966 /// loop and calls to an OpenMP runtime finalization function after the loop.
967 ///
968 /// \param DL Debug location for instructions added for the
969 /// workshare-loop construct itself.
970 /// \param CLI A descriptor of the canonical loop to workshare.
971 /// \param AllocaIP An insertion point for Alloca instructions usable in the
972 /// preheader of the loop.
973 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
974 /// the loop.
975 ///
976 /// \returns Point where to insert code after the workshare construct.
977 InsertPointTy applyStaticWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
978 InsertPointTy AllocaIP,
979 bool NeedsBarrier);
980
981 /// Modifies the canonical loop a statically-scheduled workshare loop with a
982 /// user-specified chunk size.
983 ///
984 /// \param DL Debug location for instructions added for the
985 /// workshare-loop construct itself.
986 /// \param CLI A descriptor of the canonical loop to workshare.
987 /// \param AllocaIP An insertion point for Alloca instructions usable in
988 /// the preheader of the loop.
989 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
990 /// loop.
991 /// \param ChunkSize The user-specified chunk size.
992 ///
993 /// \returns Point where to insert code after the workshare construct.
994 InsertPointTy applyStaticChunkedWorkshareLoop(DebugLoc DL,
996 InsertPointTy AllocaIP,
997 bool NeedsBarrier,
998 Value *ChunkSize);
999
1000 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1001 ///
1002 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1003 /// created by \p createCanonicalLoop and emits additional instructions to
1004 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1005 /// runtime function in the preheader to obtain, and then in each iteration
1006 /// to update the loop counter.
1007 ///
1008 /// \param DL Debug location for instructions added for the
1009 /// workshare-loop construct itself.
1010 /// \param CLI A descriptor of the canonical loop to workshare.
1011 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1012 /// preheader of the loop.
1013 /// \param SchedType Type of scheduling to be passed to the init function.
1014 /// \param NeedsBarrier Indicates whether a barrier must be insterted after
1015 /// the loop.
1016 /// \param Chunk The size of loop chunk considered as a unit when
1017 /// scheduling. If \p nullptr, defaults to 1.
1018 ///
1019 /// \returns Point where to insert code after the workshare construct.
1020 InsertPointTy applyDynamicWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI,
1021 InsertPointTy AllocaIP,
1022 omp::OMPScheduleType SchedType,
1023 bool NeedsBarrier,
1024 Value *Chunk = nullptr);
1025
1026 /// Create alternative version of the loop to support if clause
1027 ///
1028 /// OpenMP if clause can require to generate second loop. This loop
1029 /// will be executed when if clause condition is not met. createIfVersion
1030 /// adds branch instruction to the copied loop if \p ifCond is not met.
1031 ///
1032 /// \param Loop Original loop which should be versioned.
1033 /// \param IfCond Value which corresponds to if clause condition
1034 /// \param VMap Value to value map to define relation between
1035 /// original and copied loop values and loop blocks.
1036 /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1037 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1038 ValueToValueMapTy &VMap, const Twine &NamePrefix = "");
1039
1040public:
1041 /// Modifies the canonical loop to be a workshare loop.
1042 ///
1043 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1044 /// created by \p createCanonicalLoop and emits additional instructions to
1045 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1046 /// runtime function in the preheader to obtain the loop bounds to be used in
1047 /// the current thread, updates the relevant instructions in the canonical
1048 /// loop and calls to an OpenMP runtime finalization function after the loop.
1049 ///
1050 /// The concrete transformation is done by applyStaticWorkshareLoop,
1051 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1052 /// on the value of \p SchedKind and \p ChunkSize.
1053 ///
1054 /// \param DL Debug location for instructions added for the
1055 /// workshare-loop construct itself.
1056 /// \param CLI A descriptor of the canonical loop to workshare.
1057 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1058 /// preheader of the loop.
1059 /// \param NeedsBarrier Indicates whether a barrier must be insterted after
1060 /// the loop.
1061 /// \param SchedKind Scheduling algorithm to use.
1062 /// \param ChunkSize The chunk size for the inner loop.
1063 /// \param HasSimdModifier Whether the simd modifier is present in the
1064 /// schedule clause.
1065 /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1066 /// the schedule clause.
1067 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1068 /// present in the schedule clause.
1069 /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1070 /// present.
1071 /// \param LoopType Information about type of loop worksharing.
1072 /// It corresponds to type of loop workshare OpenMP pragma.
1073 ///
1074 /// \returns Point where to insert code after the workshare construct.
1077 bool NeedsBarrier,
1078 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1079 Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1080 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1081 bool HasOrderedClause = false,
1082 omp::WorksharingLoopType LoopType =
1084
1085 /// Tile a loop nest.
1086 ///
1087 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1088 /// \p/ Loops must be perfectly nested, from outermost to innermost loop
1089 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1090 /// of every loop and every tile sizes must be usable in the outermost
1091 /// loop's preheader. This implies that the loop nest is rectangular.
1092 ///
1093 /// Example:
1094 /// \code
1095 /// for (int i = 0; i < 15; ++i) // Canonical loop "i"
1096 /// for (int j = 0; j < 14; ++j) // Canonical loop "j"
1097 /// body(i, j);
1098 /// \endcode
1099 ///
1100 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1101 /// \code
1102 /// for (int i1 = 0; i1 < 3; ++i1)
1103 /// for (int j1 = 0; j1 < 2; ++j1)
1104 /// for (int i2 = 0; i2 < 5; ++i2)
1105 /// for (int j2 = 0; j2 < 7; ++j2)
1106 /// body(i1*3+i2, j1*3+j2);
1107 /// \endcode
1108 ///
1109 /// The returned vector are the loops {i1,j1,i2,j2}. The loops i1 and j1 are
1110 /// referred to the floor, and the loops i2 and j2 are the tiles. Tiling also
1111 /// handles non-constant trip counts, non-constant tile sizes and trip counts
1112 /// that are not multiples of the tile size. In the latter case the tile loop
1113 /// of the last floor-loop iteration will have fewer iterations than specified
1114 /// as its tile size.
1115 ///
1116 ///
1117 /// @param DL Debug location for instructions added by tiling, for
1118 /// instance the floor- and tile trip count computation.
1119 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are
1120 /// invalidated by this method, i.e. should not used after
1121 /// tiling.
1122 /// @param TileSizes For each loop in \p Loops, the tile size for that
1123 /// dimensions.
1124 ///
1125 /// \returns A list of generated loops. Contains twice as many loops as the
1126 /// input loop nest; the first half are the floor loops and the
1127 /// second half are the tile loops.
1128 std::vector<CanonicalLoopInfo *>
1130 ArrayRef<Value *> TileSizes);
1131
1132 /// Fully unroll a loop.
1133 ///
1134 /// Instead of unrolling the loop immediately (and duplicating its body
1135 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1136 /// metadata.
1137 ///
1138 /// \param DL Debug location for instructions added by unrolling.
1139 /// \param Loop The loop to unroll. The loop will be invalidated.
1141
1142 /// Fully or partially unroll a loop. How the loop is unrolled is determined
1143 /// using LLVM's LoopUnrollPass.
1144 ///
1145 /// \param DL Debug location for instructions added by unrolling.
1146 /// \param Loop The loop to unroll. The loop will be invalidated.
1148
1149 /// Partially unroll a loop.
1150 ///
1151 /// The CanonicalLoopInfo of the unrolled loop for use with chained
1152 /// loop-associated directive can be requested using \p UnrolledCLI. Not
1153 /// needing the CanonicalLoopInfo allows more efficient code generation by
1154 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1155 /// A loop-associated directive applied to the unrolled loop needs to know the
1156 /// new trip count which means that if using a heuristically determined unroll
1157 /// factor (\p Factor == 0), that factor must be computed immediately. We are
1158 /// using the same logic as the LoopUnrollPass to derived the unroll factor,
1159 /// but which assumes that some canonicalization has taken place (e.g.
1160 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1161 /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1162 ///
1163 /// \param DL Debug location for instructions added by unrolling.
1164 /// \param Loop The loop to unroll. The loop will be invalidated.
1165 /// \param Factor The factor to unroll the loop by. A factor of 0
1166 /// indicates that a heuristic should be used to determine
1167 /// the unroll-factor.
1168 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1169 /// partially unrolled loop. Otherwise, uses loop metadata
1170 /// to defer unrolling to the LoopUnrollPass.
1171 void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor,
1172 CanonicalLoopInfo **UnrolledCLI);
1173
1174 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
1175 /// is cloned. The metadata which prevents vectorization is added to
1176 /// to the cloned loop. The cloned loop is executed when ifCond is evaluated
1177 /// to false.
1178 ///
1179 /// \param Loop The loop to simd-ize.
1180 /// \param AlignedVars The map which containts pairs of the pointer
1181 /// and its corresponding alignment.
1182 /// \param IfCond The value which corresponds to the if clause
1183 /// condition.
1184 /// \param Order The enum to map order clause.
1185 /// \param Simdlen The Simdlen length to apply to the simd loop.
1186 /// \param Safelen The Safelen length to apply to the simd loop.
1188 MapVector<Value *, Value *> AlignedVars, Value *IfCond,
1189 omp::OrderKind Order, ConstantInt *Simdlen,
1190 ConstantInt *Safelen);
1191
1192 /// Generator for '#omp flush'
1193 ///
1194 /// \param Loc The location where the flush directive was encountered
1195 void createFlush(const LocationDescription &Loc);
1196
1197 /// Generator for '#omp taskwait'
1198 ///
1199 /// \param Loc The location where the taskwait directive was encountered.
1200 void createTaskwait(const LocationDescription &Loc);
1201
1202 /// Generator for '#omp taskyield'
1203 ///
1204 /// \param Loc The location where the taskyield directive was encountered.
1205 void createTaskyield(const LocationDescription &Loc);
1206
1207 /// A struct to pack the relevant information for an OpenMP depend clause.
1208 struct DependData {
1212 explicit DependData() = default;
1214 Value *DepVal)
1216 };
1217
1218 /// Generator for `#omp task`
1219 ///
1220 /// \param Loc The location where the task construct was encountered.
1221 /// \param AllocaIP The insertion point to be used for alloca instructions.
1222 /// \param BodyGenCB Callback that will generate the region code.
1223 /// \param Tied True if the task is tied, false if the task is untied.
1224 /// \param Final i1 value which is `true` if the task is final, `false` if the
1225 /// task is not final.
1226 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1227 /// task is generated, and the encountering thread must
1228 /// suspend the current task region, for which execution
1229 /// cannot be resumed until execution of the structured
1230 /// block that is associated with the generated task is
1231 /// completed.
1232 InsertPointTy createTask(const LocationDescription &Loc,
1233 InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB,
1234 bool Tied = true, Value *Final = nullptr,
1235 Value *IfCondition = nullptr,
1236 SmallVector<DependData> Dependencies = {});
1237
1238 /// Generator for the taskgroup construct
1239 ///
1240 /// \param Loc The location where the taskgroup construct was encountered.
1241 /// \param AllocaIP The insertion point to be used for alloca instructions.
1242 /// \param BodyGenCB Callback that will generate the region code.
1243 InsertPointTy createTaskgroup(const LocationDescription &Loc,
1244 InsertPointTy AllocaIP,
1245 BodyGenCallbackTy BodyGenCB);
1246
1248 std::function<std::tuple<std::string, uint64_t>()>;
1249
1250 /// Creates a unique info for a target entry when provided a filename and
1251 /// line number from.
1252 ///
1253 /// \param CallBack A callback function which should return filename the entry
1254 /// resides in as well as the line number for the target entry
1255 /// \param ParentName The name of the parent the target entry resides in, if
1256 /// any.
1259 StringRef ParentName = "");
1260
1261 /// Enum class for the RedctionGen CallBack type to be used.
1263
1264 /// ReductionGen CallBack for Clang
1265 ///
1266 /// \param CodeGenIP InsertPoint for CodeGen.
1267 /// \param Index Index of the ReductionInfo to generate code for.
1268 /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
1269 /// codegen, used for fixup later.
1270 /// \param RHSPtr Optionally used by Clang to
1271 /// return the RHSPtr it used for codegen, used for fixup later.
1272 /// \param CurFn Optionally used by Clang to pass in the Current Function as
1273 /// Clang context may be old.
1275 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1276 Value **LHS, Value **RHS, Function *CurFn)>;
1277
1278 /// ReductionGen CallBack for MLIR
1279 ///
1280 /// \param CodeGenIP InsertPoint for CodeGen.
1281 /// \param LHS Pass in the LHS Value to be used for CodeGen.
1282 /// \param RHS Pass in the RHS Value to be used for CodeGen.
1283 using ReductionGenCBTy = std::function<InsertPointTy(
1284 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1285
1286 /// Functions used to generate atomic reductions. Such functions take two
1287 /// Values representing pointers to LHS and RHS of the reduction, as well as
1288 /// the element type of these pointers. They are expected to atomically
1289 /// update the LHS to the reduced value.
1291 std::function<InsertPointTy(InsertPointTy, Type *, Value *, Value *)>;
1292
1293 /// Enum class for reduction evaluation types scalar, complex and aggregate.
1295
1296 /// Information about an OpenMP reduction.
1307 : ElementType(nullptr), Variable(nullptr),
1310
1311 /// Reduction element type, must match pointee type of variable.
1313
1314 /// Reduction variable of pointer type.
1316
1317 /// Thread-private partial reduction variable.
1319
1320 /// Reduction evaluation kind - scalar, complex or aggregate.
1322
1323 /// Callback for generating the reduction body. The IR produced by this will
1324 /// be used to combine two values in a thread-safe context, e.g., under
1325 /// lock or within the same thread, and therefore need not be atomic.
1327
1328 /// Clang callback for generating the reduction body. The IR produced by
1329 /// this will be used to combine two values in a thread-safe context, e.g.,
1330 /// under lock or within the same thread, and therefore need not be atomic.
1332
1333 /// Callback for generating the atomic reduction body, may be null. The IR
1334 /// produced by this will be used to atomically combine two values during
1335 /// reduction. If null, the implementation will use the non-atomic version
1336 /// along with the appropriate synchronization mechanisms.
1338 };
1339
1340 enum class CopyAction : unsigned {
1341 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1342 // the warp using shuffle instructions.
1344 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1345 ThreadCopy,
1346 };
1347
1352 };
1353
1354 /// Supporting functions for Reductions CodeGen.
1355private:
1356 /// Emit the llvm.used metadata.
1357 void emitUsed(StringRef Name, std::vector<llvm::WeakTrackingVH> &List);
1358
1359 /// Get the id of the current thread on the GPU.
1360 Value *getGPUThreadID();
1361
1362 /// Get the GPU warp size.
1363 Value *getGPUWarpSize();
1364
1365 /// Get the id of the warp in the block.
1366 /// We assume that the warp size is 32, which is always the case
1367 /// on the NVPTX device, to generate more efficient code.
1368 Value *getNVPTXWarpID();
1369
1370 /// Get the id of the current lane in the Warp.
1371 /// We assume that the warp size is 32, which is always the case
1372 /// on the NVPTX device, to generate more efficient code.
1373 Value *getNVPTXLaneID();
1374
1375 /// Cast value to the specified type.
1376 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1377
1378 /// This function creates calls to one of two shuffle functions to copy
1379 /// variables between lanes in a warp.
1380 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1381 Type *ElementType, Value *Offset);
1382
1383 /// Function to shuffle over the value from the remote lane.
1384 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1385 Type *ElementType, Value *Offset,
1386 Type *ReductionArrayTy);
1387
1388 /// Emit instructions to copy a Reduce list, which contains partially
1389 /// aggregated values, in the specified direction.
1390 void emitReductionListCopy(
1391 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1392 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1393 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1394
1395 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1396 /// in the same warp. It uses shuffle instructions to copy over data from
1397 /// a remote lane's stack. The reduction algorithm performed is specified
1398 /// by the fourth parameter.
1399 ///
1400 /// Algorithm Versions.
1401 /// Full Warp Reduce (argument value 0):
1402 /// This algorithm assumes that all 32 lanes are active and gathers
1403 /// data from these 32 lanes, producing a single resultant value.
1404 /// Contiguous Partial Warp Reduce (argument value 1):
1405 /// This algorithm assumes that only a *contiguous* subset of lanes
1406 /// are active. This happens for the last warp in a parallel region
1407 /// when the user specified num_threads is not an integer multiple of
1408 /// 32. This contiguous subset always starts with the zeroth lane.
1409 /// Partial Warp Reduce (argument value 2):
1410 /// This algorithm gathers data from any number of lanes at any position.
1411 /// All reduced values are stored in the lowest possible lane. The set
1412 /// of problems every algorithm addresses is a super set of those
1413 /// addressable by algorithms with a lower version number. Overhead
1414 /// increases as algorithm version increases.
1415 ///
1416 /// Terminology
1417 /// Reduce element:
1418 /// Reduce element refers to the individual data field with primitive
1419 /// data types to be combined and reduced across threads.
1420 /// Reduce list:
1421 /// Reduce list refers to a collection of local, thread-private
1422 /// reduce elements.
1423 /// Remote Reduce list:
1424 /// Remote Reduce list refers to a collection of remote (relative to
1425 /// the current thread) reduce elements.
1426 ///
1427 /// We distinguish between three states of threads that are important to
1428 /// the implementation of this function.
1429 /// Alive threads:
1430 /// Threads in a warp executing the SIMT instruction, as distinguished from
1431 /// threads that are inactive due to divergent control flow.
1432 /// Active threads:
1433 /// The minimal set of threads that has to be alive upon entry to this
1434 /// function. The computation is correct iff active threads are alive.
1435 /// Some threads are alive but they are not active because they do not
1436 /// contribute to the computation in any useful manner. Turning them off
1437 /// may introduce control flow overheads without any tangible benefits.
1438 /// Effective threads:
1439 /// In order to comply with the argument requirements of the shuffle
1440 /// function, we must keep all lanes holding data alive. But at most
1441 /// half of them perform value aggregation; we refer to this half of
1442 /// threads as effective. The other half is simply handing off their
1443 /// data.
1444 ///
1445 /// Procedure
1446 /// Value shuffle:
1447 /// In this step active threads transfer data from higher lane positions
1448 /// in the warp to lower lane positions, creating Remote Reduce list.
1449 /// Value aggregation:
1450 /// In this step, effective threads combine their thread local Reduce list
1451 /// with Remote Reduce list and store the result in the thread local
1452 /// Reduce list.
1453 /// Value copy:
1454 /// In this step, we deal with the assumption made by algorithm 2
1455 /// (i.e. contiguity assumption). When we have an odd number of lanes
1456 /// active, say 2k+1, only k threads will be effective and therefore k
1457 /// new values will be produced. However, the Reduce list owned by the
1458 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1459 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1460 /// that the contiguity assumption still holds.
1461 ///
1462 /// \param ReductionInfos Array type containing the ReductionOps.
1463 /// \param ReduceFn The reduction function.
1464 /// \param FuncAttrs Optional param to specify any function attributes that
1465 /// need to be copied to the new function.
1466 ///
1467 /// \return The ShuffleAndReduce function.
1468 Function *emitShuffleAndReduceFunction(
1470 Function *ReduceFn, AttributeList FuncAttrs);
1471
1472 /// This function emits a helper that gathers Reduce lists from the first
1473 /// lane of every active warp to lanes in the first warp.
1474 ///
1475 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1476 /// shared smem[warp_size];
1477 /// For all data entries D in reduce_data:
1478 /// sync
1479 /// If (I am the first lane in each warp)
1480 /// Copy my local D to smem[warp_id]
1481 /// sync
1482 /// if (I am the first warp)
1483 /// Copy smem[thread_id] to my local D
1484 ///
1485 /// \param Loc The insert and source location description.
1486 /// \param ReductionInfos Array type containing the ReductionOps.
1487 /// \param FuncAttrs Optional param to specify any function attributes that
1488 /// need to be copied to the new function.
1489 ///
1490 /// \return The InterWarpCopy function.
1491 Function *emitInterWarpCopyFunction(const LocationDescription &Loc,
1492 ArrayRef<ReductionInfo> ReductionInfos,
1493 AttributeList FuncAttrs);
1494
1495 /// This function emits a helper that copies all the reduction variables from
1496 /// the team into the provided global buffer for the reduction variables.
1497 ///
1498 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1499 /// For all data entries D in reduce_data:
1500 /// Copy local D to buffer.D[Idx]
1501 ///
1502 /// \param ReductionInfos Array type containing the ReductionOps.
1503 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1504 /// \param FuncAttrs Optional param to specify any function attributes that
1505 /// need to be copied to the new function.
1506 ///
1507 /// \return The ListToGlobalCopy function.
1508 Function *emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1509 Type *ReductionsBufferTy,
1510 AttributeList FuncAttrs);
1511
1512 /// This function emits a helper that copies all the reduction variables from
1513 /// the team into the provided global buffer for the reduction variables.
1514 ///
1515 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1516 /// For all data entries D in reduce_data:
1517 /// Copy buffer.D[Idx] to local D;
1518 ///
1519 /// \param ReductionInfos Array type containing the ReductionOps.
1520 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1521 /// \param FuncAttrs Optional param to specify any function attributes that
1522 /// need to be copied to the new function.
1523 ///
1524 /// \return The GlobalToList function.
1525 Function *emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1526 Type *ReductionsBufferTy,
1527 AttributeList FuncAttrs);
1528
1529 /// This function emits a helper that reduces all the reduction variables from
1530 /// the team into the provided global buffer for the reduction variables.
1531 ///
1532 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1533 /// void *GlobPtrs[];
1534 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1535 /// ...
1536 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1537 /// reduce_function(GlobPtrs, reduce_data);
1538 ///
1539 /// \param ReductionInfos Array type containing the ReductionOps.
1540 /// \param ReduceFn The reduction function.
1541 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1542 /// \param FuncAttrs Optional param to specify any function attributes that
1543 /// need to be copied to the new function.
1544 ///
1545 /// \return The ListToGlobalReduce function.
1546 Function *
1547 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1548 Function *ReduceFn, Type *ReductionsBufferTy,
1549 AttributeList FuncAttrs);
1550
1551 /// This function emits a helper that reduces all the reduction variables from
1552 /// the team into the provided global buffer for the reduction variables.
1553 ///
1554 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1555 /// void *GlobPtrs[];
1556 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1557 /// ...
1558 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1559 /// reduce_function(reduce_data, GlobPtrs);
1560 ///
1561 /// \param ReductionInfos Array type containing the ReductionOps.
1562 /// \param ReduceFn The reduction function.
1563 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1564 /// \param FuncAttrs Optional param to specify any function attributes that
1565 /// need to be copied to the new function.
1566 ///
1567 /// \return The GlobalToListReduce function.
1568 Function *
1569 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1570 Function *ReduceFn, Type *ReductionsBufferTy,
1571 AttributeList FuncAttrs);
1572
1573 /// Get the function name of a reduction function.
1574 std::string getReductionFuncName(StringRef Name) const;
1575
1576 /// Emits reduction function.
1577 /// \param ReducerName Name of the function calling the reduction.
1578 /// \param ReductionInfos Array type containing the ReductionOps.
1579 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1580 /// CodeGenCB kind.
1581 /// \param FuncAttrs Optional param to specify any function attributes that
1582 /// need to be copied to the new function.
1583 ///
1584 /// \return The reduction function.
1585 Function *createReductionFunction(
1586 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1588 AttributeList FuncAttrs = {});
1589
1590public:
1591 ///
1592 /// Design of OpenMP reductions on the GPU
1593 ///
1594 /// Consider a typical OpenMP program with one or more reduction
1595 /// clauses:
1596 ///
1597 /// float foo;
1598 /// double bar;
1599 /// #pragma omp target teams distribute parallel for \
1600 /// reduction(+:foo) reduction(*:bar)
1601 /// for (int i = 0; i < N; i++) {
1602 /// foo += A[i]; bar *= B[i];
1603 /// }
1604 ///
1605 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1606 /// all teams. In our OpenMP implementation on the NVPTX device an
1607 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1608 /// within a team are mapped to CUDA threads within a threadblock.
1609 /// Our goal is to efficiently aggregate values across all OpenMP
1610 /// threads such that:
1611 ///
1612 /// - the compiler and runtime are logically concise, and
1613 /// - the reduction is performed efficiently in a hierarchical
1614 /// manner as follows: within OpenMP threads in the same warp,
1615 /// across warps in a threadblock, and finally across teams on
1616 /// the NVPTX device.
1617 ///
1618 /// Introduction to Decoupling
1619 ///
1620 /// We would like to decouple the compiler and the runtime so that the
1621 /// latter is ignorant of the reduction variables (number, data types)
1622 /// and the reduction operators. This allows a simpler interface
1623 /// and implementation while still attaining good performance.
1624 ///
1625 /// Pseudocode for the aforementioned OpenMP program generated by the
1626 /// compiler is as follows:
1627 ///
1628 /// 1. Create private copies of reduction variables on each OpenMP
1629 /// thread: 'foo_private', 'bar_private'
1630 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1631 /// to it and writes the result in 'foo_private' and 'bar_private'
1632 /// respectively.
1633 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1634 /// and store the result on the team master:
1635 ///
1636 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
1637 /// reduceData, shuffleReduceFn, interWarpCpyFn)
1638 ///
1639 /// where:
1640 /// struct ReduceData {
1641 /// double *foo;
1642 /// double *bar;
1643 /// } reduceData
1644 /// reduceData.foo = &foo_private
1645 /// reduceData.bar = &bar_private
1646 ///
1647 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1648 /// auxiliary functions generated by the compiler that operate on
1649 /// variables of type 'ReduceData'. They aid the runtime perform
1650 /// algorithmic steps in a data agnostic manner.
1651 ///
1652 /// 'shuffleReduceFn' is a pointer to a function that reduces data
1653 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
1654 /// same warp. It takes the following arguments as input:
1655 ///
1656 /// a. variable of type 'ReduceData' on the calling lane,
1657 /// b. its lane_id,
1658 /// c. an offset relative to the current lane_id to generate a
1659 /// remote_lane_id. The remote lane contains the second
1660 /// variable of type 'ReduceData' that is to be reduced.
1661 /// d. an algorithm version parameter determining which reduction
1662 /// algorithm to use.
1663 ///
1664 /// 'shuffleReduceFn' retrieves data from the remote lane using
1665 /// efficient GPU shuffle intrinsics and reduces, using the
1666 /// algorithm specified by the 4th parameter, the two operands
1667 /// element-wise. The result is written to the first operand.
1668 ///
1669 /// Different reduction algorithms are implemented in different
1670 /// runtime functions, all calling 'shuffleReduceFn' to perform
1671 /// the essential reduction step. Therefore, based on the 4th
1672 /// parameter, this function behaves slightly differently to
1673 /// cooperate with the runtime to ensure correctness under
1674 /// different circumstances.
1675 ///
1676 /// 'InterWarpCpyFn' is a pointer to a function that transfers
1677 /// reduced variables across warps. It tunnels, through CUDA
1678 /// shared memory, the thread-private data of type 'ReduceData'
1679 /// from lane 0 of each warp to a lane in the first warp.
1680 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
1681 /// The last team writes the global reduced value to memory.
1682 ///
1683 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
1684 /// reduceData, shuffleReduceFn, interWarpCpyFn,
1685 /// scratchpadCopyFn, loadAndReduceFn)
1686 ///
1687 /// 'scratchpadCopyFn' is a helper that stores reduced
1688 /// data from the team master to a scratchpad array in
1689 /// global memory.
1690 ///
1691 /// 'loadAndReduceFn' is a helper that loads data from
1692 /// the scratchpad array and reduces it with the input
1693 /// operand.
1694 ///
1695 /// These compiler generated functions hide address
1696 /// calculation and alignment information from the runtime.
1697 /// 5. if ret == 1:
1698 /// The team master of the last team stores the reduced
1699 /// result to the globals in memory.
1700 /// foo += reduceData.foo; bar *= reduceData.bar
1701 ///
1702 ///
1703 /// Warp Reduction Algorithms
1704 ///
1705 /// On the warp level, we have three algorithms implemented in the
1706 /// OpenMP runtime depending on the number of active lanes:
1707 ///
1708 /// Full Warp Reduction
1709 ///
1710 /// The reduce algorithm within a warp where all lanes are active
1711 /// is implemented in the runtime as follows:
1712 ///
1713 /// full_warp_reduce(void *reduce_data,
1714 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1715 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1716 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
1717 /// }
1718 ///
1719 /// The algorithm completes in log(2, WARPSIZE) steps.
1720 ///
1721 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1722 /// not used therefore we save instructions by not retrieving lane_id
1723 /// from the corresponding special registers. The 4th parameter, which
1724 /// represents the version of the algorithm being used, is set to 0 to
1725 /// signify full warp reduction.
1726 ///
1727 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1728 ///
1729 /// #reduce_elem refers to an element in the local lane's data structure
1730 /// #remote_elem is retrieved from a remote lane
1731 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1732 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1733 ///
1734 /// Contiguous Partial Warp Reduction
1735 ///
1736 /// This reduce algorithm is used within a warp where only the first
1737 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1738 /// number of OpenMP threads in a parallel region is not a multiple of
1739 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
1740 ///
1741 /// void
1742 /// contiguous_partial_reduce(void *reduce_data,
1743 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1744 /// int size, int lane_id) {
1745 /// int curr_size;
1746 /// int offset;
1747 /// curr_size = size;
1748 /// mask = curr_size/2;
1749 /// while (offset>0) {
1750 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1751 /// curr_size = (curr_size+1)/2;
1752 /// offset = curr_size/2;
1753 /// }
1754 /// }
1755 ///
1756 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1757 ///
1758 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1759 /// if (lane_id < offset)
1760 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1761 /// else
1762 /// reduce_elem = remote_elem
1763 ///
1764 /// This algorithm assumes that the data to be reduced are located in a
1765 /// contiguous subset of lanes starting from the first. When there is
1766 /// an odd number of active lanes, the data in the last lane is not
1767 /// aggregated with any other lane's dat but is instead copied over.
1768 ///
1769 /// Dispersed Partial Warp Reduction
1770 ///
1771 /// This algorithm is used within a warp when any discontiguous subset of
1772 /// lanes are active. It is used to implement the reduction operation
1773 /// across lanes in an OpenMP simd region or in a nested parallel region.
1774 ///
1775 /// void
1776 /// dispersed_partial_reduce(void *reduce_data,
1777 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1778 /// int size, remote_id;
1779 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1780 /// do {
1781 /// remote_id = next_active_lane_id_right_after_me();
1782 /// # the above function returns 0 of no active lane
1783 /// # is present right after the current lane.
1784 /// size = number_of_active_lanes_in_this_warp();
1785 /// logical_lane_id /= 2;
1786 /// ShuffleReduceFn(reduce_data, logical_lane_id,
1787 /// remote_id-1-threadIdx.x, 2);
1788 /// } while (logical_lane_id % 2 == 0 && size > 1);
1789 /// }
1790 ///
1791 /// There is no assumption made about the initial state of the reduction.
1792 /// Any number of lanes (>=1) could be active at any position. The reduction
1793 /// result is returned in the first active lane.
1794 ///
1795 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1796 ///
1797 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1798 /// if (lane_id % 2 == 0 && offset > 0)
1799 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
1800 /// else
1801 /// reduce_elem = remote_elem
1802 ///
1803 ///
1804 /// Intra-Team Reduction
1805 ///
1806 /// This function, as implemented in the runtime call
1807 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
1808 /// threads in a team. It first reduces within a warp using the
1809 /// aforementioned algorithms. We then proceed to gather all such
1810 /// reduced values at the first warp.
1811 ///
1812 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
1813 /// data from each of the "warp master" (zeroth lane of each warp, where
1814 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
1815 /// a mathematical sense) the problem of reduction across warp masters in
1816 /// a block to the problem of warp reduction.
1817 ///
1818 ///
1819 /// Inter-Team Reduction
1820 ///
1821 /// Once a team has reduced its data to a single value, it is stored in
1822 /// a global scratchpad array. Since each team has a distinct slot, this
1823 /// can be done without locking.
1824 ///
1825 /// The last team to write to the scratchpad array proceeds to reduce the
1826 /// scratchpad array. One or more workers in the last team use the helper
1827 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
1828 /// the k'th worker reduces every k'th element.
1829 ///
1830 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
1831 /// reduce across workers and compute a globally reduced value.
1832 ///
1833 /// \param Loc The location where the reduction was
1834 /// encountered. Must be within the associate
1835 /// directive and after the last local access to the
1836 /// reduction variables.
1837 /// \param AllocaIP An insertion point suitable for allocas usable
1838 /// in reductions.
1839 /// \param CodeGenIP An insertion point suitable for code
1840 /// generation. \param ReductionInfos A list of info on each reduction
1841 /// variable. \param IsNoWait Optional flag set if the reduction is
1842 /// marked as
1843 /// nowait.
1844 /// \param IsTeamsReduction Optional flag set if it is a teams
1845 /// reduction.
1846 /// \param HasDistribute Optional flag set if it is a
1847 /// distribute reduction.
1848 /// \param GridValue Optional GPU grid value.
1849 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
1850 /// used for teams reduction.
1851 /// \param SrcLocInfo Source location information global.
1853 const LocationDescription &Loc, InsertPointTy AllocaIP,
1854 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
1855 bool IsNoWait = false, bool IsTeamsReduction = false,
1856 bool HasDistribute = false,
1858 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
1859 Value *SrcLocInfo = nullptr);
1860
1861 // TODO: provide atomic and non-atomic reduction generators for reduction
1862 // operators defined by the OpenMP specification.
1863
1864 /// Generator for '#omp reduction'.
1865 ///
1866 /// Emits the IR instructing the runtime to perform the specific kind of
1867 /// reductions. Expects reduction variables to have been privatized and
1868 /// initialized to reduction-neutral values separately. Emits the calls to
1869 /// runtime functions as well as the reduction function and the basic blocks
1870 /// performing the reduction atomically and non-atomically.
1871 ///
1872 /// The code emitted for the following:
1873 ///
1874 /// \code
1875 /// type var_1;
1876 /// type var_2;
1877 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
1878 /// /* body */;
1879 /// \endcode
1880 ///
1881 /// corresponds to the following sketch.
1882 ///
1883 /// \code
1884 /// void _outlined_par() {
1885 /// // N is the number of different reductions.
1886 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
1887 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
1888 /// _omp_reduction_func,
1889 /// _gomp_critical_user.reduction.var)) {
1890 /// case 1: {
1891 /// var_1 = var_1 <reduction-op> privatized_var_1;
1892 /// var_2 = var_2 <reduction-op> privatized_var_2;
1893 /// // ...
1894 /// __kmpc_end_reduce(...);
1895 /// break;
1896 /// }
1897 /// case 2: {
1898 /// _Atomic<ReductionOp>(var_1, privatized_var_1);
1899 /// _Atomic<ReductionOp>(var_2, privatized_var_2);
1900 /// // ...
1901 /// break;
1902 /// }
1903 /// default: break;
1904 /// }
1905 /// }
1906 ///
1907 /// void _omp_reduction_func(void **lhs, void **rhs) {
1908 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
1909 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
1910 /// // ...
1911 /// }
1912 /// \endcode
1913 ///
1914 /// \param Loc The location where the reduction was
1915 /// encountered. Must be within the associate
1916 /// directive and after the last local access to the
1917 /// reduction variables.
1918 /// \param AllocaIP An insertion point suitable for allocas usable
1919 /// in reductions.
1920 /// \param ReductionInfos A list of info on each reduction variable.
1921 /// \param IsNoWait A flag set if the reduction is marked as nowait.
1922 /// \param IsByRef A flag set if the reduction is using reference
1923 /// or direct value.
1924 InsertPointTy createReductions(const LocationDescription &Loc,
1925 InsertPointTy AllocaIP,
1926 ArrayRef<ReductionInfo> ReductionInfos,
1927 ArrayRef<bool> IsByRef, bool IsNoWait = false);
1928
1929 ///}
1930
1931 /// Return the insertion point used by the underlying IRBuilder.
1933
1934 /// Update the internal location to \p Loc.
1936 Builder.restoreIP(Loc.IP);
1938 return Loc.IP.getBlock() != nullptr;
1939 }
1940
1941 /// Return the function declaration for the runtime function with \p FnID.
1944
1946
1947 /// Return the (LLVM-IR) string describing the source location \p LocStr.
1948 Constant *getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize);
1949
1950 /// Return the (LLVM-IR) string describing the default source location.
1952
1953 /// Return the (LLVM-IR) string describing the source location identified by
1954 /// the arguments.
1955 Constant *getOrCreateSrcLocStr(StringRef FunctionName, StringRef FileName,
1956 unsigned Line, unsigned Column,
1957 uint32_t &SrcLocStrSize);
1958
1959 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
1960 /// fallback if \p DL does not specify the function name.
1962 Function *F = nullptr);
1963
1964 /// Return the (LLVM-IR) string describing the source location \p Loc.
1965 Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
1966 uint32_t &SrcLocStrSize);
1967
1968 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
1969 /// TODO: Create a enum class for the Reserve2Flags
1970 Constant *getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize,
1971 omp::IdentFlag Flags = omp::IdentFlag(0),
1972 unsigned Reserve2Flags = 0);
1973
1974 /// Create a hidden global flag \p Name in the module with initial value \p
1975 /// Value.
1977
1978 /// Generate control flow and cleanup for cancellation.
1979 ///
1980 /// \param CancelFlag Flag indicating if the cancellation is performed.
1981 /// \param CanceledDirective The kind of directive that is cancled.
1982 /// \param ExitCB Extra code to be generated in the exit block.
1983 void emitCancelationCheckImpl(Value *CancelFlag,
1984 omp::Directive CanceledDirective,
1985 FinalizeCallbackTy ExitCB = {});
1986
1987 /// Generate a target region entry call.
1988 ///
1989 /// \param Loc The location at which the request originated and is fulfilled.
1990 /// \param AllocaIP The insertion point to be used for alloca instructions.
1991 /// \param Return Return value of the created function returned by reference.
1992 /// \param DeviceID Identifier for the device via the 'device' clause.
1993 /// \param NumTeams Numer of teams for the region via the 'num_teams' clause
1994 /// or 0 if unspecified and -1 if there is no 'teams' clause.
1995 /// \param NumThreads Number of threads via the 'thread_limit' clause.
1996 /// \param HostPtr Pointer to the host-side pointer of the target kernel.
1997 /// \param KernelArgs Array of arguments to the kernel.
1998 InsertPointTy emitTargetKernel(const LocationDescription &Loc,
1999 InsertPointTy AllocaIP, Value *&Return,
2000 Value *Ident, Value *DeviceID, Value *NumTeams,
2001 Value *NumThreads, Value *HostPtr,
2002 ArrayRef<Value *> KernelArgs);
2003
2004 /// Generate a flush runtime call.
2005 ///
2006 /// \param Loc The location at which the request originated and is fulfilled.
2007 void emitFlush(const LocationDescription &Loc);
2008
2009 /// The finalization stack made up of finalize callbacks currently in-flight,
2010 /// wrapped into FinalizationInfo objects that reference also the finalization
2011 /// target block and the kind of cancellable directive.
2013
2014 /// Return true if the last entry in the finalization stack is of kind \p DK
2015 /// and cancellable.
2016 bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2017 return !FinalizationStack.empty() &&
2018 FinalizationStack.back().IsCancellable &&
2019 FinalizationStack.back().DK == DK;
2020 }
2021
2022 /// Generate a taskwait runtime call.
2023 ///
2024 /// \param Loc The location at which the request originated and is fulfilled.
2025 void emitTaskwaitImpl(const LocationDescription &Loc);
2026
2027 /// Generate a taskyield runtime call.
2028 ///
2029 /// \param Loc The location at which the request originated and is fulfilled.
2030 void emitTaskyieldImpl(const LocationDescription &Loc);
2031
2032 /// Return the current thread ID.
2033 ///
2034 /// \param Ident The ident (ident_t*) describing the query origin.
2036
2037 /// The OpenMPIRBuilder Configuration
2039
2040 /// The underlying LLVM-IR module
2042
2043 /// The LLVM-IR Builder used to create IR.
2045
2046 /// Map to remember source location strings
2048
2049 /// Map to remember existing ident_t*.
2051
2052 /// Info manager to keep track of target regions.
2054
2055 /// The target triple of the underlying module.
2056 const Triple T;
2057
2058 /// Helper that contains information about regions we need to outline
2059 /// during finalization.
2061 using PostOutlineCBTy = std::function<void(Function &)>;
2065
2066 /// Collect all blocks in between EntryBB and ExitBB in both the given
2067 /// vector and set.
2069 SmallVectorImpl<BasicBlock *> &BlockVector);
2070
2071 /// Return the function that contains the region to be outlined.
2072 Function *getFunction() const { return EntryBB->getParent(); }
2073 };
2074
2075 /// Collection of regions that need to be outlined during finalization.
2077
2078 /// A collection of candidate target functions that's constant allocas will
2079 /// attempt to be raised on a call of finalize after all currently enqueued
2080 /// outline info's have been processed.
2082
2083 /// Collection of owned canonical loop objects that eventually need to be
2084 /// free'd.
2085 std::forward_list<CanonicalLoopInfo> LoopInfos;
2086
2087 /// Add a new region that will be outlined later.
2088 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2089
2090 /// An ordered map of auto-generated variables to their unique names.
2091 /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2092 /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2093 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
2094 /// variables.
2096
2097 /// Computes the size of type in bytes.
2098 Value *getSizeInBytes(Value *BasePtr);
2099
2100 // Emit a branch from the current block to the Target block only if
2101 // the current block has a terminator.
2103
2104 // If BB has no use then delete it and return. Else place BB after the current
2105 // block, if possible, or else at the end of the function. Also add a branch
2106 // from current block to BB if current block does not have a terminator.
2107 void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished = false);
2108
2109 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2110 /// Here is the logic:
2111 /// if (Cond) {
2112 /// ThenGen();
2113 /// } else {
2114 /// ElseGen();
2115 /// }
2117 BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP = {});
2118
2119 /// Create the global variable holding the offload mappings information.
2121 std::string VarName);
2122
2123 /// Create the global variable holding the offload names information.
2126 std::string VarName);
2127
2130 AllocaInst *Args = nullptr;
2132 };
2133
2134 /// Create the allocas instruction used in call to mapper functions.
2136 InsertPointTy AllocaIP, unsigned NumOperands,
2138
2139 /// Create the call for the target mapper function.
2140 /// \param Loc The source location description.
2141 /// \param MapperFunc Function to be called.
2142 /// \param SrcLocInfo Source location information global.
2143 /// \param MaptypesArg The argument types.
2144 /// \param MapnamesArg The argument names.
2145 /// \param MapperAllocas The AllocaInst used for the call.
2146 /// \param DeviceID Device ID for the call.
2147 /// \param NumOperands Number of operands in the call.
2148 void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc,
2149 Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg,
2150 struct MapperAllocas &MapperAllocas, int64_t DeviceID,
2151 unsigned NumOperands);
2152
2153 /// Container for the arguments used to pass data to the runtime library.
2155 /// The array of base pointer passed to the runtime library.
2157 /// The array of section pointers passed to the runtime library.
2159 /// The array of sizes passed to the runtime library.
2160 Value *SizesArray = nullptr;
2161 /// The array of map types passed to the runtime library for the beginning
2162 /// of the region or for the entire region if there are no separate map
2163 /// types for the region end.
2165 /// The array of map types passed to the runtime library for the end of the
2166 /// region, or nullptr if there are no separate map types for the region
2167 /// end.
2169 /// The array of user-defined mappers passed to the runtime library.
2171 /// The array of original declaration names of mapped pointers sent to the
2172 /// runtime library for debugging
2174
2175 explicit TargetDataRTArgs() {}
2184 };
2185
2186 /// Data structure that contains the needed information to construct the
2187 /// kernel args vector.
2189 /// Number of arguments passed to the runtime library.
2190 unsigned NumTargetItems = 0;
2191 /// Arguments passed to the runtime library
2193 /// The number of iterations
2195 /// The number of teams.
2197 /// The number of threads.
2198 Value *NumThreads = nullptr;
2199 /// The size of the dynamic shared memory.
2201 /// True if the kernel has 'no wait' clause.
2202 bool HasNoWait = false;
2203
2204 // Constructors for TargetKernelArgs.
2213 };
2214
2215 /// Create the kernel args vector used by emitTargetKernel. This function
2216 /// creates various constant values that are used in the resulting args
2217 /// vector.
2218 static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2220 SmallVector<Value *> &ArgsVector);
2221
2222 /// Struct that keeps the information that should be kept throughout
2223 /// a 'target data' region.
2225 /// Set to true if device pointer information have to be obtained.
2226 bool RequiresDevicePointerInfo = false;
2227 /// Set to true if Clang emits separate runtime calls for the beginning and
2228 /// end of the region. These calls might have separate map type arrays.
2229 bool SeparateBeginEndCalls = false;
2230
2231 public:
2233
2236
2237 /// Indicate whether any user-defined mapper exists.
2238 bool HasMapper = false;
2239 /// The total number of pointers passed to the runtime library.
2240 unsigned NumberOfPtrs = 0u;
2241
2242 bool EmitDebug = false;
2243
2244 explicit TargetDataInfo() {}
2245 explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2246 bool SeparateBeginEndCalls)
2247 : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2248 SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2249 /// Clear information about the data arrays.
2252 HasMapper = false;
2253 NumberOfPtrs = 0u;
2254 }
2255 /// Return true if the current target data information has valid arrays.
2256 bool isValid() {
2260 }
2261 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2262 bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2263 };
2264
2272
2273 /// This structure contains combined information generated for mappable
2274 /// clauses, including base pointers, pointers, sizes, map types, user-defined
2275 /// mappers, and non-contiguous information.
2276 struct MapInfosTy {
2278 bool IsNonContiguous = false;
2283 };
2291
2292 /// Append arrays in \a CurInfo.
2293 void append(MapInfosTy &CurInfo) {
2295 CurInfo.BasePointers.end());
2296 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2298 CurInfo.DevicePointers.end());
2299 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2300 Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2301 Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2303 CurInfo.NonContigInfo.Dims.end());
2305 CurInfo.NonContigInfo.Offsets.end());
2307 CurInfo.NonContigInfo.Counts.end());
2309 CurInfo.NonContigInfo.Strides.end());
2310 }
2311 };
2312
2313 /// Callback function type for functions emitting the host fallback code that
2314 /// is executed when the kernel launch fails. It takes an insertion point as
2315 /// parameter where the code should be emitted. It returns an insertion point
2316 /// that points right after after the emitted code.
2318
2319 /// Generate a target region entry call and host fallback call.
2320 ///
2321 /// \param Loc The location at which the request originated and is fulfilled.
2322 /// \param OutlinedFn The outlined kernel function.
2323 /// \param OutlinedFnID The ooulined function ID.
2324 /// \param EmitTargetCallFallbackCB Call back function to generate host
2325 /// fallback code.
2326 /// \param Args Data structure holding information about the kernel arguments.
2327 /// \param DeviceID Identifier for the device via the 'device' clause.
2328 /// \param RTLoc Source location identifier
2329 /// \param AllocaIP The insertion point to be used for alloca instructions.
2331 const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID,
2332 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2333 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
2334
2335 /// Generate a target-task for the target construct
2336 ///
2337 /// \param OutlinedFn The outlined device/target kernel function.
2338 /// \param OutlinedFnID The ooulined function ID.
2339 /// \param EmitTargetCallFallbackCB Call back function to generate host
2340 /// fallback code.
2341 /// \param Args Data structure holding information about the kernel arguments.
2342 /// \param DeviceID Identifier for the device via the 'device' clause.
2343 /// \param RTLoc Source location identifier
2344 /// \param AllocaIP The insertion point to be used for alloca instructions.
2345 /// \param Dependencies Vector of DependData objects holding information of
2346 /// dependencies as specified by the 'depend' clause.
2347 /// \param HasNoWait True if the target construct had 'nowait' on it, false
2348 /// otherwise
2350 Function *OutlinedFn, Value *OutlinedFnID,
2351 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2352 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP,
2353 SmallVector<OpenMPIRBuilder::DependData> &Dependencies, bool HasNoWait);
2354
2355 /// Emit the arguments to be passed to the runtime library based on the
2356 /// arrays of base pointers, pointers, sizes, map types, and mappers. If
2357 /// ForEndCall, emit map types to be passed for the end of the region instead
2358 /// of the beginning.
2362 bool ForEndCall = false);
2363
2364 /// Emit an array of struct descriptors to be assigned to the offload args.
2366 InsertPointTy CodeGenIP,
2367 MapInfosTy &CombinedInfo,
2369
2370 /// Emit the arrays used to pass the captures and map information to the
2371 /// offloading runtime library. If there is no map or capture information,
2372 /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2373 /// that contains information generated for mappable clauses,
2374 /// including base pointers, pointers, sizes, map types, user-defined mappers.
2376 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2377 TargetDataInfo &Info, bool IsNonContiguous = false,
2378 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2379 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
2380
2381 /// Allocates memory for and populates the arrays required for offloading
2382 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2383 /// emits their base addresses as arguments to be passed to the runtime
2384 /// library. In essence, this function is a combination of
2385 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2386 /// be preferred by clients of OpenMPIRBuilder.
2388 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2389 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2390 bool IsNonContiguous = false, bool ForEndCall = false,
2391 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2392 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr);
2393
2394 /// Creates offloading entry for the provided entry ID \a ID, address \a
2395 /// Addr, size \a Size, and flags \a Flags.
2397 int32_t Flags, GlobalValue::LinkageTypes,
2398 StringRef Name = "");
2399
2400 /// The kind of errors that can occur when emitting the offload entries and
2401 /// metadata.
2407
2408 /// Callback function type
2410 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2411
2412 // Emit the offloading entries and metadata so that the device codegen side
2413 // can easily figure out what to emit. The produced metadata looks like
2414 // this:
2415 //
2416 // !omp_offload.info = !{!1, ...}
2417 //
2418 // We only generate metadata for function that contain target regions.
2420 EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2421
2422public:
2423 /// Generator for __kmpc_copyprivate
2424 ///
2425 /// \param Loc The source location description.
2426 /// \param BufSize Number of elements in the buffer.
2427 /// \param CpyBuf List of pointers to data to be copied.
2428 /// \param CpyFn function to call for copying data.
2429 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2430 ///
2431 /// \return The insertion position *after* the CopyPrivate call.
2432
2434 llvm::Value *BufSize, llvm::Value *CpyBuf,
2435 llvm::Value *CpyFn, llvm::Value *DidIt);
2436
2437 /// Generator for '#omp single'
2438 ///
2439 /// \param Loc The source location description.
2440 /// \param BodyGenCB Callback that will generate the region code.
2441 /// \param FiniCB Callback to finalize variable copies.
2442 /// \param IsNowait If false, a barrier is emitted.
2443 /// \param CPVars copyprivate variables.
2444 /// \param CPFuncs copy functions to use for each copyprivate variable.
2445 ///
2446 /// \returns The insertion position *after* the single call.
2448 BodyGenCallbackTy BodyGenCB,
2449 FinalizeCallbackTy FiniCB, bool IsNowait,
2450 ArrayRef<llvm::Value *> CPVars = {},
2451 ArrayRef<llvm::Function *> CPFuncs = {});
2452
2453 /// Generator for '#omp master'
2454 ///
2455 /// \param Loc The insert and source location description.
2456 /// \param BodyGenCB Callback that will generate the region code.
2457 /// \param FiniCB Callback to finalize variable copies.
2458 ///
2459 /// \returns The insertion position *after* the master.
2460 InsertPointTy createMaster(const LocationDescription &Loc,
2461 BodyGenCallbackTy BodyGenCB,
2462 FinalizeCallbackTy FiniCB);
2463
2464 /// Generator for '#omp masked'
2465 ///
2466 /// \param Loc The insert and source location description.
2467 /// \param BodyGenCB Callback that will generate the region code.
2468 /// \param FiniCB Callback to finialize variable copies.
2469 ///
2470 /// \returns The insertion position *after* the masked.
2471 InsertPointTy createMasked(const LocationDescription &Loc,
2472 BodyGenCallbackTy BodyGenCB,
2473 FinalizeCallbackTy FiniCB, Value *Filter);
2474
2475 /// Generator for '#omp critical'
2476 ///
2477 /// \param Loc The insert and source location description.
2478 /// \param BodyGenCB Callback that will generate the region body code.
2479 /// \param FiniCB Callback to finalize variable copies.
2480 /// \param CriticalName name of the lock used by the critical directive
2481 /// \param HintInst Hint Instruction for hint clause associated with critical
2482 ///
2483 /// \returns The insertion position *after* the critical.
2484 InsertPointTy createCritical(const LocationDescription &Loc,
2485 BodyGenCallbackTy BodyGenCB,
2486 FinalizeCallbackTy FiniCB,
2487 StringRef CriticalName, Value *HintInst);
2488
2489 /// Generator for '#omp ordered depend (source | sink)'
2490 ///
2491 /// \param Loc The insert and source location description.
2492 /// \param AllocaIP The insertion point to be used for alloca instructions.
2493 /// \param NumLoops The number of loops in depend clause.
2494 /// \param StoreValues The value will be stored in vector address.
2495 /// \param Name The name of alloca instruction.
2496 /// \param IsDependSource If true, depend source; otherwise, depend sink.
2497 ///
2498 /// \return The insertion position *after* the ordered.
2499 InsertPointTy createOrderedDepend(const LocationDescription &Loc,
2500 InsertPointTy AllocaIP, unsigned NumLoops,
2501 ArrayRef<llvm::Value *> StoreValues,
2502 const Twine &Name, bool IsDependSource);
2503
2504 /// Generator for '#omp ordered [threads | simd]'
2505 ///
2506 /// \param Loc The insert and source location description.
2507 /// \param BodyGenCB Callback that will generate the region code.
2508 /// \param FiniCB Callback to finalize variable copies.
2509 /// \param IsThreads If true, with threads clause or without clause;
2510 /// otherwise, with simd clause;
2511 ///
2512 /// \returns The insertion position *after* the ordered.
2513 InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc,
2514 BodyGenCallbackTy BodyGenCB,
2515 FinalizeCallbackTy FiniCB,
2516 bool IsThreads);
2517
2518 /// Generator for '#omp sections'
2519 ///
2520 /// \param Loc The insert and source location description.
2521 /// \param AllocaIP The insertion points to be used for alloca instructions.
2522 /// \param SectionCBs Callbacks that will generate body of each section.
2523 /// \param PrivCB Callback to copy a given variable (think copy constructor).
2524 /// \param FiniCB Callback to finalize variable copies.
2525 /// \param IsCancellable Flag to indicate a cancellable parallel region.
2526 /// \param IsNowait If true, barrier - to ensure all sections are executed
2527 /// before moving forward will not be generated.
2528 /// \returns The insertion position *after* the sections.
2529 InsertPointTy createSections(const LocationDescription &Loc,
2530 InsertPointTy AllocaIP,
2531 ArrayRef<StorableBodyGenCallbackTy> SectionCBs,
2532 PrivatizeCallbackTy PrivCB,
2533 FinalizeCallbackTy FiniCB, bool IsCancellable,
2534 bool IsNowait);
2535
2536 /// Generator for '#omp section'
2537 ///
2538 /// \param Loc The insert and source location description.
2539 /// \param BodyGenCB Callback that will generate the region body code.
2540 /// \param FiniCB Callback to finalize variable copies.
2541 /// \returns The insertion position *after* the section.
2542 InsertPointTy createSection(const LocationDescription &Loc,
2543 BodyGenCallbackTy BodyGenCB,
2544 FinalizeCallbackTy FiniCB);
2545
2546 /// Generator for `#omp teams`
2547 ///
2548 /// \param Loc The location where the teams construct was encountered.
2549 /// \param BodyGenCB Callback that will generate the region code.
2550 /// \param NumTeamsLower Lower bound on number of teams. If this is nullptr,
2551 /// it is as if lower bound is specified as equal to upperbound. If
2552 /// this is non-null, then upperbound must also be non-null.
2553 /// \param NumTeamsUpper Upper bound on the number of teams.
2554 /// \param ThreadLimit on the number of threads that may participate in a
2555 /// contention group created by each team.
2556 /// \param IfExpr is the integer argument value of the if condition on the
2557 /// teams clause.
2559 createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
2560 Value *NumTeamsLower = nullptr, Value *NumTeamsUpper = nullptr,
2561 Value *ThreadLimit = nullptr, Value *IfExpr = nullptr);
2562
2563 /// Generate conditional branch and relevant BasicBlocks through which private
2564 /// threads copy the 'copyin' variables from Master copy to threadprivate
2565 /// copies.
2566 ///
2567 /// \param IP insertion block for copyin conditional
2568 /// \param MasterVarPtr a pointer to the master variable
2569 /// \param PrivateVarPtr a pointer to the threadprivate variable
2570 /// \param IntPtrTy Pointer size type
2571 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
2572 // and copy.in.end block
2573 ///
2574 /// \returns The insertion point where copying operation to be emitted.
2576 Value *PrivateAddr,
2577 llvm::IntegerType *IntPtrTy,
2578 bool BranchtoEnd = true);
2579
2580 /// Create a runtime call for kmpc_Alloc
2581 ///
2582 /// \param Loc The insert and source location description.
2583 /// \param Size Size of allocated memory space
2584 /// \param Allocator Allocator information instruction
2585 /// \param Name Name of call Instruction for OMP_alloc
2586 ///
2587 /// \returns CallInst to the OMP_Alloc call
2588 CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
2589 Value *Allocator, std::string Name = "");
2590
2591 /// Create a runtime call for kmpc_free
2592 ///
2593 /// \param Loc The insert and source location description.
2594 /// \param Addr Address of memory space to be freed
2595 /// \param Allocator Allocator information instruction
2596 /// \param Name Name of call Instruction for OMP_Free
2597 ///
2598 /// \returns CallInst to the OMP_Free call
2599 CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
2600 Value *Allocator, std::string Name = "");
2601
2602 /// Create a runtime call for kmpc_threadprivate_cached
2603 ///
2604 /// \param Loc The insert and source location description.
2605 /// \param Pointer pointer to data to be cached
2606 /// \param Size size of data to be cached
2607 /// \param Name Name of call Instruction for callinst
2608 ///
2609 /// \returns CallInst to the thread private cache call.
2610 CallInst *createCachedThreadPrivate(const LocationDescription &Loc,
2613 const llvm::Twine &Name = Twine(""));
2614
2615 /// Create a runtime call for __tgt_interop_init
2616 ///
2617 /// \param Loc The insert and source location description.
2618 /// \param InteropVar variable to be allocated
2619 /// \param InteropType type of interop operation
2620 /// \param Device devide to which offloading will occur
2621 /// \param NumDependences number of dependence variables
2622 /// \param DependenceAddress pointer to dependence variables
2623 /// \param HaveNowaitClause does nowait clause exist
2624 ///
2625 /// \returns CallInst to the __tgt_interop_init call
2626 CallInst *createOMPInteropInit(const LocationDescription &Loc,
2627 Value *InteropVar,
2628 omp::OMPInteropType InteropType, Value *Device,
2629 Value *NumDependences,
2630 Value *DependenceAddress,
2631 bool HaveNowaitClause);
2632
2633 /// Create a runtime call for __tgt_interop_destroy
2634 ///
2635 /// \param Loc The insert and source location description.
2636 /// \param InteropVar variable to be allocated
2637 /// \param Device devide to which offloading will occur
2638 /// \param NumDependences number of dependence variables
2639 /// \param DependenceAddress pointer to dependence variables
2640 /// \param HaveNowaitClause does nowait clause exist
2641 ///
2642 /// \returns CallInst to the __tgt_interop_destroy call
2643 CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
2644 Value *InteropVar, Value *Device,
2645 Value *NumDependences,
2646 Value *DependenceAddress,
2647 bool HaveNowaitClause);
2648
2649 /// Create a runtime call for __tgt_interop_use
2650 ///
2651 /// \param Loc The insert and source location description.
2652 /// \param InteropVar variable to be allocated
2653 /// \param Device devide to which offloading will occur
2654 /// \param NumDependences number of dependence variables
2655 /// \param DependenceAddress pointer to dependence variables
2656 /// \param HaveNowaitClause does nowait clause exist
2657 ///
2658 /// \returns CallInst to the __tgt_interop_use call
2659 CallInst *createOMPInteropUse(const LocationDescription &Loc,
2660 Value *InteropVar, Value *Device,
2661 Value *NumDependences, Value *DependenceAddress,
2662 bool HaveNowaitClause);
2663
2664 /// The `omp target` interface
2665 ///
2666 /// For more information about the usage of this interface,
2667 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
2668 ///
2669 ///{
2670
2671 /// Create a runtime call for kmpc_target_init
2672 ///
2673 /// \param Loc The insert and source location description.
2674 /// \param IsSPMD Flag to indicate if the kernel is an SPMD kernel or not.
2675 /// \param MinThreads Minimal number of threads, or 0.
2676 /// \param MaxThreads Maximal number of threads, or 0.
2677 /// \param MinTeams Minimal number of teams, or 0.
2678 /// \param MaxTeams Maximal number of teams, or 0.
2679 InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD,
2680 int32_t MinThreadsVal = 0,
2681 int32_t MaxThreadsVal = 0,
2682 int32_t MinTeamsVal = 0,
2683 int32_t MaxTeamsVal = 0);
2684
2685 /// Create a runtime call for kmpc_target_deinit
2686 ///
2687 /// \param Loc The insert and source location description.
2688 /// \param TeamsReductionDataSize The maximal size of all the reduction data
2689 /// for teams reduction.
2690 /// \param TeamsReductionBufferLength The number of elements (each of up to
2691 /// \p TeamsReductionDataSize size), in the teams reduction buffer.
2692 void createTargetDeinit(const LocationDescription &Loc,
2693 int32_t TeamsReductionDataSize = 0,
2694 int32_t TeamsReductionBufferLength = 1024);
2695
2696 ///}
2697
2698 /// Helpers to read/write kernel annotations from the IR.
2699 ///
2700 ///{
2701
2702 /// Read/write a bounds on threads for \p Kernel. Read will return 0 if none
2703 /// is set.
2704 static std::pair<int32_t, int32_t>
2705 readThreadBoundsForKernel(const Triple &T, Function &Kernel);
2706 static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel,
2707 int32_t LB, int32_t UB);
2708
2709 /// Read/write a bounds on teams for \p Kernel. Read will return 0 if none
2710 /// is set.
2711 static std::pair<int32_t, int32_t> readTeamBoundsForKernel(const Triple &T,
2712 Function &Kernel);
2713 static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB,
2714 int32_t UB);
2715 ///}
2716
2717private:
2718 // Sets the function attributes expected for the outlined function
2719 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
2720
2721 // Creates the function ID/Address for the given outlined function.
2722 // In the case of an embedded device function the address of the function is
2723 // used, in the case of a non-offload function a constant is created.
2724 Constant *createOutlinedFunctionID(Function *OutlinedFn,
2725 StringRef EntryFnIDName);
2726
2727 // Creates the region entry address for the outlined function
2728 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
2729 StringRef EntryFnName);
2730
2731public:
2732 /// Functions used to generate a function with the given name.
2733 using FunctionGenCallback = std::function<Function *(StringRef FunctionName)>;
2734
2735 /// Create a unique name for the entry function using the source location
2736 /// information of the current target region. The name will be something like:
2737 ///
2738 /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
2739 ///
2740 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
2741 /// mangled name of the function that encloses the target region and BB is the
2742 /// line number of the target region. CC is a count added when more than one
2743 /// region is located at the same location.
2744 ///
2745 /// If this target outline function is not an offload entry, we don't need to
2746 /// register it. This may happen if it is guarded by an if clause that is
2747 /// false at compile time, or no target archs have been specified.
2748 ///
2749 /// The created target region ID is used by the runtime library to identify
2750 /// the current target region, so it only has to be unique and not
2751 /// necessarily point to anything. It could be the pointer to the outlined
2752 /// function that implements the target region, but we aren't using that so
2753 /// that the compiler doesn't need to keep that, and could therefore inline
2754 /// the host function if proven worthwhile during optimization. In the other
2755 /// hand, if emitting code for the device, the ID has to be the function
2756 /// address so that it can retrieved from the offloading entry and launched
2757 /// by the runtime library. We also mark the outlined function to have
2758 /// external linkage in case we are emitting code for the device, because
2759 /// these functions will be entry points to the device.
2760 ///
2761 /// \param InfoManager The info manager keeping track of the offload entries
2762 /// \param EntryInfo The entry information about the function
2763 /// \param GenerateFunctionCallback The callback function to generate the code
2764 /// \param OutlinedFunction Pointer to the outlined function
2765 /// \param EntryFnIDName Name of the ID o be created
2767 FunctionGenCallback &GenerateFunctionCallback,
2768 bool IsOffloadEntry, Function *&OutlinedFn,
2769 Constant *&OutlinedFnID);
2770
2771 /// Registers the given function and sets up the attribtues of the function
2772 /// Returns the FunctionID.
2773 ///
2774 /// \param InfoManager The info manager keeping track of the offload entries
2775 /// \param EntryInfo The entry information about the function
2776 /// \param OutlinedFunction Pointer to the outlined function
2777 /// \param EntryFnName Name of the outlined function
2778 /// \param EntryFnIDName Name of the ID o be created
2780 Function *OutlinedFunction,
2781 StringRef EntryFnName,
2782 StringRef EntryFnIDName);
2783
2784 /// Type of BodyGen to use for region codegen
2785 ///
2786 /// Priv: If device pointer privatization is required, emit the body of the
2787 /// region here. It will have to be duplicated: with and without
2788 /// privatization.
2789 /// DupNoPriv: If we need device pointer privatization, we need
2790 /// to emit the body of the region with no privatization in the 'else' branch
2791 /// of the conditional.
2792 /// NoPriv: If we don't require privatization of device
2793 /// pointers, we emit the body in between the runtime calls. This avoids
2794 /// duplicating the body code.
2796
2797 /// Callback type for creating the map infos for the kernel parameters.
2798 /// \param CodeGenIP is the insertion point where code should be generated,
2799 /// if any.
2802
2803 /// Generator for '#omp target data'
2804 ///
2805 /// \param Loc The location where the target data construct was encountered.
2806 /// \param AllocaIP The insertion points to be used for alloca instructions.
2807 /// \param CodeGenIP The insertion point at which the target directive code
2808 /// should be placed.
2809 /// \param IsBegin If true then emits begin mapper call otherwise emits
2810 /// end mapper call.
2811 /// \param DeviceID Stores the DeviceID from the device clause.
2812 /// \param IfCond Value which corresponds to the if clause condition.
2813 /// \param Info Stores all information realted to the Target Data directive.
2814 /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
2815 /// \param BodyGenCB Optional Callback to generate the region code.
2816 /// \param DeviceAddrCB Optional callback to generate code related to
2817 /// use_device_ptr and use_device_addr.
2818 /// \param CustomMapperCB Optional callback to generate code related to
2819 /// custom mappers.
2821 const LocationDescription &Loc, InsertPointTy AllocaIP,
2822 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
2824 omp::RuntimeFunction *MapperFunc = nullptr,
2826 BodyGenTy BodyGenType)>
2827 BodyGenCB = nullptr,
2828 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
2829 function_ref<Value *(unsigned int)> CustomMapperCB = nullptr,
2830 Value *SrcLocInfo = nullptr);
2831
2833 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
2834
2836 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
2837 InsertPointTy CodeGenIP)>;
2838
2839 /// Generator for '#omp target'
2840 ///
2841 /// \param Loc where the target data construct was encountered.
2842 /// \param IsOffloadEntry whether it is an offload entry.
2843 /// \param CodeGenIP The insertion point where the call to the outlined
2844 /// function should be emitted.
2845 /// \param EntryInfo The entry information about the function.
2846 /// \param NumTeams Number of teams specified in the num_teams clause.
2847 /// \param NumThreads Number of teams specified in the thread_limit clause.
2848 /// \param Inputs The input values to the region that will be passed.
2849 /// as arguments to the outlined function.
2850 /// \param BodyGenCB Callback that will generate the region code.
2851 /// \param ArgAccessorFuncCB Callback that will generate accessors
2852 /// instructions for passed in target arguments where neccessary
2853 /// \param Dependencies A vector of DependData objects that carry
2854 // dependency information as passed in the depend clause
2856 bool IsOffloadEntry,
2859 TargetRegionEntryInfo &EntryInfo,
2860 ArrayRef<int32_t> NumTeams, int32_t NumThreads,
2862 GenMapInfoCallbackTy GenMapInfoCB,
2863 TargetBodyGenCallbackTy BodyGenCB,
2864 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
2865 SmallVector<DependData> Dependencies = {});
2866
2867 /// Returns __kmpc_for_static_init_* runtime function for the specified
2868 /// size \a IVSize and sign \a IVSigned. Will create a distribute call
2869 /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
2870 FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned,
2871 bool IsGPUDistribute);
2872
2873 /// Returns __kmpc_dispatch_init_* runtime function for the specified
2874 /// size \a IVSize and sign \a IVSigned.
2875 FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned);
2876
2877 /// Returns __kmpc_dispatch_next_* runtime function for the specified
2878 /// size \a IVSize and sign \a IVSigned.
2879 FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned);
2880
2881 /// Returns __kmpc_dispatch_fini_* runtime function for the specified
2882 /// size \a IVSize and sign \a IVSigned.
2883 FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned);
2884
2885 /// Returns __kmpc_dispatch_deinit runtime function.
2887
2888 /// Declarations for LLVM-IR types (simple, array, function and structure) are
2889 /// generated below. Their names are defined and used in OpenMPKinds.def. Here
2890 /// we provide the declarations, the initializeTypes function will provide the
2891 /// values.
2892 ///
2893 ///{
2894#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
2895#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
2896 ArrayType *VarName##Ty = nullptr; \
2897 PointerType *VarName##PtrTy = nullptr;
2898#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
2899 FunctionType *VarName = nullptr; \
2900 PointerType *VarName##Ptr = nullptr;
2901#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
2902 StructType *VarName = nullptr; \
2903 PointerType *VarName##Ptr = nullptr;
2904#include "llvm/Frontend/OpenMP/OMPKinds.def"
2905
2906 ///}
2907
2908private:
2909 /// Create all simple and struct types exposed by the runtime and remember
2910 /// the llvm::PointerTypes of them for easy access later.
2911 void initializeTypes(Module &M);
2912
2913 /// Common interface for generating entry calls for OMP Directives.
2914 /// if the directive has a region/body, It will set the insertion
2915 /// point to the body
2916 ///
2917 /// \param OMPD Directive to generate entry blocks for
2918 /// \param EntryCall Call to the entry OMP Runtime Function
2919 /// \param ExitBB block where the region ends.
2920 /// \param Conditional indicate if the entry call result will be used
2921 /// to evaluate a conditional of whether a thread will execute
2922 /// body code or not.
2923 ///
2924 /// \return The insertion position in exit block
2925 InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
2926 BasicBlock *ExitBB,
2927 bool Conditional = false);
2928
2929 /// Common interface to finalize the region
2930 ///
2931 /// \param OMPD Directive to generate exiting code for
2932 /// \param FinIP Insertion point for emitting Finalization code and exit call
2933 /// \param ExitCall Call to the ending OMP Runtime Function
2934 /// \param HasFinalize indicate if the directive will require finalization
2935 /// and has a finalization callback in the stack that
2936 /// should be called.
2937 ///
2938 /// \return The insertion position in exit block
2939 InsertPointTy emitCommonDirectiveExit(omp::Directive OMPD,
2940 InsertPointTy FinIP,
2941 Instruction *ExitCall,
2942 bool HasFinalize = true);
2943
2944 /// Common Interface to generate OMP inlined regions
2945 ///
2946 /// \param OMPD Directive to generate inlined region for
2947 /// \param EntryCall Call to the entry OMP Runtime Function
2948 /// \param ExitCall Call to the ending OMP Runtime Function
2949 /// \param BodyGenCB Body code generation callback.
2950 /// \param FiniCB Finalization Callback. Will be called when finalizing region
2951 /// \param Conditional indicate if the entry call result will be used
2952 /// to evaluate a conditional of whether a thread will execute
2953 /// body code or not.
2954 /// \param HasFinalize indicate if the directive will require finalization
2955 /// and has a finalization callback in the stack that
2956 /// should be called.
2957 /// \param IsCancellable if HasFinalize is set to true, indicate if the
2958 /// the directive should be cancellable.
2959 /// \return The insertion point after the region
2960
2962 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
2963 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
2964 FinalizeCallbackTy FiniCB, bool Conditional = false,
2965 bool HasFinalize = true, bool IsCancellable = false);
2966
2967 /// Get the platform-specific name separator.
2968 /// \param Parts different parts of the final name that needs separation
2969 /// \param FirstSeparator First separator used between the initial two
2970 /// parts of the name.
2971 /// \param Separator separator used between all of the rest consecutive
2972 /// parts of the name
2973 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
2974 StringRef FirstSeparator,
2975 StringRef Separator);
2976
2977 /// Returns corresponding lock object for the specified critical region
2978 /// name. If the lock object does not exist it is created, otherwise the
2979 /// reference to the existing copy is returned.
2980 /// \param CriticalName Name of the critical region.
2981 ///
2982 Value *getOMPCriticalRegionLock(StringRef CriticalName);
2983
2984 /// Callback type for Atomic Expression update
2985 /// ex:
2986 /// \code{.cpp}
2987 /// unsigned x = 0;
2988 /// #pragma omp atomic update
2989 /// x = Expr(x_old); //Expr() is any legal operation
2990 /// \endcode
2991 ///
2992 /// \param XOld the value of the atomic memory address to use for update
2993 /// \param IRB reference to the IRBuilder to use
2994 ///
2995 /// \returns Value to update X to.
2996 using AtomicUpdateCallbackTy =
2997 const function_ref<Value *(Value *XOld, IRBuilder<> &IRB)>;
2998
2999private:
3000 enum AtomicKind { Read, Write, Update, Capture, Compare };
3001
3002 /// Determine whether to emit flush or not
3003 ///
3004 /// \param Loc The insert and source location description.
3005 /// \param AO The required atomic ordering
3006 /// \param AK The OpenMP atomic operation kind used.
3007 ///
3008 /// \returns wether a flush was emitted or not
3009 bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
3010 AtomicOrdering AO, AtomicKind AK);
3011
3012 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3013 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3014 /// Only Scalar data types.
3015 ///
3016 /// \param AllocaIP The insertion point to be used for alloca
3017 /// instructions.
3018 /// \param X The target atomic pointer to be updated
3019 /// \param XElemTy The element type of the atomic pointer.
3020 /// \param Expr The value to update X with.
3021 /// \param AO Atomic ordering of the generated atomic
3022 /// instructions.
3023 /// \param RMWOp The binary operation used for update. If
3024 /// operation is not supported by atomicRMW,
3025 /// or belong to {FADD, FSUB, BAD_BINOP}.
3026 /// Then a `cmpExch` based atomic will be generated.
3027 /// \param UpdateOp Code generator for complex expressions that cannot be
3028 /// expressed through atomicrmw instruction.
3029 /// \param VolatileX true if \a X volatile?
3030 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3031 /// update expression, false otherwise.
3032 /// (e.g. true for X = X BinOp Expr)
3033 ///
3034 /// \returns A pair of the old value of X before the update, and the value
3035 /// used for the update.
3036 std::pair<Value *, Value *>
3037 emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3039 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3040 bool IsXBinopExpr);
3041
3042 /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3043 ///
3044 /// \Return The instruction
3045 Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3046 AtomicRMWInst::BinOp RMWOp);
3047
3048public:
3049 /// a struct to pack relevant information while generating atomic Ops
3051 Value *Var = nullptr;
3052 Type *ElemTy = nullptr;
3053 bool IsSigned = false;
3054 bool IsVolatile = false;
3055 };
3056
3057 /// Emit atomic Read for : V = X --- Only Scalar data types.
3058 ///
3059 /// \param Loc The insert and source location description.
3060 /// \param X The target pointer to be atomically read
3061 /// \param V Memory address where to store atomically read
3062 /// value
3063 /// \param AO Atomic ordering of the generated atomic
3064 /// instructions.
3065 ///
3066 /// \return Insertion point after generated atomic read IR.
3069 AtomicOrdering AO);
3070
3071 /// Emit atomic write for : X = Expr --- Only Scalar data types.
3072 ///
3073 /// \param Loc The insert and source location description.
3074 /// \param X The target pointer to be atomically written to
3075 /// \param Expr The value to store.
3076 /// \param AO Atomic ordering of the generated atomic
3077 /// instructions.
3078 ///
3079 /// \return Insertion point after generated atomic Write IR.
3081 AtomicOpValue &X, Value *Expr,
3082 AtomicOrdering AO);
3083
3084 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3085 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3086 /// Only Scalar data types.
3087 ///
3088 /// \param Loc The insert and source location description.
3089 /// \param AllocaIP The insertion point to be used for alloca instructions.
3090 /// \param X The target atomic pointer to be updated
3091 /// \param Expr The value to update X with.
3092 /// \param AO Atomic ordering of the generated atomic instructions.
3093 /// \param RMWOp The binary operation used for update. If operation
3094 /// is not supported by atomicRMW, or belong to
3095 /// {FADD, FSUB, BAD_BINOP}. Then a `cmpExch` based
3096 /// atomic will be generated.
3097 /// \param UpdateOp Code generator for complex expressions that cannot be
3098 /// expressed through atomicrmw instruction.
3099 /// \param IsXBinopExpr true if \a X is Left H.S. in Right H.S. part of the
3100 /// update expression, false otherwise.
3101 /// (e.g. true for X = X BinOp Expr)
3102 ///
3103 /// \return Insertion point after generated atomic update IR.
3105 InsertPointTy AllocaIP, AtomicOpValue &X,
3106 Value *Expr, AtomicOrdering AO,
3108 AtomicUpdateCallbackTy &UpdateOp,
3109 bool IsXBinopExpr);
3110
3111 /// Emit atomic update for constructs: --- Only Scalar data types
3112 /// V = X; X = X BinOp Expr ,
3113 /// X = X BinOp Expr; V = X,
3114 /// V = X; X = Expr BinOp X,
3115 /// X = Expr BinOp X; V = X,
3116 /// V = X; X = UpdateOp(X),
3117 /// X = UpdateOp(X); V = X,
3118 ///
3119 /// \param Loc The insert and source location description.
3120 /// \param AllocaIP The insertion point to be used for alloca instructions.
3121 /// \param X The target atomic pointer to be updated
3122 /// \param V Memory address where to store captured value
3123 /// \param Expr The value to update X with.
3124 /// \param AO Atomic ordering of the generated atomic instructions
3125 /// \param RMWOp The binary operation used for update. If
3126 /// operation is not supported by atomicRMW, or belong to
3127 /// {FADD, FSUB, BAD_BINOP}. Then a cmpExch based
3128 /// atomic will be generated.
3129 /// \param UpdateOp Code generator for complex expressions that cannot be
3130 /// expressed through atomicrmw instruction.
3131 /// \param UpdateExpr true if X is an in place update of the form
3132 /// X = X BinOp Expr or X = Expr BinOp X
3133 /// \param IsXBinopExpr true if X is Left H.S. in Right H.S. part of the
3134 /// update expression, false otherwise.
3135 /// (e.g. true for X = X BinOp Expr)
3136 /// \param IsPostfixUpdate true if original value of 'x' must be stored in
3137 /// 'v', not an updated one.
3138 ///
3139 /// \return Insertion point after generated atomic capture IR.
3142 AtomicOpValue &X, AtomicOpValue &V, Value *Expr,
3144 AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr,
3145 bool IsPostfixUpdate, bool IsXBinopExpr);
3146
3147 /// Emit atomic compare for constructs: --- Only scalar data types
3148 /// cond-expr-stmt:
3149 /// x = x ordop expr ? expr : x;
3150 /// x = expr ordop x ? expr : x;
3151 /// x = x == e ? d : x;
3152 /// x = e == x ? d : x; (this one is not in the spec)
3153 /// cond-update-stmt:
3154 /// if (x ordop expr) { x = expr; }
3155 /// if (expr ordop x) { x = expr; }
3156 /// if (x == e) { x = d; }
3157 /// if (e == x) { x = d; } (this one is not in the spec)
3158 /// conditional-update-capture-atomic:
3159 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
3160 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
3161 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3162 /// IsFailOnly=true)
3163 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
3164 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3165 /// IsFailOnly=true)
3166 ///
3167 /// \param Loc The insert and source location description.
3168 /// \param X The target atomic pointer to be updated.
3169 /// \param V Memory address where to store captured value (for
3170 /// compare capture only).
3171 /// \param R Memory address where to store comparison result
3172 /// (for compare capture with '==' only).
3173 /// \param E The expected value ('e') for forms that use an
3174 /// equality comparison or an expression ('expr') for
3175 /// forms that use 'ordop' (logically an atomic maximum or
3176 /// minimum).
3177 /// \param D The desired value for forms that use an equality
3178 /// comparison. If forms that use 'ordop', it should be
3179 /// \p nullptr.
3180 /// \param AO Atomic ordering of the generated atomic instructions.
3181 /// \param Op Atomic compare operation. It can only be ==, <, or >.
3182 /// \param IsXBinopExpr True if the conditional statement is in the form where
3183 /// x is on LHS. It only matters for < or >.
3184 /// \param IsPostfixUpdate True if original value of 'x' must be stored in
3185 /// 'v', not an updated one (for compare capture
3186 /// only).
3187 /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
3188 /// only when the comparison fails. This is only valid for
3189 /// the case the comparison is '=='.
3190 ///
3191 /// \return Insertion point after generated atomic capture IR.
3196 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
3199 AtomicOpValue &R, Value *E, Value *D,
3200 AtomicOrdering AO,
3202 bool IsXBinopExpr, bool IsPostfixUpdate,
3203 bool IsFailOnly, AtomicOrdering Failure);
3204
3205 /// Create the control flow structure of a canonical OpenMP loop.
3206 ///
3207 /// The emitted loop will be disconnected, i.e. no edge to the loop's
3208 /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
3209 /// IRBuilder location is not preserved.
3210 ///
3211 /// \param DL DebugLoc used for the instructions in the skeleton.
3212 /// \param TripCount Value to be used for the trip count.
3213 /// \param F Function in which to insert the BasicBlocks.
3214 /// \param PreInsertBefore Where to insert BBs that execute before the body,
3215 /// typically the body itself.
3216 /// \param PostInsertBefore Where to insert BBs that execute after the body.
3217 /// \param Name Base name used to derive BB
3218 /// and instruction names.
3219 ///
3220 /// \returns The CanonicalLoopInfo that represents the emitted loop.
3222 Function *F,
3223 BasicBlock *PreInsertBefore,
3224 BasicBlock *PostInsertBefore,
3225 const Twine &Name = {});
3226 /// OMP Offload Info Metadata name string
3227 const std::string ompOffloadInfoName = "omp_offload.info";
3228
3229 /// Loads all the offload entries information from the host IR
3230 /// metadata. This function is only meant to be used with device code
3231 /// generation.
3232 ///
3233 /// \param M Module to load Metadata info from. Module passed maybe
3234 /// loaded from bitcode file, i.e, different from OpenMPIRBuilder::M module.
3236
3237 /// Loads all the offload entries information from the host IR
3238 /// metadata read from the file passed in as the HostFilePath argument. This
3239 /// function is only meant to be used with device code generation.
3240 ///
3241 /// \param HostFilePath The path to the host IR file,
3242 /// used to load in offload metadata for the device, allowing host and device
3243 /// to maintain the same metadata mapping.
3244 void loadOffloadInfoMetadata(StringRef HostFilePath);
3245
3246 /// Gets (if variable with the given name already exist) or creates
3247 /// internal global variable with the specified Name. The created variable has
3248 /// linkage CommonLinkage by default and is initialized by null value.
3249 /// \param Ty Type of the global variable. If it is exist already the type
3250 /// must be the same.
3251 /// \param Name Name of the variable.
3253 unsigned AddressSpace = 0);
3254};
3255
3256/// Class to represented the control flow structure of an OpenMP canonical loop.
3257///
3258/// The control-flow structure is standardized for easy consumption by
3259/// directives associated with loops. For instance, the worksharing-loop
3260/// construct may change this control flow such that each loop iteration is
3261/// executed on only one thread. The constraints of a canonical loop in brief
3262/// are:
3263///
3264/// * The number of loop iterations must have been computed before entering the
3265/// loop.
3266///
3267/// * Has an (unsigned) logical induction variable that starts at zero and
3268/// increments by one.
3269///
3270/// * The loop's CFG itself has no side-effects. The OpenMP specification
3271/// itself allows side-effects, but the order in which they happen, including
3272/// how often or whether at all, is unspecified. We expect that the frontend
3273/// will emit those side-effect instructions somewhere (e.g. before the loop)
3274/// such that the CanonicalLoopInfo itself can be side-effect free.
3275///
3276/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
3277/// execution of a loop body that satifies these constraints. It does NOT
3278/// represent arbitrary SESE regions that happen to contain a loop. Do not use
3279/// CanonicalLoopInfo for such purposes.
3280///
3281/// The control flow can be described as follows:
3282///
3283/// Preheader
3284/// |
3285/// /-> Header
3286/// | |
3287/// | Cond---\
3288/// | | |
3289/// | Body |
3290/// | | | |
3291/// | <...> |
3292/// | | | |
3293/// \--Latch |
3294/// |
3295/// Exit
3296/// |
3297/// After
3298///
3299/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3300/// including) and end at AfterIP (at the After's first instruction, excluding).
3301/// That is, instructions in the Preheader and After blocks (except the
3302/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3303/// side-effects. Typically, the Preheader is used to compute the loop's trip
3304/// count. The instructions from BodyIP (at the Body block's first instruction,
3305/// excluding) until the Latch are also considered outside CanonicalLoopInfo's
3306/// control and thus can have side-effects. The body block is the single entry
3307/// point into the loop body, which may contain arbitrary control flow as long
3308/// as all control paths eventually branch to the Latch block.
3309///
3310/// TODO: Consider adding another standardized BasicBlock between Body CFG and
3311/// Latch to guarantee that there is only a single edge to the latch. It would
3312/// make loop transformations easier to not needing to consider multiple
3313/// predecessors of the latch (See redirectAllPredecessorsTo) and would give us
3314/// an equivalant to PreheaderIP, AfterIP and BodyIP for inserting code that
3315/// executes after each body iteration.
3316///
3317/// There must be no loop-carried dependencies through llvm::Values. This is
3318/// equivalant to that the Latch has no PHINode and the Header's only PHINode is
3319/// for the induction variable.
3320///
3321/// All code in Header, Cond, Latch and Exit (plus the terminator of the
3322/// Preheader) are CanonicalLoopInfo's responsibility and their build-up checked
3323/// by assertOK(). They are expected to not be modified unless explicitly
3324/// modifying the CanonicalLoopInfo through a methods that applies a OpenMP
3325/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3326/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3327/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3328/// anymore as its underlying control flow may not exist anymore.
3329/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3330/// may also return a new CanonicalLoopInfo that can be passed to other
3331/// loop-associated construct implementing methods. These loop-transforming
3332/// methods may either create a new CanonicalLoopInfo usually using
3333/// createLoopSkeleton and invalidate the input CanonicalLoopInfo, or reuse and
3334/// modify one of the input CanonicalLoopInfo and return it as representing the
3335/// modified loop. What is done is an implementation detail of
3336/// transformation-implementing method and callers should always assume that the
3337/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
3338/// Returned CanonicalLoopInfo have the same structure and guarantees as the one
3339/// created by createCanonicalLoop, such that transforming methods do not have
3340/// to special case where the CanonicalLoopInfo originated from.
3341///
3342/// Generally, methods consuming CanonicalLoopInfo do not need an
3343/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3344/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3345/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3346/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3347/// any InsertPoint in the Preheader, After or Block can still be used after
3348/// calling such a method.
3349///
3350/// TODO: Provide mechanisms for exception handling and cancellation points.
3351///
3352/// Defined outside OpenMPIRBuilder because nested classes cannot be
3353/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
3355 friend class OpenMPIRBuilder;
3356
3357private:
3358 BasicBlock *Header = nullptr;
3359 BasicBlock *Cond = nullptr;
3360 BasicBlock *Latch = nullptr;
3361 BasicBlock *Exit = nullptr;
3362
3363 /// Add the control blocks of this loop to \p BBs.
3364 ///
3365 /// This does not include any block from the body, including the one returned
3366 /// by getBody().
3367 ///
3368 /// FIXME: This currently includes the Preheader and After blocks even though
3369 /// their content is (mostly) not under CanonicalLoopInfo's control.
3370 /// Re-evaluated whether this makes sense.
3371 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3372
3373 /// Sets the number of loop iterations to the given value. This value must be
3374 /// valid in the condition block (i.e., defined in the preheader) and is
3375 /// interpreted as an unsigned integer.
3376 void setTripCount(Value *TripCount);
3377
3378 /// Replace all uses of the canonical induction variable in the loop body with
3379 /// a new one.
3380 ///
3381 /// The intended use case is to update the induction variable for an updated
3382 /// iteration space such that it can stay normalized in the 0...tripcount-1
3383 /// range.
3384 ///
3385 /// The \p Updater is called with the (presumable updated) current normalized
3386 /// induction variable and is expected to return the value that uses of the
3387 /// pre-updated induction values should use instead, typically dependent on
3388 /// the new induction variable. This is a lambda (instead of e.g. just passing
3389 /// the new value) to be able to distinguish the uses of the pre-updated
3390 /// induction variable and uses of the induction varible to compute the
3391 /// updated induction variable value.
3392 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
3393
3394public:
3395 /// Returns whether this object currently represents the IR of a loop. If
3396 /// returning false, it may have been consumed by a loop transformation or not
3397 /// been intialized. Do not use in this case;
3398 bool isValid() const { return Header; }
3399
3400 /// The preheader ensures that there is only a single edge entering the loop.
3401 /// Code that must be execute before any loop iteration can be emitted here,
3402 /// such as computing the loop trip count and begin lifetime markers. Code in
3403 /// the preheader is not considered part of the canonical loop.
3404 BasicBlock *getPreheader() const;
3405
3406 /// The header is the entry for each iteration. In the canonical control flow,
3407 /// it only contains the PHINode for the induction variable.
3409 assert(isValid() && "Requires a valid canonical loop");
3410 return Header;
3411 }
3412
3413 /// The condition block computes whether there is another loop iteration. If
3414 /// yes, branches to the body; otherwise to the exit block.
3416 assert(isValid() && "Requires a valid canonical loop");
3417 return Cond;
3418 }
3419
3420 /// The body block is the single entry for a loop iteration and not controlled
3421 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
3422 /// eventually branch to the \p Latch block.
3424 assert(isValid() && "Requires a valid canonical loop");
3425 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
3426 }
3427
3428 /// Reaching the latch indicates the end of the loop body code. In the
3429 /// canonical control flow, it only contains the increment of the induction
3430 /// variable.
3432 assert(isValid() && "Requires a valid canonical loop");
3433 return Latch;
3434 }
3435
3436 /// Reaching the exit indicates no more iterations are being executed.
3438 assert(isValid() && "Requires a valid canonical loop");
3439 return Exit;
3440 }
3441
3442 /// The after block is intended for clean-up code such as lifetime end
3443 /// markers. It is separate from the exit block to ensure, analogous to the
3444 /// preheader, it having just a single entry edge and being free from PHI
3445 /// nodes should there be multiple loop exits (such as from break
3446 /// statements/cancellations).
3448 assert(isValid() && "Requires a valid canonical loop");
3449 return Exit->getSingleSuccessor();
3450 }
3451
3452 /// Returns the llvm::Value containing the number of loop iterations. It must
3453 /// be valid in the preheader and always interpreted as an unsigned integer of
3454 /// any bit-width.
3456 assert(isValid() && "Requires a valid canonical loop");
3457 Instruction *CmpI = &Cond->front();
3458 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
3459 return CmpI->getOperand(1);
3460 }
3461
3462 /// Returns the instruction representing the current logical induction
3463 /// variable. Always unsigned, always starting at 0 with an increment of one.
3465 assert(isValid() && "Requires a valid canonical loop");
3466 Instruction *IndVarPHI = &Header->front();
3467 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
3468 return IndVarPHI;
3469 }
3470
3471 /// Return the type of the induction variable (and the trip count).
3473 assert(isValid() && "Requires a valid canonical loop");
3474 return getIndVar()->getType();
3475 }
3476
3477 /// Return the insertion point for user code before the loop.
3479 assert(isValid() && "Requires a valid canonical loop");
3480 BasicBlock *Preheader = getPreheader();
3481 return {Preheader, std::prev(Preheader->end())};
3482 };
3483
3484 /// Return the insertion point for user code in the body.
3486 assert(isValid() && "Requires a valid canonical loop");
3487 BasicBlock *Body = getBody();
3488 return {Body, Body->begin()};
3489 };
3490
3491 /// Return the insertion point for user code after the loop.
3493 assert(isValid() && "Requires a valid canonical loop");
3495 return {After, After->begin()};
3496 };
3497
3499 assert(isValid() && "Requires a valid canonical loop");
3500 return Header->getParent();
3501 }
3502
3503 /// Consistency self-check.
3504 void assertOK() const;
3505
3506 /// Invalidate this loop. That is, the underlying IR does not fulfill the
3507 /// requirements of an OpenMP canonical loop anymore.
3508 void invalidate();
3509};
3510
3511} // end namespace llvm
3512
3513#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
arc branch finalize
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file defines the BumpPtrAllocator interface.
BlockVerifier::State From
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
uint64_t Addr
std::string Name
uint64_t Size
static GCMetadataPrinterRegistry::Add< ErlangGCPrinter > X("erlang", "erlang-compatible garbage collector")
Hexagon Hardware Loops
#define F(x, y, z)
Definition: MD5.cpp:55
#define G(x, y, z)
Definition: MD5.cpp:56
Module.h This file contains the declarations for the Module class.
This file defines constans and helpers used when dealing with OpenMP.
Provides definitions for Target specific Grid Values.
const SmallVectorImpl< MachineOperand > & Cond
Basic Register Allocator
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
Value * RHS
Value * LHS
an instruction to allocate memory on the stack
Definition: Instructions.h:61
This class represents an incoming formal argument to a Function.
Definition: Argument.h:31
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
BinOp
This enumeration lists the possible modifications atomicrmw can make.
Definition: Instructions.h:708
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
iterator end()
Definition: BasicBlock.h:461
iterator begin()
Instruction iterator methods.
Definition: BasicBlock.h:448
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:219
Class to represented the control flow structure of an OpenMP canonical loop.
Value * getTripCount() const
Returns the llvm::Value containing the number of loop iterations.
BasicBlock * getHeader() const
The header is the entry for each iteration.
void assertOK() const
Consistency self-check.
Type * getIndVarType() const
Return the type of the induction variable (and the trip count).
BasicBlock * getBody() const
The body block is the single entry for a loop iteration and not controlled by CanonicalLoopInfo.
bool isValid() const
Returns whether this object currently represents the IR of a loop.
OpenMPIRBuilder::InsertPointTy getAfterIP() const
Return the insertion point for user code after the loop.
OpenMPIRBuilder::InsertPointTy getBodyIP() const
Return the insertion point for user code in the body.
BasicBlock * getAfter() const
The after block is intended for clean-up code such as lifetime end markers.
Function * getFunction() const
void invalidate()
Invalidate this loop.
BasicBlock * getLatch() const
Reaching the latch indicates the end of the loop body code.
OpenMPIRBuilder::InsertPointTy getPreheaderIP() const
Return the insertion point for user code before the loop.
BasicBlock * getCond() const
The condition block computes whether there is another loop iteration.
BasicBlock * getExit() const
Reaching the exit indicates no more iterations are being executed.
BasicBlock * getPreheader() const
The preheader ensures that there is only a single edge entering the loop.
Instruction * getIndVar() const
Returns the instruction representing the current logical induction variable.
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
This is an important base class in LLVM.
Definition: Constant.h:42
This class represents an Operation in the Expression.
A debug info location.
Definition: DebugLoc.h:33
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
Definition: DerivedTypes.h:168
LinkageTypes
An enumeration for the kinds of linkage for global values.
Definition: GlobalValue.h:51
InsertPoint - A saved insertion point.
Definition: IRBuilder.h:254
BasicBlock * getBlock() const
Definition: IRBuilder.h:269
Common base class shared among various IRBuilders.
Definition: IRBuilder.h:91
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition: IRBuilder.h:217
InsertPoint saveIP() const
Returns the current insert point.
Definition: IRBuilder.h:274
void restoreIP(InsertPoint IP)
Sets the current insert point to a previously-saved location.
Definition: IRBuilder.h:286
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2674
Class to represent integer types.
Definition: DerivedTypes.h:40
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
This class implements a map that also provides access to all stored values in a deterministic order.
Definition: MapVector.h:36
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
OffloadEntryInfoDeviceGlobalVar(unsigned Order, OMPTargetGlobalVarEntryKind Flags)
Definition: OMPIRBuilder.h:397
OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage, const std::string &VarName)
Definition: OMPIRBuilder.h:400
static bool classof(const OffloadEntryInfo *Info)
Definition: OMPIRBuilder.h:415
static bool classof(const OffloadEntryInfo *Info)
Definition: OMPIRBuilder.h:322
OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Definition: OMPIRBuilder.h:309
@ OffloadingEntryInfoTargetRegion
Entry is a target region.
Definition: OMPIRBuilder.h:243
@ OffloadingEntryInfoDeviceGlobalVar
Entry is a declare target variable.
Definition: OMPIRBuilder.h:245
OffloadingEntryInfoKinds getKind() const
Definition: OMPIRBuilder.h:261
OffloadEntryInfo(OffloadingEntryInfoKinds Kind)
Definition: OMPIRBuilder.h:252
static bool classof(const OffloadEntryInfo *Info)
Definition: OMPIRBuilder.h:269
OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order, uint32_t Flags)
Definition: OMPIRBuilder.h:253
Class that manages information about offload code regions and data.
Definition: OMPIRBuilder.h:231
function_ref< void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)> OffloadDeviceGlobalVarEntryInfoActTy
Applies action Action on all registered entries.
Definition: OMPIRBuilder.h:437
OMPTargetDeviceClauseKind
Kind of device clause for declare target variables and functions NOTE: Currently not used as a part o...
Definition: OMPIRBuilder.h:376
@ OMPTargetDeviceClauseNoHost
The target is marked for non-host devices.
Definition: OMPIRBuilder.h:380
@ OMPTargetDeviceClauseAny
The target is marked for all devices.
Definition: OMPIRBuilder.h:378
@ OMPTargetDeviceClauseNone
The target is marked as having no clause.
Definition: OMPIRBuilder.h:384
@ OMPTargetDeviceClauseHost
The target is marked for host devices.
Definition: OMPIRBuilder.h:382
void registerDeviceGlobalVarEntryInfo(StringRef VarName, Constant *Addr, int64_t VarSize, OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage)
Register device global variable entry.
void initializeDeviceGlobalVarEntryInfo(StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order)
Initialize device global variable entry.
void actOnDeviceGlobalVarEntriesInfo(const OffloadDeviceGlobalVarEntryInfoActTy &Action)
OMPTargetRegionEntryKind
Kind of the target registry entry.
Definition: OMPIRBuilder.h:296
@ OMPTargetRegionEntryTargetRegion
Mark the entry as target region.
Definition: OMPIRBuilder.h:298
OffloadEntriesInfoManager(OpenMPIRBuilder *builder)
Definition: OMPIRBuilder.h:289
void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, const TargetRegionEntryInfo &EntryInfo)
bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, bool IgnoreAddressId=false) const
Return true if a target region entry with the provided information exists.
void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo, Constant *Addr, Constant *ID, OMPTargetRegionEntryKind Flags)
Register target region entry.
void actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action)
unsigned size() const
Return number of entries defined so far.
Definition: OMPIRBuilder.h:287
void initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo, unsigned Order)
Initialize target region entry.
OMPTargetGlobalVarEntryKind
Kind of the global variable entry..
Definition: OMPIRBuilder.h:356
@ OMPTargetGlobalVarEntryEnter
Mark the entry as a declare target enter.
Definition: OMPIRBuilder.h:362
@ OMPTargetGlobalVarEntryNone
Mark the entry as having no declare target entry kind.
Definition: OMPIRBuilder.h:364
@ OMPTargetGlobalRegisterRequires
Mark the entry as a register requires global.
Definition: OMPIRBuilder.h:368
@ OMPTargetGlobalVarEntryIndirect
Mark the entry as a declare target indirect global.
Definition: OMPIRBuilder.h:366
@ OMPTargetGlobalVarEntryLink
Mark the entry as a to declare target link.
Definition: OMPIRBuilder.h:360
@ OMPTargetGlobalVarEntryTo
Mark the entry as a to declare target.
Definition: OMPIRBuilder.h:358
function_ref< void(const TargetRegionEntryInfo &EntryInfo, const OffloadEntryInfoTargetRegion &)> OffloadTargetRegionEntryInfoActTy
brief Applies action Action on all registered entries.
Definition: OMPIRBuilder.h:347
bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const
Checks if the variable with the given name has been registered already.
Definition: OMPIRBuilder.h:432
bool empty() const
Return true if a there are no entries defined.
Captures attributes that affect generating LLVM-IR using the OpenMPIRBuilder and related classes.
Definition: OMPIRBuilder.h:86
void setIsGPU(bool Value)
Definition: OMPIRBuilder.h:183
std::optional< bool > IsTargetDevice
Flag to define whether to generate code for the role of the OpenMP host (if set to false) or device (...
Definition: OMPIRBuilder.h:92
std::optional< bool > IsGPU
Flag for specifying if the compilation is done for an accelerator.
Definition: OMPIRBuilder.h:102
void setGridValue(omp::GV G)
Definition: OMPIRBuilder.h:188
std::optional< StringRef > FirstSeparator
First separator used between the initial two parts of a name.
Definition: OMPIRBuilder.h:111
StringRef separator() const
Definition: OMPIRBuilder.h:174
int64_t getRequiresFlags() const
Returns requires directive clauses as flags compatible with those expected by libomptarget.
void setFirstSeparator(StringRef FS)
Definition: OMPIRBuilder.h:186
StringRef firstSeparator() const
Definition: OMPIRBuilder.h:164
std::optional< bool > OpenMPOffloadMandatory
Flag for specifying if offloading is mandatory.
Definition: OMPIRBuilder.h:108
std::optional< bool > EmitLLVMUsedMetaInfo
Flag for specifying if LLVMUsed information should be emitted.
Definition: OMPIRBuilder.h:105
omp::GV getGridValue() const
Definition: OMPIRBuilder.h:147
SmallVector< Triple > TargetTriples
When compilation is being done for the OpenMP host (i.e.
Definition: OMPIRBuilder.h:120
void setHasRequiresReverseOffload(bool Value)
bool hasRequiresUnifiedSharedMemory() const
void setHasRequiresUnifiedSharedMemory(bool Value)
std::optional< StringRef > Separator
Separator used between all of the rest consecutive parts of s name.
Definition: OMPIRBuilder.h:113
bool hasRequiresDynamicAllocators() const
bool openMPOffloadMandatory() const
Definition: OMPIRBuilder.h:141
void setHasRequiresUnifiedAddress(bool Value)
void setOpenMPOffloadMandatory(bool Value)
Definition: OMPIRBuilder.h:185
void setIsTargetDevice(bool Value)
Definition: OMPIRBuilder.h:182
void setSeparator(StringRef S)
Definition: OMPIRBuilder.h:187
void setHasRequiresDynamicAllocators(bool Value)
void setEmitLLVMUsed(bool Value=true)
Definition: OMPIRBuilder.h:184
std::optional< omp::GV > GridValue
Definition: OMPIRBuilder.h:116
bool hasRequiresReverseOffload() const
bool hasRequiresUnifiedAddress() const
Struct that keeps the information that should be kept throughout a 'target data' region.
TargetDataInfo(bool RequiresDevicePointerInfo, bool SeparateBeginEndCalls)
SmallMapVector< const Value *, std::pair< Value *, Value * >, 4 > DevicePtrInfoMap
void clearArrayInfo()
Clear information about the data arrays.
unsigned NumberOfPtrs
The total number of pointers passed to the runtime library.
bool isValid()
Return true if the current target data information has valid arrays.
bool HasMapper
Indicate whether any user-defined mapper exists.
An interface to create LLVM-IR for OpenMP directives.
Definition: OMPIRBuilder.h:473
Constant * getOrCreateIdent(Constant *SrcLocStr, uint32_t SrcLocStrSize, omp::IdentFlag Flags=omp::IdentFlag(0), unsigned Reserve2Flags=0)
Return an ident_t* encoding the source location SrcLocStr and Flags.
FunctionCallee getOrCreateRuntimeFunction(Module &M, omp::RuntimeFunction FnID)
Return the function declaration for the runtime function with FnID.
std::function< void(InsertPointTy CodeGenIP)> FinalizeCallbackTy
Callback type for variable finalization (think destructors).
Definition: OMPIRBuilder.h:519
InsertPointTy createTargetInit(const LocationDescription &Loc, bool IsSPMD, int32_t MinThreadsVal=0, int32_t MaxThreadsVal=0, int32_t MinTeamsVal=0, int32_t MaxTeamsVal=0)
The omp target interface.
void emitIfClause(Value *Cond, BodyGenCallbackTy ThenGen, BodyGenCallbackTy ElseGen, InsertPointTy AllocaIP={})
Emits code for OpenMP 'if' clause using specified BodyGenCallbackTy Here is the logic: if (Cond) { Th...
ReductionGenCBKind
Enum class for the RedctionGen CallBack type to be used.
CanonicalLoopInfo * collapseLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, InsertPointTy ComputeIP)
Collapse a loop nest into a single loop.
function_ref< void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> BodyGenCallbackTy
Callback type for body (=inner region) code generation.
Definition: OMPIRBuilder.h:571
void createTaskyield(const LocationDescription &Loc)
Generator for '#omp taskyield'.
void emitBranch(BasicBlock *Target)
InsertPointTy createAtomicWrite(const LocationDescription &Loc, AtomicOpValue &X, Value *Expr, AtomicOrdering AO)
Emit atomic write for : X = Expr — Only Scalar data types.
static void writeThreadBoundsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
EvalKind
Enum class for reduction evaluation types scalar, complex and aggregate.
InsertPointTy createCritical(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst)
Generator for '#omp critical'.
static TargetRegionEntryInfo getTargetEntryUniqueInfo(FileIdentifierInfoCallbackTy CallBack, StringRef ParentName="")
Creates a unique info for a target entry when provided a filename and line number from.
void emitTaskwaitImpl(const LocationDescription &Loc)
Generate a taskwait runtime call.
Constant * registerTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, Function *OutlinedFunction, StringRef EntryFnName, StringRef EntryFnIDName)
Registers the given function and sets up the attribtues of the function Returns the FunctionID.
InsertPointTy createAtomicCapture(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, AtomicOpValue &V, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr)
Emit atomic update for constructs: — Only Scalar data types V = X; X = X BinOp Expr ,...
void initialize()
Initialize the internal state, this will put structures types and potentially other helpers into the ...
void createTargetDeinit(const LocationDescription &Loc, int32_t TeamsReductionDataSize=0, int32_t TeamsReductionBufferLength=1024)
Create a runtime call for kmpc_target_deinit.
std::function< void(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)> StorableBodyGenCallbackTy
Definition: OMPIRBuilder.h:578
CanonicalLoopInfo * createCanonicalLoop(const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB, Value *TripCount, const Twine &Name="loop")
Generator for the control flow structure of an OpenMP canonical loop.
function_ref< InsertPointTy(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original, Value &Inner, Value *&ReplVal)> PrivatizeCallbackTy
Callback type for variable privatization (think copy & default constructor).
Definition: OMPIRBuilder.h:611
void loadOffloadInfoMetadata(Module &M)
Loads all the offload entries information from the host IR metadata.
InsertPointTy createAtomicUpdate(const LocationDescription &Loc, InsertPointTy AllocaIP, AtomicOpValue &X, Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr)
Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X For complex Operations: X = ...
void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully unroll a loop.
void emitFlush(const LocationDescription &Loc)
Generate a flush runtime call.
InsertPointTy createBarrier(const LocationDescription &Loc, omp::Directive Kind, bool ForceSimpleCall=false, bool CheckCancelFlag=true)
Emitter methods for OpenMP directives.
InsertPointTy emitKernelLaunch(const LocationDescription &Loc, Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP)
Generate a target region entry call and host fallback call.
InsertPointTy createCancel(const LocationDescription &Loc, Value *IfCondition, omp::Directive CanceledDirective)
Generator for '#omp cancel'.
static std::pair< int32_t, int32_t > readThreadBoundsForKernel(const Triple &T, Function &Kernel)
}
OpenMPIRBuilderConfig Config
The OpenMPIRBuilder Configuration.
CallInst * createOMPInteropDestroy(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_destroy.
InsertPointTy createAtomicRead(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOrdering AO)
Emit atomic Read for : V = X — Only Scalar data types.
std::function< void(EmitMetadataErrorKind, TargetRegionEntryInfo)> EmitMetadataErrorReportFunctionTy
Callback function type.
void setConfig(OpenMPIRBuilderConfig C)
Definition: OMPIRBuilder.h:488
InsertPointTy createOrderedThreadsSimd(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsThreads)
Generator for '#omp ordered [threads | simd]'.
OpenMPIRBuilder::InsertPointTy createTargetData(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, omp::RuntimeFunction *MapperFunc=nullptr, function_ref< InsertPointTy(InsertPointTy CodeGenIP, BodyGenTy BodyGenType)> BodyGenCB=nullptr, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr, Value *SrcLocInfo=nullptr)
Generator for '#omp target data'.
std::forward_list< CanonicalLoopInfo > LoopInfos
Collection of owned canonical loop objects that eventually need to be free'd.
void createTaskwait(const LocationDescription &Loc)
Generator for '#omp taskwait'.
CanonicalLoopInfo * createLoopSkeleton(DebugLoc DL, Value *TripCount, Function *F, BasicBlock *PreInsertBefore, BasicBlock *PostInsertBefore, const Twine &Name={})
Create the control flow structure of a canonical OpenMP loop.
std::string createPlatformSpecificName(ArrayRef< StringRef > Parts) const
Get the create a name using the platform specific separators.
FunctionCallee createDispatchNextFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_next_* runtime function for the specified size IVSize and sign IVSigned.
static void getKernelArgsVector(TargetKernelArgs &KernelArgs, IRBuilderBase &Builder, SmallVector< Value * > &ArgsVector)
Create the kernel args vector used by emitTargetKernel.
void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop)
Fully or partially unroll a loop.
omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position)
Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on the position given.
InsertPointTy createReductionsGPU(const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, ArrayRef< ReductionInfo > ReductionInfos, bool IsNoWait=false, bool IsTeamsReduction=false, bool HasDistribute=false, ReductionGenCBKind ReductionGenCBKind=ReductionGenCBKind::MLIR, std::optional< omp::GV > GridValue={}, unsigned ReductionBufNum=1024, Value *SrcLocInfo=nullptr)
Design of OpenMP reductions on the GPU.
void addAttributes(omp::RuntimeFunction FnID, Function &Fn)
Add attributes known for FnID to Fn.
Module & M
The underlying LLVM-IR module.
StringMap< Constant * > SrcLocStrMap
Map to remember source location strings.
void createMapperAllocas(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumOperands, struct MapperAllocas &MapperAllocas)
Create the allocas instruction used in call to mapper functions.
Constant * getOrCreateSrcLocStr(StringRef LocStr, uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the source location LocStr.
void addOutlineInfo(OutlineInfo &&OI)
Add a new region that will be outlined later.
FunctionCallee createDispatchFiniFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_fini_* runtime function for the specified size IVSize and sign IVSigned.
void unrollLoopPartial(DebugLoc DL, CanonicalLoopInfo *Loop, int32_t Factor, CanonicalLoopInfo **UnrolledCLI)
Partially unroll a loop.
InsertPointTy createSections(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< StorableBodyGenCallbackTy > SectionCBs, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, bool IsCancellable, bool IsNowait)
Generator for '#omp sections'.
InsertPointTy createTask(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, bool Tied=true, Value *Final=nullptr, Value *IfCondition=nullptr, SmallVector< DependData > Dependencies={})
Generator for #omp task
void emitTaskyieldImpl(const LocationDescription &Loc)
Generate a taskyield runtime call.
void emitMapperCall(const LocationDescription &Loc, Function *MapperFunc, Value *SrcLocInfo, Value *MaptypesArg, Value *MapnamesArg, struct MapperAllocas &MapperAllocas, int64_t DeviceID, unsigned NumOperands)
Create the call for the target mapper function.
InsertPointTy createAtomicCompare(const LocationDescription &Loc, AtomicOpValue &X, AtomicOpValue &V, AtomicOpValue &R, Value *E, Value *D, AtomicOrdering AO, omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly)
Emit atomic compare for constructs: — Only scalar data types cond-expr-stmt: x = x ordop expr ?...
InsertPointTy createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP, unsigned NumLoops, ArrayRef< llvm::Value * > StoreValues, const Twine &Name, bool IsDependSource)
Generator for '#omp ordered depend (source | sink)'.
InsertPointTy createCopyinClauseBlocks(InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr, llvm::IntegerType *IntPtrTy, bool BranchtoEnd=true)
Generate conditional branch and relevant BasicBlocks through which private threads copy the 'copyin' ...
void emitOffloadingArrays(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info, bool IsNonContiguous=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Emit the arrays used to pass the captures and map information to the offloading runtime library.
SmallVector< FinalizationInfo, 8 > FinalizationStack
The finalization stack made up of finalize callbacks currently in-flight, wrapped into FinalizationIn...
std::vector< CanonicalLoopInfo * > tileLoops(DebugLoc DL, ArrayRef< CanonicalLoopInfo * > Loops, ArrayRef< Value * > TileSizes)
Tile a loop nest.
CallInst * createOMPInteropInit(const LocationDescription &Loc, Value *InteropVar, omp::OMPInteropType InteropType, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_init.
SmallVector< OutlineInfo, 16 > OutlineInfos
Collection of regions that need to be outlined during finalization.
Function * getOrCreateRuntimeFunctionPtr(omp::RuntimeFunction FnID)
const Triple T
The target triple of the underlying module.
DenseMap< std::pair< Constant *, uint64_t >, Constant * > IdentMap
Map to remember existing ident_t*.
CallInst * createOMPFree(const LocationDescription &Loc, Value *Addr, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_free.
FunctionCallee createForStaticInitFunction(unsigned IVSize, bool IVSigned, bool IsGPUDistribute)
Returns __kmpc_for_static_init_* runtime function for the specified size IVSize and sign IVSigned.
CallInst * createOMPAlloc(const LocationDescription &Loc, Value *Size, Value *Allocator, std::string Name="")
Create a runtime call for kmpc_Alloc.
void emitNonContiguousDescriptor(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, TargetDataInfo &Info)
Emit an array of struct descriptors to be assigned to the offload args.
InsertPointTy createSection(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp section'.
InsertPointTy applyWorkshareLoop(DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP, bool NeedsBarrier, llvm::omp::ScheduleKind SchedKind=llvm::omp::OMP_SCHEDULE_Default, Value *ChunkSize=nullptr, bool HasSimdModifier=false, bool HasMonotonicModifier=false, bool HasNonmonotonicModifier=false, bool HasOrderedClause=false, omp::WorksharingLoopType LoopType=omp::WorksharingLoopType::ForStaticLoop)
Modifies the canonical loop to be a workshare loop.
std::function< InsertPointTy(InsertPointTy CodeGenIP, unsigned Index, Value **LHS, Value **RHS, Function *CurFn)> ReductionGenClangCBTy
ReductionGen CallBack for Clang.
void emitBlock(BasicBlock *BB, Function *CurFn, bool IsFinished=false)
Value * getOrCreateThreadID(Value *Ident)
Return the current thread ID.
void emitOffloadingArraysAndArgs(InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous=false, bool ForEndCall=false, function_ref< void(unsigned int, Value *)> DeviceAddrCB=nullptr, function_ref< Value *(unsigned int)> CustomMapperCB=nullptr)
Allocates memory for and populates the arrays required for offloading (offload_{baseptrs|ptrs|mappers...
InsertPointTy createMaster(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB)
Generator for '#omp master'.
void pushFinalizationCB(const FinalizationInfo &FI)
Push a finalization callback on the finalization stack.
Definition: OMPIRBuilder.h:537
InsertPointTy getInsertionPoint()
}
IRBuilder ::InsertPoint createParallel(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB, FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads, omp::ProcBindKind ProcBind, bool IsCancellable)
Generator for '#omp parallel'.
StringMap< GlobalVariable *, BumpPtrAllocator > InternalVars
An ordered map of auto-generated variables to their unique names.
GlobalVariable * getOrCreateInternalVariable(Type *Ty, const StringRef &Name, unsigned AddressSpace=0)
Gets (if variable with the given name already exist) or creates internal global variable with the spe...
FunctionCallee createDispatchInitFunction(unsigned IVSize, bool IVSigned)
Returns __kmpc_dispatch_init_* runtime function for the specified size IVSize and sign IVSigned.
InsertPointTy createSingle(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool IsNowait, ArrayRef< llvm::Value * > CPVars={}, ArrayRef< llvm::Function * > CPFuncs={})
Generator for '#omp single'.
CallInst * createOMPInteropUse(const LocationDescription &Loc, Value *InteropVar, Value *Device, Value *NumDependences, Value *DependenceAddress, bool HaveNowaitClause)
Create a runtime call for __tgt_interop_use.
IRBuilder<>::InsertPoint InsertPointTy
Type used throughout for insertion points.
Definition: OMPIRBuilder.h:499
GlobalVariable * createOffloadMapnames(SmallVectorImpl< llvm::Constant * > &Names, std::string VarName)
Create the global variable holding the offload names information.
static void writeTeamsForKernel(const Triple &T, Function &Kernel, int32_t LB, int32_t UB)
std::function< Function *(StringRef FunctionName)> FunctionGenCallback
Functions used to generate a function with the given name.
std::function< InsertPointTy(InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)> ReductionGenCBTy
ReductionGen CallBack for MLIR.
void setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags, omp::OpenMPOffloadMappingFlags MemberOfFlag)
Given an initial flag set, this function modifies it to contain the passed in MemberOfFlag generated ...
void emitCancelationCheckImpl(Value *CancelFlag, omp::Directive CanceledDirective, FinalizeCallbackTy ExitCB={})
Generate control flow and cleanup for cancellation.
std::function< InsertPointTy(InsertPointTy, Type *, Value *, Value *)> ReductionGenAtomicCBTy
Functions used to generate atomic reductions.
Constant * getOrCreateDefaultSrcLocStr(uint32_t &SrcLocStrSize)
Return the (LLVM-IR) string describing the default source location.
InsertPointTy createMasked(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, Value *Filter)
Generator for '#omp masked'.
void createOffloadEntry(Constant *ID, Constant *Addr, uint64_t Size, int32_t Flags, GlobalValue::LinkageTypes, StringRef Name="")
Creates offloading entry for the provided entry ID ID, address Addr, size Size, and flags Flags.
static unsigned getOpenMPDefaultSimdAlign(const Triple &TargetTriple, const StringMap< bool > &Features)
Get the default alignment value for given target.
unsigned getFlagMemberOffset()
Get the offset of the OMP_MAP_MEMBER_OF field.
InsertPointTy createTaskgroup(const LocationDescription &Loc, InsertPointTy AllocaIP, BodyGenCallbackTy BodyGenCB)
Generator for the taskgroup construct.
void createOffloadEntriesAndInfoMetadata(EmitMetadataErrorReportFunctionTy &ErrorReportFunction)
void applySimd(CanonicalLoopInfo *Loop, MapVector< Value *, Value * > AlignedVars, Value *IfCond, omp::OrderKind Order, ConstantInt *Simdlen, ConstantInt *Safelen)
Add metadata to simd-ize a loop.
void emitTargetRegionFunction(TargetRegionEntryInfo &EntryInfo, FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry, Function *&OutlinedFn, Constant *&OutlinedFnID)
Create a unique name for the entry function using the source location information of the current targ...
bool isLastFinalizationInfoCancellable(omp::Directive DK)
Return true if the last entry in the finalization stack is of kind DK and cancellable.
InsertPointTy emitTargetKernel(const LocationDescription &Loc, InsertPointTy AllocaIP, Value *&Return, Value *Ident, Value *DeviceID, Value *NumTeams, Value *NumThreads, Value *HostPtr, ArrayRef< Value * > KernelArgs)
Generate a target region entry call.
GlobalVariable * createOffloadMaptypes(SmallVectorImpl< uint64_t > &Mappings, std::string VarName)
Create the global variable holding the offload mappings information.
CallInst * createCachedThreadPrivate(const LocationDescription &Loc, llvm::Value *Pointer, llvm::ConstantInt *Size, const llvm::Twine &Name=Twine(""))
Create a runtime call for kmpc_threadprivate_cached.
IRBuilder Builder
The LLVM-IR Builder used to create IR.
GlobalValue * createGlobalFlag(unsigned Value, StringRef Name)
Create a hidden global flag Name in the module with initial value Value.
void emitOffloadingArraysArgument(IRBuilderBase &Builder, OpenMPIRBuilder::TargetDataRTArgs &RTArgs, OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall=false)
Emit the arguments to be passed to the runtime library based on the arrays of base pointers,...
Value * getSizeInBytes(Value *BasePtr)
Computes the size of type in bytes.
OpenMPIRBuilder(Module &M)
Create a new OpenMPIRBuilder operating on the given module M.
Definition: OMPIRBuilder.h:477
FunctionCallee createDispatchDeinitFunction()
Returns __kmpc_dispatch_deinit runtime function.
void registerTargetGlobalVariable(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage, Type *LlvmPtrTy, Constant *Addr)
Registers a target variable for device or host.
InsertPointTy createTeams(const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB, Value *NumTeamsLower=nullptr, Value *NumTeamsUpper=nullptr, Value *ThreadLimit=nullptr, Value *IfExpr=nullptr)
Generator for #omp teams
BodyGenTy
Type of BodyGen to use for region codegen.
InsertPointTy createTarget(const LocationDescription &Loc, bool IsOffloadEntry, OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, ArrayRef< int32_t > NumTeams, int32_t NumThreads, SmallVectorImpl< Value * > &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, SmallVector< DependData > Dependencies={})
Generator for '#omp target'.
SmallVector< llvm::Function *, 16 > ConstantAllocaRaiseCandidates
A collection of candidate target functions that's constant allocas will attempt to be raised on a cal...
OffloadEntriesInfoManager OffloadInfoManager
Info manager to keep track of target regions.
static std::pair< int32_t, int32_t > readTeamBoundsForKernel(const Triple &T, Function &Kernel)
Read/write a bounds on teams for Kernel.
std::function< std::tuple< std::string, uint64_t >()> FileIdentifierInfoCallbackTy
const std::string ompOffloadInfoName
OMP Offload Info Metadata name string.
InsertPointTy createCopyPrivate(const LocationDescription &Loc, llvm::Value *BufSize, llvm::Value *CpyBuf, llvm::Value *CpyFn, llvm::Value *DidIt)
Generator for __kmpc_copyprivate.
void popFinalizationCB()
Pop the last finalization callback from the finalization stack.
Definition: OMPIRBuilder.h:544
InsertPointTy createReductions(const LocationDescription &Loc, InsertPointTy AllocaIP, ArrayRef< ReductionInfo > ReductionInfos, ArrayRef< bool > IsByRef, bool IsNoWait=false)
Generator for '#omp reduction'.
bool updateToLocation(const LocationDescription &Loc)
Update the internal location to Loc.
void createFlush(const LocationDescription &Loc)
Generator for '#omp flush'.
Constant * getAddrOfDeclareTargetVar(OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause, OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause, bool IsDeclaration, bool IsExternallyVisible, TargetRegionEntryInfo EntryInfo, StringRef MangledName, std::vector< GlobalVariable * > &GeneratedRefs, bool OpenMPSIMD, std::vector< Triple > TargetTriple, Type *LlvmPtrTy, std::function< Constant *()> GlobalInitializer, std::function< GlobalValue::LinkageTypes()> VariableLinkage)
Retrieve (or create if non-existent) the address of a declare target variable, used in conjunction wi...
InsertPointTy emitTargetTask(Function *OutlinedFn, Value *OutlinedFnID, EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args, Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP, SmallVector< OpenMPIRBuilder::DependData > &Dependencies, bool HasNoWait)
Generate a target-task for the target construct.
EmitMetadataErrorKind
The kind of errors that can occur when emitting the offload entries and metadata.
A templated base class for SmallPtrSet which provides the typesafe interface that is common across al...
Definition: SmallPtrSet.h:347
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:587
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
Definition: SmallVector.h:697
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1210
StringMap - This is an unconventional map that is specialized for handling keys that are "strings",...
Definition: StringMap.h:128
size_type count(StringRef Key) const
count - Return 1 if the element is in the map, 0 otherwise.
Definition: StringMap.h:276
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition: Twine.h:81
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Value handle that is nullable, but tries to track the Value.
Definition: ValueHandle.h:204
bool pointsToAliveValue() const
Definition: ValueHandle.h:224
An efficient, type-erasing, non-owning reference to a callable.
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition: ISDOpcodes.h:71
OpenMPOffloadMappingFlags
Values for bit flags used to specify the mapping type for offloading.
Definition: OMPConstants.h:195
IdentFlag
IDs for all omp runtime library ident_t flag encodings (see their defintion in openmp/runtime/src/kmp...
Definition: OMPConstants.h:65
RTLDependenceKindTy
Dependence kind for RTL.
Definition: OMPConstants.h:273
RuntimeFunction
IDs for all omp runtime library (RTL) functions.
Definition: OMPConstants.h:45
WorksharingLoopType
A type of worksharing loop construct.
Definition: OMPConstants.h:283
OMPAtomicCompareOp
Atomic compare operations. Currently OpenMP only supports ==, >, and <.
Definition: OMPConstants.h:267
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
BasicBlock * splitBBWithSuffix(IRBuilderBase &Builder, bool CreateBranch, llvm::Twine Suffix=".split")
Like splitBB, but reuses the current block's name for the new name.
@ Offset
Definition: DWP.cpp:480
AddressSpace
Definition: NVPTXBaseInfo.h:21
void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New, bool CreateBranch)
Move the instruction after an InsertPoint to the beginning of another BasicBlock.
BasicBlock * splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch, llvm::Twine Name={})
Split a BasicBlock at an InsertPoint, even if the block is degenerate (missing the terminator).
AtomicOrdering
Atomic ordering for LLVM's memory model.
a struct to pack relevant information while generating atomic Ops
A struct to pack the relevant information for an OpenMP depend clause.
DependData(omp::RTLDependenceKindTy DepKind, Type *DepValueType, Value *DepVal)
omp::RTLDependenceKindTy DepKind
bool IsCancellable
Flag to indicate if the directive is cancellable.
Definition: OMPIRBuilder.h:531
FinalizeCallbackTy FiniCB
The finalization callback provided by the last in-flight invocation of createXXXX for the directive o...
Definition: OMPIRBuilder.h:524
omp::Directive DK
The directive kind of the innermost directive that has an associated region which might require final...
Definition: OMPIRBuilder.h:528
Description of a LLVM-IR insertion point (IP) and a debug/source location (filename,...
Definition: OMPIRBuilder.h:615
LocationDescription(const InsertPointTy &IP)
Definition: OMPIRBuilder.h:618
LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
Definition: OMPIRBuilder.h:619
LocationDescription(const IRBuilderBase &IRB)
Definition: OMPIRBuilder.h:616
This structure contains combined information generated for mappable clauses, including base pointers,...
void append(MapInfosTy &CurInfo)
Append arrays in CurInfo.
MapDeviceInfoArrayTy DevicePointers
StructNonContiguousInfo NonContigInfo
Helper that contains information about regions we need to outline during finalization.
void collectBlocks(SmallPtrSetImpl< BasicBlock * > &BlockSet, SmallVectorImpl< BasicBlock * > &BlockVector)
Collect all blocks in between EntryBB and ExitBB in both the given vector and set.
Function * getFunction() const
Return the function that contains the region to be outlined.
SmallVector< Value *, 2 > ExcludeArgsFromAggregate
std::function< void(Function &)> PostOutlineCBTy
Information about an OpenMP reduction.
EvalKind EvaluationKind
Reduction evaluation kind - scalar, complex or aggregate.
ReductionGenAtomicCBTy AtomicReductionGen
Callback for generating the atomic reduction body, may be null.
ReductionGenCBTy ReductionGen
Callback for generating the reduction body.
ReductionInfo(Value *PrivateVariable)
Value * Variable
Reduction variable of pointer type.
Value * PrivateVariable
Thread-private partial reduction variable.
ReductionInfo(Type *ElementType, Value *Variable, Value *PrivateVariable, EvalKind EvaluationKind, ReductionGenCBTy ReductionGen, ReductionGenClangCBTy ReductionGenClang, ReductionGenAtomicCBTy AtomicReductionGen)
ReductionGenClangCBTy ReductionGenClang
Clang callback for generating the reduction body.
Type * ElementType
Reduction element type, must match pointee type of variable.
Container for the arguments used to pass data to the runtime library.
Value * SizesArray
The array of sizes passed to the runtime library.
TargetDataRTArgs(Value *BasePointersArray, Value *PointersArray, Value *SizesArray, Value *MapTypesArray, Value *MapTypesArrayEnd, Value *MappersArray, Value *MapNamesArray)
Value * PointersArray
The array of section pointers passed to the runtime library.
Value * MappersArray
The array of user-defined mappers passed to the runtime library.
Value * MapTypesArrayEnd
The array of map types passed to the runtime library for the end of the region, or nullptr if there a...
Value * BasePointersArray
The array of base pointer passed to the runtime library.
Value * MapTypesArray
The array of map types passed to the runtime library for the beginning of the region or for the entir...
Value * MapNamesArray
The array of original declaration names of mapped pointers sent to the runtime library for debugging.
Data structure that contains the needed information to construct the kernel args vector.
Value * DynCGGroupMem
The size of the dynamic shared memory.
TargetDataRTArgs RTArgs
Arguments passed to the runtime library.
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs, Value *NumIterations, ArrayRef< Value * > NumTeams, Value *NumThreads, Value *DynCGGroupMem, bool HasNoWait)
Value * NumIterations
The number of iterations.
unsigned NumTargetItems
Number of arguments passed to the runtime library.
bool HasNoWait
True if the kernel has 'no wait' clause.
ArrayRef< Value * > NumTeams
The number of teams.
Value * NumThreads
The number of threads.
A MapVector that performs no allocations if smaller than a certain size.
Definition: MapVector.h:254
Data structure to contain the information needed to uniquely identify a target entry.
Definition: OMPIRBuilder.h:202
static void getTargetRegionEntryFnName(SmallVectorImpl< char > &Name, StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count)
static constexpr const char * KernelNamePrefix
The prefix used for kernel names.
Definition: OMPIRBuilder.h:204
bool operator<(const TargetRegionEntryInfo &RHS) const
Definition: OMPIRBuilder.h:223
TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID, unsigned FileID, unsigned Line, unsigned Count=0)
Definition: OMPIRBuilder.h:213
Defines various target-specific GPU grid values that must be consistent between host RTL (plugin),...
Definition: OMPGridValues.h:57