OMPIRBuilder.h
1//===- IR/OpenMPIRBuilder.h - OpenMP encoding builder for LLVM IR - C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the OpenMPIRBuilder class and helpers used as a convenient
10// way to create LLVM instructions for OpenMP directives.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
15#define LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H
16
17#include "llvm/ADT/SetVector.h"
21#include "llvm/IR/CallingConv.h"
22#include "llvm/IR/DebugLoc.h"
23#include "llvm/IR/IRBuilder.h"
24#include "llvm/IR/Module.h"
25#include "llvm/IR/ValueMap.h"
28#include "llvm/Support/Error.h"
30#include <forward_list>
31#include <map>
32#include <optional>
33
34namespace llvm {
36class ScanInfo;
39class OpenMPIRBuilder;
40class Loop;
41class LoopAnalysis;
42class LoopInfo;
43
44namespace vfs {
45class FileSystem;
46} // namespace vfs
47
48/// Move the instruction after an InsertPoint to the beginning of another
49/// BasicBlock.
50///
51/// The instructions after \p IP are moved to the beginning of \p New which must
52/// not have any PHINodes. If \p CreateBranch is true, a branch instruction to
53/// \p New will be added such that there is no semantic change. Otherwise, the
54/// \p IP insert block remains degenerate and it is up to the caller to insert a
55/// terminator. \p DL is used as the debug location for the branch instruction
56/// if one is created.
57LLVM_ABI void spliceBB(IRBuilderBase::InsertPoint IP, BasicBlock *New,
58 bool CreateBranch, DebugLoc DL);
59
60/// Splice a BasicBlock at an IRBuilder's current insertion point. Its new
61/// insert location will stick to after the instruction before the insertion
62/// point (instead of moving with the instruction the InsertPoint stores
63/// internally).
64LLVM_ABI void spliceBB(IRBuilder<> &Builder, BasicBlock *New,
65 bool CreateBranch);
66
67/// Split a BasicBlock at an InsertPoint, even if the block is degenerate
68/// (missing the terminator).
69///
70/// llvm::SplitBasicBlock and BasicBlock::splitBasicBlock require a well-formed
71/// BasicBlock. \p Name is used for the new successor block. If \p CreateBranch
72/// is true, a branch to the new successor will be created such that
73/// semantically there is no change; otherwise the block of the insertion point
74/// remains degenerate and it is the caller's responsibility to insert a
75/// terminator. \p DL is used as the debug location for the branch instruction
76/// if one is created. Returns the new successor block.
77LLVM_ABI BasicBlock *splitBB(IRBuilderBase::InsertPoint IP, bool CreateBranch,
78 DebugLoc DL, llvm::Twine Name = {});
79
80/// Split a BasicBlock at \p Builder's insertion point, even if the block is
81/// degenerate (missing the terminator). Its new insert location will stick to
82/// after the instruction before the insertion point (instead of moving with the
83/// instruction the InsertPoint stores internally).
84LLVM_ABI BasicBlock *splitBB(IRBuilderBase &Builder, bool CreateBranch,
85 llvm::Twine Name = {});
86
87/// Split a BasicBlock at \p Builder's insertion point, even if the block is
88/// degenerate (missing the terminator). Its new insert location will stick to
89/// after the instruction before the insertion point (instead of moving with the
90/// instruction the InsertPoint stores internally).
91LLVM_ABI BasicBlock *splitBB(IRBuilder<> &Builder, bool CreateBranch,
92 llvm::Twine Name);
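/// Illustrative, hedged usage sketch (assumes an existing IRBuilder<>
/// `Builder` positioned inside a block under construction; the block name is
/// hypothetical):
/// \code
///   // Split at the current insertion point and branch to the continuation;
///   // the builder keeps inserting in the original block, right before the
///   // new branch.
///   BasicBlock *ContBB = splitBB(Builder, /*CreateBranch=*/true, "omp.cont");
/// \endcode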
93
94/// Like splitBB, but reuses the current block's name for the new name.
95LLVM_ABI BasicBlock *splitBBWithSuffix(IRBuilderBase &Builder,
96 bool CreateBranch,
97 llvm::Twine Suffix = ".split");
98
99/// Captures attributes that affect generating LLVM-IR using the
100/// OpenMPIRBuilder and related classes. Note that not all attributes are
101/// required for all classes or functions. In some use cases the configuration
102/// is not necessary at all, because the only functions that are called
103/// are ones that are not dependent on the configuration.
104class OpenMPIRBuilderConfig {
105public:
106 /// Flag to define whether to generate code for the role of the OpenMP host
107 /// (if set to false) or device (if set to true) in an offloading context. It
108 /// is set when the -fopenmp-is-target-device compiler frontend option is
109 /// specified.
110 std::optional<bool> IsTargetDevice;
111
112 /// Flag for specifying if the compilation is done for an accelerator. It is
113 /// set according to the architecture of the target triple and currently only
114 /// true when targeting AMDGPU or NVPTX. Today, these targets can only perform
115 /// the role of an OpenMP target device, so `IsTargetDevice` must also be true
116 /// if `IsGPU` is true. This restriction might be lifted if an accelerator-
117 /// like target with the ability to work as the OpenMP host is added, or if
118 /// the capabilities of the currently supported GPU architectures are
119 /// expanded.
120 std::optional<bool> IsGPU;
121
122 /// Flag for specifying if LLVMUsed information should be emitted.
123 std::optional<bool> EmitLLVMUsedMetaInfo;
124
125 /// Flag for specifying if offloading is mandatory.
126 std::optional<bool> OpenMPOffloadMandatory;
127
128 /// First separator used between the initial two parts of a name.
129 std::optional<StringRef> FirstSeparator;
130 /// Separator used between all the remaining parts of a name.
131 std::optional<StringRef> Separator;
132
133 // Grid Value for the GPU target.
134 std::optional<omp::GV> GridValue;
135
136 /// When compilation is being done for the OpenMP host (i.e. `IsTargetDevice =
137 /// false`), this contains the list of offloading triples associated, if any.
139
140 // Default address space for the target.
141 unsigned DefaultTargetAS = 0;
142
144 LLVM_ABI OpenMPIRBuilderConfig();
145 LLVM_ABI OpenMPIRBuilderConfig(bool IsTargetDevice, bool IsGPU,
146 bool OpenMPOffloadMandatory,
148 bool HasRequiresReverseOffload,
149 bool HasRequiresUnifiedAddress,
150 bool HasRequiresUnifiedSharedMemory,
151 bool HasRequiresDynamicAllocators);
152
153 // Getter functions that assert if the required values are not present.
154 bool isTargetDevice() const {
155 assert(IsTargetDevice.has_value() && "IsTargetDevice is not set");
156 return *IsTargetDevice;
157 }
158
159 bool isGPU() const {
160 assert(IsGPU.has_value() && "IsGPU is not set");
161 return *IsGPU;
162 }
163
164 bool openMPOffloadMandatory() const {
165 assert(OpenMPOffloadMandatory.has_value() &&
166 "OpenMPOffloadMandatory is not set");
167 return *OpenMPOffloadMandatory;
168 }
169
170 omp::GV getGridValue() const {
171 assert(GridValue.has_value() && "GridValue is not set");
172 return *GridValue;
173 }
174
175 unsigned getDefaultTargetAS() const { return DefaultTargetAS; }
176
178
179 bool hasRequiresFlags() const { return RequiresFlags; }
184
185 /// Returns requires directive clauses as flags compatible with those expected
186 /// by libomptarget.
187 LLVM_ABI int64_t getRequiresFlags() const;
188
189 // Returns the FirstSeparator if set, otherwise the default separator
190 // depending on isGPU.
191 StringRef firstSeparator() const {
192 if (FirstSeparator.has_value())
193 return *FirstSeparator;
194 if (isGPU())
195 return "_";
196 return ".";
197 }
198
199 // Returns the Separator if set, otherwise the default separator depending
200 // on isGPU.
201 StringRef separator() const {
202 if (Separator.has_value())
203 return *Separator;
204 if (isGPU())
205 return "$";
206 return ".";
207 }
208
209 void setIsTargetDevice(bool Value) { IsTargetDevice = Value; }
210 void setIsGPU(bool Value) { IsGPU = Value; }
216 void setDefaultTargetAS(unsigned AS) { DefaultTargetAS = AS; }
218
223
224private:
225 /// Flags for specifying which requires directive clauses are present.
226 int64_t RequiresFlags;
227};
228
229/// Data structure to contain the information needed to uniquely identify
230/// a target entry.
232 /// The prefix used for kernel names.
233 static constexpr const char *KernelNamePrefix = "__omp_offloading_";
234
235 std::string ParentName;
236 unsigned DeviceID;
237 unsigned FileID;
238 unsigned Line;
239 unsigned Count;
240
242 TargetRegionEntryInfo(StringRef ParentName, unsigned DeviceID,
243 unsigned FileID, unsigned Line, unsigned Count = 0)
244 : ParentName(ParentName), DeviceID(DeviceID), FileID(FileID), Line(Line),
245 Count(Count) {}
246
247 LLVM_ABI static void
248 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name, StringRef ParentName,
249 unsigned DeviceID, unsigned FileID, unsigned Line,
250 unsigned Count);
251
252 bool operator<(const TargetRegionEntryInfo &RHS) const {
253 return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
254 std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
255 RHS.Count);
256 }
257};
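/// Illustrative sketch of how a frontend might build such a key (the values
/// shown are hypothetical, not taken from this header):
/// \code
///   TargetRegionEntryInfo Info(/*ParentName=*/"foo", /*DeviceID=*/0,
///                              /*FileID=*/1, /*Line=*/42);
///   // Entries compare lexicographically on (ParentName, DeviceID, FileID,
///   // Line, Count), so they can be used as std::map keys.
/// \endcode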
258
259/// Class that manages information about offload code regions and data.
260class OffloadEntriesInfoManager {
261 /// Number of entries registered so far.
262 OpenMPIRBuilder *OMPBuilder;
263 unsigned OffloadingEntriesNum = 0;
264
265public:
266 /// Base class of the entries info.
267 class OffloadEntryInfo {
268 public:
269 /// Kind of a given entry.
270 enum OffloadingEntryInfoKinds : unsigned {
271 /// Entry is a target region.
272 OffloadingEntryInfoTargetRegion = 0,
273 /// Entry is a declare target variable.
274 OffloadingEntryInfoDeviceGlobalVar = 1,
275 /// Invalid entry info.
276 OffloadingEntryInfoInvalid = ~0u
277 };
278
279 protected:
281 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind) : Kind(Kind) {}
282 explicit OffloadEntryInfo(OffloadingEntryInfoKinds Kind, unsigned Order,
283 uint32_t Flags)
284 : Flags(Flags), Order(Order), Kind(Kind) {}
285 ~OffloadEntryInfo() = default;
286
287 public:
288 bool isValid() const { return Order != ~0u; }
289 unsigned getOrder() const { return Order; }
290 OffloadingEntryInfoKinds getKind() const { return Kind; }
291 uint32_t getFlags() const { return Flags; }
292 void setFlags(uint32_t NewFlags) { Flags = NewFlags; }
293 Constant *getAddress() const { return cast_or_null<Constant>(Addr); }
294 void setAddress(Constant *V) {
295 assert(!Addr.pointsToAliveValue() && "Address has been set before!");
296 Addr = V;
297 }
298 static bool classof(const OffloadEntryInfo *Info) { return true; }
299
300 private:
301 /// Address of the entity that has to be mapped for offloading.
302 WeakTrackingVH Addr;
303
304 /// Flags associated with the device global.
305 uint32_t Flags = 0u;
306
307 /// Order this entry was emitted.
308 unsigned Order = ~0u;
309
310 OffloadingEntryInfoKinds Kind = OffloadingEntryInfoInvalid;
311 };
312
313 /// Return true if there are no entries defined.
314 LLVM_ABI bool empty() const;
315 /// Return number of entries defined so far.
316 unsigned size() const { return OffloadingEntriesNum; }
317
318 OffloadEntriesInfoManager(OpenMPIRBuilder *builder) : OMPBuilder(builder) {}
319
320 //
321 // Target region entries related.
322 //
323
324 /// Kind of the target registry entry.
325 enum OMPTargetRegionEntryKind : uint32_t {
326 /// Mark the entry as target region.
327 OMPTargetRegionEntryTargetRegion = 0x0,
328 };
329
330 /// Target region entries info.
331 class OffloadEntryInfoTargetRegion final : public OffloadEntryInfo {
332 /// Address that can be used as the ID of the entry.
333 Constant *ID = nullptr;
334
335 public:
338 explicit OffloadEntryInfoTargetRegion(unsigned Order, Constant *Addr,
339 Constant *ID,
340 OMPTargetRegionEntryKind Flags)
341 : OffloadEntryInfo(OffloadingEntryInfoTargetRegion, Order, Flags),
342 ID(ID) {
343 setAddress(Addr);
344 }
345
346 Constant *getID() const { return ID; }
347 void setID(Constant *V) {
348 assert(!ID && "ID has been set before!");
349 ID = V;
350 }
351 static bool classof(const OffloadEntryInfo *Info) {
352 return Info->getKind() == OffloadingEntryInfoTargetRegion;
353 }
354 };
355
356 /// Initialize target region entry.
357 /// This is ONLY needed for DEVICE compilation.
358 LLVM_ABI void
359 initializeTargetRegionEntryInfo(const TargetRegionEntryInfo &EntryInfo,
360 unsigned Order);
361 /// Register target region entry.
362 LLVM_ABI void registerTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
363 Constant *Addr, Constant *ID,
364 OMPTargetRegionEntryKind Flags);
365 /// Return true if a target region entry with the provided information
366 /// exists.
367 LLVM_ABI bool hasTargetRegionEntryInfo(TargetRegionEntryInfo EntryInfo,
368 bool IgnoreAddressId = false) const;
369
370 // Return the Name based on \a EntryInfo using the next available Count.
371 LLVM_ABI void
372 getTargetRegionEntryFnName(SmallVectorImpl<char> &Name,
373 const TargetRegionEntryInfo &EntryInfo);
374
375 /// Applies action \a Action on all registered entries.
376 typedef function_ref<void(const TargetRegionEntryInfo &EntryInfo,
377 const OffloadEntryInfoTargetRegion &)>
378 OffloadTargetRegionEntryInfoActTy;
379 LLVM_ABI void
380 actOnTargetRegionEntriesInfo(const OffloadTargetRegionEntryInfoActTy &Action);
381
382 //
383 // Device global variable entries related.
384 //
385
386 /// Kind of the global variable entry.
387 enum OMPTargetGlobalVarEntryKind : uint32_t {
388 /// Mark the entry as a 'declare target to' variable.
390 /// Mark the entry as a 'declare target link' variable.
392 /// Mark the entry as a declare target enter.
394 /// Mark the entry as having no declare target entry kind.
396 /// Mark the entry as a declare target indirect global.
398 /// Mark the entry as a register requires global.
400 /// Mark the entry as a declare target indirect vtable.
402 };
403
404 /// Kind of device clause for declare target variables and functions.
405 /// NOTE: Currently not used as part of a variable entry; used by Flang and
406 /// Clang to interface with the variable-related registration functions.
409 enum OMPTargetDeviceClauseKind : uint32_t {
410 /// The target is marked for all devices
412 /// The target is marked for non-host devices
414 /// The target is marked for host devices
416 /// The target is marked as having no clause
418 };
419
420 /// Device global variable entries info.
421 class OffloadEntryInfoDeviceGlobalVar final : public OffloadEntryInfo {
422 /// Size of the global variable.
423 int64_t VarSize;
424 GlobalValue::LinkageTypes Linkage;
425 const std::string VarName;
426
427 public:
433 explicit OffloadEntryInfoDeviceGlobalVar(unsigned Order, Constant *Addr,
434 int64_t VarSize,
435 OMPTargetGlobalVarEntryKind Flags,
436 GlobalValue::LinkageTypes Linkage,
437 const std::string &VarName)
438 : OffloadEntryInfo(OffloadingEntryInfoDeviceGlobalVar, Order, Flags),
439 VarSize(VarSize), Linkage(Linkage), VarName(VarName) {
440 setAddress(Addr);
441 }
442
443 int64_t getVarSize() const { return VarSize; }
444 StringRef getVarName() const { return VarName; }
445 void setVarSize(int64_t Size) { VarSize = Size; }
446 GlobalValue::LinkageTypes getLinkage() const { return Linkage; }
447 void setLinkage(GlobalValue::LinkageTypes LT) { Linkage = LT; }
448 static bool classof(const OffloadEntryInfo *Info) {
449 return Info->getKind() == OffloadingEntryInfoDeviceGlobalVar;
450 }
451 };
452
453 /// Initialize device global variable entry.
454 /// This is ONLY used for DEVICE compilation.
455 LLVM_ABI void initializeDeviceGlobalVarEntryInfo(
456 StringRef Name, OMPTargetGlobalVarEntryKind Flags, unsigned Order);
457
458 /// Register device global variable entry.
459 LLVM_ABI void registerDeviceGlobalVarEntryInfo(
460 StringRef VarName, Constant *Addr, int64_t VarSize,
461 OMPTargetGlobalVarEntryKind Flags, GlobalValue::LinkageTypes Linkage);
462 /// Checks if the variable with the given name has been registered already.
463 bool hasDeviceGlobalVarEntryInfo(StringRef VarName) const {
464 return OffloadEntriesDeviceGlobalVar.count(VarName) > 0;
465 }
466 /// Applies action \a Action on all registered entries.
467 typedef function_ref<void(StringRef, const OffloadEntryInfoDeviceGlobalVar &)>
468 OffloadDeviceGlobalVarEntryInfoActTy;
469 LLVM_ABI void actOnDeviceGlobalVarEntriesInfo(
470 const OffloadDeviceGlobalVarEntryInfoActTy &Action);
471
472private:
473 /// Return the count of entries at a particular source location.
474 unsigned
475 getTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo) const;
476
477 /// Update the count of entries at a particular source location.
478 void
479 incrementTargetRegionEntryInfoCount(const TargetRegionEntryInfo &EntryInfo);
480
481 static TargetRegionEntryInfo
482 getTargetRegionEntryCountKey(const TargetRegionEntryInfo &EntryInfo) {
483 return TargetRegionEntryInfo(EntryInfo.ParentName, EntryInfo.DeviceID,
484 EntryInfo.FileID, EntryInfo.Line, 0);
485 }
486
487 // Count of entries at a location.
488 std::map<TargetRegionEntryInfo, unsigned> OffloadEntriesTargetRegionCount;
489
490 // Storage for target region entries kind.
491 typedef std::map<TargetRegionEntryInfo, OffloadEntryInfoTargetRegion>
492 OffloadEntriesTargetRegionTy;
493 OffloadEntriesTargetRegionTy OffloadEntriesTargetRegion;
494 /// Storage for device global variable entries kind. The storage is to be
495 /// indexed by mangled name.
496 typedef StringMap<OffloadEntryInfoDeviceGlobalVar>
497 OffloadEntriesDeviceGlobalVarTy;
498 OffloadEntriesDeviceGlobalVarTy OffloadEntriesDeviceGlobalVar;
499};
500
501/// An interface to create LLVM-IR for OpenMP directives.
502///
503/// Each OpenMP directive has a corresponding public generator method.
504class OpenMPIRBuilder {
505public:
506 /// Create a new OpenMPIRBuilder operating on the given module \p M. This will
507 /// not have an effect on \p M (see initialize)
508 OpenMPIRBuilder(Module &M)
509 : M(M), Builder(M.getContext()), OffloadInfoManager(this),
510 T(M.getTargetTriple()), IsFinalized(false) {}
512
514 llvm::Value *AtomicVar;
515
516 public:
524
525 llvm::Value *getAtomicPointer() const override { return AtomicVar; }
528 const llvm::Twine &Name) const override {
529 llvm::AllocaInst *allocaInst = Builder->CreateAlloca(Ty);
530 allocaInst->setName(Name);
531 return allocaInst;
532 }
533 };
534 /// Initialize the internal state, this will put structures types and
535 /// potentially other helpers into the underlying module. Must be called
536 /// before any other method and only once! This internal state includes types
537 /// used in the OpenMPIRBuilder generated from OMPKinds.def.
538 LLVM_ABI void initialize();
539
541
542 /// Finalize the underlying module, e.g., by outlining regions.
543 /// \param Fn The function to be finalized. If not used,
544 /// all functions are finalized.
545 LLVM_ABI void finalize(Function *Fn = nullptr);
546
547 /// Check whether the finalize function has already run
548 /// \return true if the finalize function has already run
549 LLVM_ABI bool isFinalized();
550
551 /// Add attributes known for \p FnID to \p Fn.
552 LLVM_ABI void addAttributes(omp::RuntimeFunction FnID, Function &Fn);
553
554 /// Type used throughout for insertion points.
555 using InsertPointTy = IRBuilder<>::InsertPoint;
556
557 /// Type used to represent an insertion point or an error value.
558 using InsertPointOrErrorTy = Expected<InsertPointTy>;
559
560 /// Create a name using the platform-specific separators.
561 /// \param Parts parts of the final name that needs separation
562 /// The created name has a first separator between the first and second part
563 /// and a second separator between all other parts.
564 /// E.g. with FirstSeparator "$" and Separator "." and
565 /// parts: "p1", "p2", "p3", "p4"
566 /// The resulting name is "p1$p2.p3.p4"
567 /// The separators are retrieved from the OpenMPIRBuilderConfig.
568 LLVM_ABI std::string
569 createPlatformSpecificName(ArrayRef<StringRef> Parts) const;
570
571 /// Callback type for variable finalization (think destructors).
572 ///
573 /// \param CodeGenIP is the insertion point at which the finalization code
574 /// should be placed.
575 ///
576 /// A finalize callback knows about all objects that need finalization (e.g.,
577 /// destruction) when the scope of the currently generated construct is left,
578 /// at the time and location the callback is invoked.
579 using FinalizeCallbackTy = std::function<Error(InsertPointTy CodeGenIP)>;
580
581 struct FinalizationInfo {
582 FinalizationInfo(FinalizeCallbackTy FiniCB, omp::Directive DK,
583 bool IsCancellable)
584 : DK(DK), IsCancellable(IsCancellable), FiniCB(std::move(FiniCB)) {}
585 /// The directive kind of the innermost directive that has an associated
586 /// region which might require finalization when it is left.
587 const omp::Directive DK;
588
589 /// Flag to indicate if the directive is cancellable.
590 const bool IsCancellable;
591
592 /// The basic block to which control should be transferred to
593 /// implement the FiniCB. Memoized to avoid generating finalization
594 /// multiple times.
596
597 /// For cases where there is an unavoidable existing finalization block
598 /// (e.g. loop finalization after omp sections). The existing finalization
599 /// block must not contain any non-finalization code.
601
602 private:
603 /// Access via getFiniBB.
604 BasicBlock *FiniBB = nullptr;
605
606 /// The finalization callback provided by the last in-flight invocation of
607 /// createXXXX for the directive of kind DK.
608 FinalizeCallbackTy FiniCB;
609 };
610
611 /// Push a finalization callback on the finalization stack.
612 ///
613 /// NOTE: Temporary solution until Clang CG is gone.
614 void pushFinalizationCB(const FinalizationInfo &FI) {
615 FinalizationStack.push_back(FI);
616 }
617
618 /// Pop the last finalization callback from the finalization stack.
619 ///
620 /// NOTE: Temporary solution until Clang CG is gone.
621 void popFinalizationCB() { FinalizationStack.pop_back(); }
622
623 /// Callback type for body (=inner region) code generation
624 ///
625 /// The callback takes code locations as arguments, each describing a
626 /// location where additional instructions can be inserted.
627 ///
628 /// The CodeGenIP may be in the middle of a basic block or point to the end of
629 /// it. The basic block may have a terminator or be degenerate. The callback
630 /// function may just insert instructions at that position, but also split the
631 /// block (without the Before argument of BasicBlock::splitBasicBlock such
632 /// that the identity of the split predecessor block is preserved) and insert
633 /// additional control flow, including branches that do not lead back to what
634 /// follows the CodeGenIP. Note that since the callback is allowed to split
635 /// the block, callers must assume that InsertPoints to positions in the
636 /// BasicBlock after CodeGenIP including CodeGenIP itself are invalidated. If
637 /// such InsertPoints need to be preserved, the caller can split the block itself
638 /// before calling the callback.
639 ///
640 /// AllocaIP and CodeGenIP must not point to the same position.
641 ///
642 /// \param AllocaIP is the insertion point at which new alloca instructions
643 /// should be placed. The BasicBlock it is pointing to must
644 /// not be split.
645 /// \param CodeGenIP is the insertion point at which the body code should be
646 /// placed.
647 ///
648 /// \return an error, if any were triggered during execution.
649 using BodyGenCallbackTy =
650 function_ref<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
651
652 /// Callback type for task duplication function code generation. This is the
653 /// task duplication function passed to __kmpc_taskloop. It is expected that
654 /// this function will set up (first)private variables in the duplicated task
655 /// which have non-trivial (copy-)constructors. Insertion points are handled
656 /// the same way as for BodyGenCallbackTy.
657 ///
658 /// \ref createTaskloop lays out the task's auxiliary data structure as:
659 /// `{ lower bound, upper bound, step, data... }`. DestPtr and SrcPtr point
660 /// to this data.
661 ///
662 /// It is acceptable for the callback to be set to nullptr. In that case no
663 /// function will be generated and nullptr will be passed as the task
664 /// duplication function to __kmpc_taskloop.
665 ///
666 /// \param AllocaIP is the insertion point at which new alloca instructions
667 /// should be placed. The BasicBlock it is pointing to must
668 /// not be split.
669 /// \param CodeGenIP is the insertion point at which the body code should be
670 /// placed.
671 /// \param DestPtr This is a pointer to data inside the newly duplicated
672 /// task's auxiliary data structure (allocated after the task
673 /// descriptor.)
674 /// \param SrcPtr This is a pointer to data inside the original task's
675 /// auxiliary data structure (allocated after the task
676 /// descriptor.)
677 ///
678 /// \return The insertion point immediately after the generated code, or an
679 /// error if any occurred.
680 using TaskDupCallbackTy = function_ref<InsertPointOrErrorTy(
681 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DestPtr,
682 Value *SrcPtr)>;
683
684 // This is created primarily for the sections construct, as llvm::function_ref
685 // (BodyGenCallbackTy) is not storable (as described in the comments of the
686 // function_ref class, function_ref contains a non-owning reference to the
687 // callable).
688 ///
689 /// \return an error, if any were triggered during execution.
690 using StorableBodyGenCallbackTy =
691 std::function<Error(InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
692
693 /// Callback type for loop body code generation.
694 ///
695 /// \param CodeGenIP is the insertion point where the loop's body code must be
696 /// placed. This will be a dedicated BasicBlock with a
697 /// conditional branch from the loop condition check and
698 /// terminated with an unconditional branch to the loop
699 /// latch.
700 /// \param IndVar is the induction variable usable at the insertion point.
701 ///
702 /// \return an error, if any were triggered during execution.
703 using LoopBodyGenCallbackTy =
704 function_ref<Error(InsertPointTy CodeGenIP, Value *IndVar)>;
705
706 /// Callback type for variable privatization (think copy & default
707 /// constructor).
708 ///
709 /// \param AllocaIP is the insertion point at which new alloca instructions
710 /// should be placed.
711 /// \param CodeGenIP is the insertion point at which the privatization code
712 /// should be placed.
713 /// \param Original The value being copied/created, should not be used in the
714 /// generated IR.
715 /// \param Inner The equivalent of \p Original that should be used in the
716 /// generated IR; this is equal to \p Original if the value is
717 /// a pointer and can thus be passed directly, otherwise it is
718 /// an equivalent but different value.
719 /// \param ReplVal The replacement value, thus a copy or new created version
720 /// of \p Inner.
721 ///
722 /// \returns The new insertion point where code generation continues and
723 /// \p ReplVal the replacement value.
724 using PrivatizeCallbackTy = function_ref<InsertPointOrErrorTy(
725 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value &Original,
726 Value &Inner, Value *&ReplVal)>;
727
728 /// Description of a LLVM-IR insertion point (IP) and a debug/source location
729 /// (filename, line, column, ...).
730 struct LocationDescription {
731 LocationDescription(const IRBuilderBase &IRB)
732 : IP(IRB.saveIP()), DL(IRB.getCurrentDebugLocation()) {}
733 LocationDescription(const InsertPointTy &IP) : IP(IP) {}
734 LocationDescription(const InsertPointTy &IP, const DebugLoc &DL)
735 : IP(IP), DL(DL) {}
736 InsertPointTy IP;
737 DebugLoc DL;
738 };
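/// A minimal usage sketch (assuming `Builder` is an IRBuilder already
/// positioned where the directive occurs):
/// \code
///   // Capture the current insert point and debug location in one object.
///   OpenMPIRBuilder::LocationDescription Loc(Builder);
/// \endcode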
739
740 /// Emitter methods for OpenMP directives.
741 ///
742 ///{
743
744 /// Generator for '#omp barrier'
745 ///
746 /// \param Loc The location where the barrier directive was encountered.
747 /// \param Kind The kind of directive that caused the barrier.
748 /// \param ForceSimpleCall Flag to force a simple (=non-cancellation) barrier.
749 /// \param CheckCancelFlag Flag to indicate a cancel barrier return value
750 /// should be checked and acted upon.
751 /// \param ThreadID Optional parameter to pass in any existing ThreadID value.
752 ///
753 /// \returns The insertion point after the barrier.
754 LLVM_ABI InsertPointOrErrorTy createBarrier(const LocationDescription &Loc,
755 omp::Directive Kind,
756 bool ForceSimpleCall = false,
757 bool CheckCancelFlag = true);
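/// Hedged usage sketch (assumes an OpenMPIRBuilder `OMPBuilder`, a
/// LocationDescription `Loc`, and a caller that can propagate an Error):
/// \code
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
///       OMPBuilder.createBarrier(Loc, omp::Directive::OMPD_barrier);
///   if (!AfterIP)
///     return AfterIP.takeError(); // propagate the error
///   Builder.restoreIP(*AfterIP);
/// \endcode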
758
759 /// Generator for '#omp cancel'
760 ///
761 /// \param Loc The location where the directive was encountered.
762 /// \param IfCondition The evaluated 'if' clause expression, if any.
763 /// \param CanceledDirective The kind of directive that is cancelled.
764 ///
765 /// \returns The insertion point after the cancellation code.
766 LLVM_ABI InsertPointOrErrorTy createCancel(const LocationDescription &Loc,
767 Value *IfCondition,
768 omp::Directive CanceledDirective);
769
770 /// Generator for '#omp cancellation point'
771 ///
772 /// \param Loc The location where the directive was encountered.
773 /// \param CanceledDirective The kind of directive that is cancelled.
774 ///
775 /// \returns The insertion point after the cancellation point.
776 LLVM_ABI InsertPointOrErrorTy createCancellationPoint(
777 const LocationDescription &Loc, omp::Directive CanceledDirective);
778
779 /// Creates a ScanInfo object, allocates and returns the pointer.
781
782 /// Generator for '#omp parallel'
783 ///
784 /// \param Loc The insert and source location description.
785 /// \param AllocaIP The insertion points to be used for alloca instructions.
786 /// \param BodyGenCB Callback that will generate the region code.
787 /// \param PrivCB Callback to copy a given variable (think copy constructor).
788 /// \param FiniCB Callback to finalize variable copies.
789 /// \param IfCondition The evaluated 'if' clause expression, if any.
790 /// \param NumThreads The evaluated 'num_threads' clause expression, if any.
791 /// \param ProcBind The value of the 'proc_bind' clause (see ProcBindKind).
792 /// \param IsCancellable Flag to indicate a cancellable parallel region.
793 ///
794 /// \returns The insertion position *after* the parallel.
795 LLVM_ABI InsertPointOrErrorTy createParallel(
796 const LocationDescription &Loc, InsertPointTy AllocaIP,
797 BodyGenCallbackTy BodyGenCB, PrivatizeCallbackTy PrivCB,
798 FinalizeCallbackTy FiniCB, Value *IfCondition, Value *NumThreads,
799 omp::ProcBindKind ProcBind, bool IsCancellable);
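/// Hedged usage sketch of the callback plumbing (the lambdas are purely
/// illustrative; a real frontend emits its own region code):
/// \code
///   auto BodyGenCB = [&](InsertPointTy AllocaIP,
///                        InsertPointTy CodeGenIP) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the parallel region body here ...
///     return Error::success();
///   };
///   auto PrivCB = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP,
///                     Value &Orig, Value &Inner,
///                     Value *&ReplVal) -> InsertPointOrErrorTy {
///     ReplVal = &Inner; // no privatization in this sketch
///     return CodeGenIP;
///   };
///   auto FiniCB = [&](InsertPointTy CodeGenIP) { return Error::success(); };
///   InsertPointOrErrorTy AfterIP = OMPBuilder.createParallel(
///       Loc, AllocaIP, BodyGenCB, PrivCB, FiniCB, /*IfCondition=*/nullptr,
///       /*NumThreads=*/nullptr, omp::ProcBindKind::OMP_PROC_BIND_default,
///       /*IsCancellable=*/false);
/// \endcode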
800
801 /// Generator for the control flow structure of an OpenMP canonical loop.
802 ///
803 /// This generator operates on the logical iteration space of the loop, i.e.
804 /// the caller only has to provide a loop trip count of the loop as defined by
805 /// base language semantics. The trip count is interpreted as an unsigned
806 /// integer. The induction variable passed to \p BodyGenCB will be of the same
807 /// type and run from 0 to \p TripCount - 1. It is up to the callback to
808 /// convert the logical iteration variable to the loop counter variable in the
809 /// loop body.
810 ///
811 /// \param Loc The insert and source location description. The insert
812 /// location can be between two instructions or the end of a
813 /// degenerate block (e.g. a BB under construction).
814 /// \param BodyGenCB Callback that will generate the loop body code.
815 /// \param TripCount Number of iterations the loop body is executed.
816 /// \param Name Base name used to derive BB and instruction names.
817 ///
818 /// \returns An object representing the created control flow structure which
819 /// can be used for loop-associated directives.
820 LLVM_ABI Expected<CanonicalLoopInfo *>
821 createCanonicalLoop(const LocationDescription &Loc,
822 LoopBodyGenCallbackTy BodyGenCB, Value *TripCount,
823 const Twine &Name = "loop");
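/// Hedged usage sketch (TripCount is any integer Value computed by the
/// caller; the lambda is illustrative):
/// \code
///   auto LoopBodyGenCB = [&](InsertPointTy CodeGenIP, Value *IV) -> Error {
///     Builder.restoreIP(CodeGenIP);
///     // ... emit the body, using IV as the logical iteration number ...
///     return Error::success();
///   };
///   Expected<CanonicalLoopInfo *> LoopInfo =
///       OMPBuilder.createCanonicalLoop(Loc, LoopBodyGenCB, TripCount);
///   if (!LoopInfo)
///     return LoopInfo.takeError();
/// \endcode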
824
825 /// Generator for the control flow structure of an OpenMP canonical loops if
826 /// the parent directive has an `inscan` modifier specified.
827 /// If the `inscan` modifier is specified, the region of the parent is
828 /// expected to have a `scan` directive. Based on the clauses in
829 /// scan directive, the body of the loop is split into two loops: Input loop
830 /// and Scan Loop. Input loop contains the code generated for input phase of
831 /// scan and Scan loop contains the code generated for scan phase of scan.
832 /// From the bodyGen callback of these loops, `createScan` is called when a
833 /// scan directive is encountered in the loop body. Based on whether an
834 /// inclusive or exclusive scan is specified, and whether the input loop or
835 /// the scan loop is being generated, `createScan` lowers the body of the
836 /// for loop accordingly.
837 ///
838 /// \param Loc The insert and source location description.
839 /// \param BodyGenCB Callback that will generate the loop body code.
840 /// \param Start Value of the loop counter for the first iterations.
841 /// \param Stop Loop counter values past this will stop the loop.
842 /// \param Step Loop counter increment after each iteration; negative
843 /// means counting down.
844 /// \param IsSigned Whether Start, Stop and Step are signed integers.
845 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
846 /// counter.
847 /// \param ComputeIP Insertion point for instructions computing the trip
848 /// count. Can be used to ensure the trip count is available
849 /// at the outermost loop of a loop nest. If not set,
850 /// defaults to the preheader of the generated loop.
851 /// \param Name Base name used to derive BB and instruction names.
852 /// \param ScanRedInfo Pointer to the ScanInfo object created using
853 /// `ScanInfoInitialize`.
854 ///
855 /// \returns A vector containing Loop Info of Input Loop and Scan Loop.
858 LoopBodyGenCallbackTy BodyGenCB, Value *Start,
859 Value *Stop, Value *Step, bool IsSigned,
860 bool InclusiveStop, InsertPointTy ComputeIP,
861 const Twine &Name, ScanInfo *ScanRedInfo);
862
863 /// Calculate the trip count of a canonical loop.
864 ///
865 /// This allows specifying user-defined loop counter values using increment,
866 /// upper- and lower bounds. To disambiguate the terminology when counting
867 /// downwards, instead of lower bounds we use \p Start for the loop counter
868 /// value in the first body iteration.
869 ///
870 /// Consider the following limitations:
871 ///
872 /// * A loop counter space over all integer values of its bit-width cannot be
873 /// represented. E.g. using uint8_t, a loop trip count of 256 cannot be
874 /// stored into an 8-bit integer:
875 ///
876 /// DO I = 0, 255, 1
877 ///
878 /// * Unsigned wrapping is only supported when wrapping only "once"; E.g.
879 /// effectively counting downwards:
880 ///
881 /// for (uint8_t i = 100u; i > 0; i += 127u)
882 ///
883 ///
884 /// TODO: May need to add additional parameters to represent:
885 ///
886 /// * Allow representing downcounting with unsigned integers.
887 ///
888 /// * Sign of the step and the comparison operator might disagree:
889 ///
890 /// for (int i = 0; i < 42; i -= 1u)
891 ///
892 /// \param Loc The insert and source location description.
893 /// \param Start Value of the loop counter for the first iterations.
894 /// \param Stop Loop counter values past this will stop the loop.
895 /// \param Step Loop counter increment after each iteration; negative
896 /// means counting down.
897 /// \param IsSigned Whether Start, Stop and Step are signed integers.
898 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
899 /// counter.
900 /// \param Name Base name used to derive instruction names.
901 ///
902 /// \returns The value holding the calculated trip count.
903 LLVM_ABI Value *calculateCanonicalLoopTripCount(
904 const LocationDescription &Loc, Value *Start, Value *Stop, Value *Step,
905 bool IsSigned, bool InclusiveStop, const Twine &Name = "loop");
906
907 /// Generator for the control flow structure of an OpenMP canonical loop.
908 ///
909 /// Instead of a logical iteration space, this allows specifying user-defined
910 /// loop counter values using increment, upper- and lower bounds. To
911 /// disambiguate the terminology when counting downwards, instead of lower
912 /// bounds we use \p Start for the loop counter value in the first body iteration.
913 ///
914 /// It calls \see calculateCanonicalLoopTripCount for trip count calculations,
915 /// so limitations of that method apply here as well.
916 ///
917 /// \param Loc The insert and source location description.
918 /// \param BodyGenCB Callback that will generate the loop body code.
919 /// \param Start Value of the loop counter for the first iterations.
920 /// \param Stop Loop counter values past this will stop the loop.
921 /// \param Step Loop counter increment after each iteration; negative
922 /// means counting down.
923 /// \param IsSigned Whether Start, Stop and Step are signed integers.
924 /// \param InclusiveStop Whether \p Stop itself is a valid value for the loop
925 /// counter.
926 /// \param ComputeIP Insertion point for instructions computing the trip
927 /// count. Can be used to ensure the trip count is available
928 /// at the outermost loop of a loop nest. If not set,
929 /// defaults to the preheader of the generated loop.
930 /// \param Name Base name used to derive BB and instruction names.
931 /// \param InScan Whether loop has a scan reduction specified.
932 /// \param ScanRedInfo Pointer to the ScanInfo object created using
933 /// `ScanInfoInitialize`.
934 ///
935 /// \returns An object representing the created control flow structure which
936 /// can be used for loop-associated directives.
937 LLVM_ABI Expected<CanonicalLoopInfo *> createCanonicalLoop(
938 const LocationDescription &Loc, LoopBodyGenCallbackTy BodyGenCB,
939 Value *Start, Value *Stop, Value *Step, bool IsSigned, bool InclusiveStop,
940 InsertPointTy ComputeIP = {}, const Twine &Name = "loop",
941 bool InScan = false, ScanInfo *ScanRedInfo = nullptr);
942
943 /// Collapse a loop nest into a single loop.
944 ///
945 /// Merges loops of a loop nest into a single CanonicalLoopNest representation
946 /// that has the same number of innermost loop iterations as the origin loop
947 /// nest. The induction variables of the input loops are derived from the
948 /// collapsed loop's induction variable. This is intended to be used to
949 /// implement OpenMP's collapse clause. Before applying a directive,
950 /// collapseLoops normalizes a loop nest to contain only a single loop and the
951 /// directive's implementation does not need to handle multiple loops itself.
952 /// This does not remove the need to handle all loop nest handling by
953 /// directives, such as the ordered(<n>) clause or the simd schedule-clause
954 /// modifier of the worksharing-loop directive.
955 ///
956 /// Example:
957 /// \code
958 /// for (int i = 0; i < 7; ++i) // Canonical loop "i"
959 /// for (int j = 0; j < 9; ++j) // Canonical loop "j"
960 /// body(i, j);
961 /// \endcode
962 ///
963 /// After collapsing with Loops={i,j}, the loop is changed to
964 /// \code
965 /// for (int ij = 0; ij < 63; ++ij) {
966 /// int i = ij / 9;
967 /// int j = ij % 9;
968 /// body(i, j);
969 /// }
970 /// \endcode
971 ///
972 /// In the current implementation, the following limitations apply:
973 ///
974 /// * All input loops have an induction variable of the same type.
975 ///
976 /// * The collapsed loop will have the same trip count integer type as the
977 /// input loops. Therefore it is possible that the collapsed loop cannot
978 /// represent all iterations of the input loops. For instance, assuming a
979 /// 32 bit integer type, and two input loops both iterating 2^16 times, the
980 /// theoretical trip count of the collapsed loop would be 2^32 iterations,
981 /// which cannot be represented in a 32-bit integer. Behavior is undefined
982 /// in this case.
983 ///
984 /// * The trip counts of every input loop must be available at \p ComputeIP.
985 /// Non-rectangular loops are not yet supported.
986 ///
987 /// * At each nest level, code between a surrounding loop and its nested loop
988 /// is hoisted into the loop body, and such code will be executed more
989 /// often than before collapsing (or not at all if any inner loop iteration
990 /// has a trip count of 0). This is permitted by the OpenMP specification.
991 ///
992 /// \param DL Debug location for instructions added for collapsing,
993 /// such as instructions to compute/derive the input loop's
994 /// induction variables.
995 /// \param Loops Loops in the loop nest to collapse. Loops are specified
996 /// from outermost-to-innermost and every control flow of a
997 /// loop's body must pass through its directly nested loop.
998 /// \param ComputeIP Where additional instructions that compute the collapsed
999 /// trip count are inserted. If not set, defaults to before the
1000 /// generated loop.
1001 ///
1002 /// \returns The CanonicalLoopInfo object representing the collapsed loop.
1003 LLVM_ABI CanonicalLoopInfo *
1004 collapseLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1005 InsertPointTy ComputeIP);
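/// Hedged usage sketch (LoopI and LoopJ are the CanonicalLoopInfo objects of
/// the example above, obtained from createCanonicalLoop):
/// \code
///   CanonicalLoopInfo *Collapsed =
///       OMPBuilder.collapseLoops(DL, {LoopI, LoopJ}, /*ComputeIP=*/{});
///   // Collapsed can now be handed to a loop-associated directive, e.g.
///   // applyWorkshareLoop.
/// \endcode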
1006
1007 /// Get the default alignment value for given target
1008 ///
1009 /// \param TargetTriple Target triple
1010 /// \param Features StringMap which describes extra CPU features
1011 LLVM_ABI static unsigned
1012 getOpenMPDefaultSimdAlign(const Triple &TargetTriple,
1013 const StringMap<bool> &Features);
1014
1015 /// Retrieve (or create if non-existent) the address of a declare
1016 /// target variable, used in conjunction with registerTargetGlobalVariable
1017 /// to create declare target global variables.
1018 ///
1019 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
1020 /// clause used in conjunction with the variable being registered (link,
1021 /// to, enter).
1022 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
1023 /// clause used in conjunction with the variable being registered (nohost,
1024 /// host, any)
1025 /// \param IsDeclaration - boolean stating if the variable being registered
1026 /// is a declaration-only and not a definition
1027 /// \param IsExternallyVisible - boolean stating if the variable is externally
1028 /// visible
1029 /// \param EntryInfo - Unique entry information for the value generated
1030 /// using getTargetEntryUniqueInfo, used to name generated pointer references
1031 /// to the declare target variable
1032 /// \param MangledName - the mangled name of the variable being registered
1033 /// \param GeneratedRefs - references generated by invocations of
1034 /// registerTargetGlobalVariable invoked from getAddrOfDeclareTargetVar,
1035 /// these are required by Clang for bookkeeping.
1036 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
1037 /// \param TargetTriple - The OpenMP device target triple we are compiling
1038 /// for
1039 /// \param LlvmPtrTy - The type of the variable we are generating or
1040 /// retrieving an address for
1041 /// \param GlobalInitializer - a lambda function which creates a constant
1042 /// used for initializing a pointer reference to the variable in certain
1043 /// cases. If a nullptr is passed, it will default to utilising the original
1044 /// variable to initialize the pointer reference.
1045 /// \param VariableLinkage - a lambda function which returns the variables
1046 /// linkage type, if unspecified and a nullptr is given, it will instead
1047 /// utilise the linkage stored on the existing global variable in the
1048 /// LLVMModule.
1049 LLVM_ABI Constant *getAddrOfDeclareTargetVar(
1050 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
1051 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
1052 bool IsDeclaration, bool IsExternallyVisible,
1053 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
1054 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
1055 std::vector<Triple> TargetTriple, Type *LlvmPtrTy,
1056 std::function<Constant *()> GlobalInitializer,
1057 std::function<GlobalValue::LinkageTypes()> VariableLinkage);
1058
1059 /// Registers a target variable for device or host.
1060 ///
1061 /// \param CaptureClause - enumerator corresponding to the OpenMP capture
1062 /// clause used in conjunction with the variable being registered (link,
1063 /// to, enter).
1064 /// \param DeviceClause - enumerator corresponding to the OpenMP capture
1065 /// clause used in conjunction with the variable being registered (nohost,
1066 /// host, any)
1067 /// \param IsDeclaration - boolean stating if the variable being registered
1068 /// is a declaration-only and not a definition
1069 /// \param IsExternallyVisible - boolean stating if the variable is externally
1070 /// visible
1071 /// \param EntryInfo - Unique entry information for the value generated
1072 /// using getTargetEntryUniqueInfo, used to name generated pointer references
1073 /// to the declare target variable
1074 /// \param MangledName - the mangled name of the variable being registered
1075 /// \param GeneratedRefs - references generated by invocations of
1076 /// registerTargetGlobalVariable; these are required by Clang for
1077 /// bookkeeping.
1078 /// \param OpenMPSIMD - if OpenMP SIMD mode is currently enabled
1079 /// \param TargetTriple - The OpenMP device target triple we are compiling
1080 /// for
1081 /// \param GlobalInitializer - a lambda function which creates a constant
1082 /// used for initializing a pointer reference to the variable in certain
1083 /// cases. If a nullptr is passed, it will default to utilising the original
1084 /// variable to initialize the pointer reference.
1085 /// \param VariableLinkage - a lambda function which returns the variables
1086 /// linkage type, if unspecified and a nullptr is given, it will instead
1087 /// utilise the linkage stored on the existing global variable in the
1088 /// LLVMModule.
1089 /// \param LlvmPtrTy - The type of the variable we are generating or
1090 /// retrieving an address for
1091 /// \param Addr - the original llvm value (addr) of the variable to be
1092 /// registered
1093 LLVM_ABI void registerTargetGlobalVariable(
1094 OffloadEntriesInfoManager::OMPTargetGlobalVarEntryKind CaptureClause,
1095 OffloadEntriesInfoManager::OMPTargetDeviceClauseKind DeviceClause,
1096 bool IsDeclaration, bool IsExternallyVisible,
1097 TargetRegionEntryInfo EntryInfo, StringRef MangledName,
1098 std::vector<GlobalVariable *> &GeneratedRefs, bool OpenMPSIMD,
1099 std::vector<Triple> TargetTriple,
1100 std::function<Constant *()> GlobalInitializer,
1101 std::function<GlobalValue::LinkageTypes()> VariableLinkage,
1102 Type *LlvmPtrTy, Constant *Addr);
1103
1104 /// Get the offset of the OMP_MAP_MEMBER_OF field.
1105 LLVM_ABI unsigned getFlagMemberOffset();
1106
1107 /// Get OMP_MAP_MEMBER_OF flag with extra bits reserved based on
1108 /// the position given.
1109 /// \param Position - A value indicating the position of the parent
1110 /// of the member in the kernel argument structure, often retrieved
1111 /// by the parent's position in the combined information vectors used
1112 /// to generate the structure itself. Multiple children (members of)
1113 /// with the same parent will use the same returned member flag.
1114 LLVM_ABI omp::OpenMPOffloadMappingFlags getMemberOfFlag(unsigned Position);
1115
1116 /// Given an initial flag set, this function modifies it to contain
1117 /// the passed in MemberOfFlag generated from the getMemberOfFlag
1118 /// function. The results are dependent on the existing flag bits
1119 /// set in the original flag set.
1120 /// \param Flags - The original set of flags to be modified with the
1121 /// passed in MemberOfFlag.
1122 /// \param MemberOfFlag - A modified OMP_MAP_MEMBER_OF flag, adjusted
1123 /// slightly based on the getMemberOfFlag which adjusts the flag bits
1124 /// based on the members position in its parent.
1125 LLVM_ABI void
1126 setCorrectMemberOfFlag(omp::OpenMPOffloadMappingFlags &Flags,
1127 omp::OpenMPOffloadMappingFlags MemberOfFlag);
1128
1129private:
1130 /// Modifies the canonical loop to be a statically-scheduled workshare loop
1131 /// which is executed on the device
1132 ///
1133 /// This takes a \p CLI representing a canonical loop, such as the one
1134 /// created by \see createCanonicalLoop and emits additional instructions to
1135 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1136 /// runtime function in the preheader to call OpenMP device rtl function
1137 /// which handles worksharing of loop body iterations.
1138 ///
1139 /// \param DL Debug location for instructions added for the
1140 /// workshare-loop construct itself.
1141 /// \param CLI A descriptor of the canonical loop to workshare.
1142 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1143 /// preheader of the loop.
1144 /// \param LoopType Information about type of loop worksharing.
1145 /// It corresponds to type of loop workshare OpenMP pragma.
1146 /// \param NoLoop If true, no-loop code is generated.
1147 ///
1148 /// \returns Point where to insert code after the workshare construct.
1149 InsertPointTy applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
1150 InsertPointTy AllocaIP,
1151 omp::WorksharingLoopType LoopType,
1152 bool NoLoop);
1153
1154 /// Modifies the canonical loop to be a statically-scheduled workshare loop.
1155 ///
1156 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1157 /// created by \p createCanonicalLoop and emits additional instructions to
1158 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1159 /// runtime function in the preheader to obtain the loop bounds to be used in
1160 /// the current thread, updates the relevant instructions in the canonical
1161 /// loop and calls to an OpenMP runtime finalization function after the loop.
1162 ///
1163 /// \param DL Debug location for instructions added for the
1164 /// workshare-loop construct itself.
1165 /// \param CLI A descriptor of the canonical loop to workshare.
1166 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1167 /// preheader of the loop.
1168 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1169 /// the loop.
1170 /// \param LoopType Type of workshare loop.
1171 /// \param HasDistSchedule Defines if the clause being lowered is
1172 /// dist_schedule as this is handled slightly differently
1173 /// \param DistScheduleSchedType Defines the Schedule Type for the Distribute
1174 /// loop. Defaults to None if no Distribute loop is present.
1175 ///
1176 /// \returns Point where to insert code after the workshare construct.
1177 InsertPointOrErrorTy applyStaticWorkshareLoop(
1178 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1179 omp::WorksharingLoopType LoopType, bool NeedsBarrier,
1180 bool HasDistSchedule = false,
1181 omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);
1182
1183 /// Modifies the canonical loop to be a statically-scheduled workshare loop with a
1184 /// user-specified chunk size.
1185 ///
1186 /// \param DL Debug location for instructions added for the
1187 /// workshare-loop construct itself.
1188 /// \param CLI A descriptor of the canonical loop to workshare.
1189 /// \param AllocaIP An insertion point for Alloca instructions usable in
1190 /// the preheader of the loop.
1191 /// \param NeedsBarrier Indicates whether a barrier must be inserted after the
1192 /// loop.
1193 /// \param ChunkSize The user-specified chunk size.
1194 /// \param SchedType Optional type of scheduling to be passed to the init
1195 /// function.
1196 /// \param DistScheduleChunkSize The size of the dist_schedule chunk considered
1197 /// as a unit when scheduling. If \p nullptr,
1198 /// defaults to 1.
1199 /// \param DistScheduleSchedType Defines the Schedule Type for the Distribute
1200 /// loop. Defaults to None if no Distribute loop is present.
1201 ///
1202 /// \returns Point where to insert code after the workshare construct.
1203 InsertPointOrErrorTy applyStaticChunkedWorkshareLoop(
1204 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1205 bool NeedsBarrier, Value *ChunkSize,
1206 omp::OMPScheduleType SchedType =
1208 Value *DistScheduleChunkSize = nullptr,
1209 omp::OMPScheduleType DistScheduleSchedType = omp::OMPScheduleType::None);
1210
1211 /// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
1212 ///
1213 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1214 /// created by \p createCanonicalLoop and emits additional instructions to
1215 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1216 /// runtime function in the preheader to obtain, and then in each iteration
1217 /// to update the loop counter.
1218 ///
1219 /// \param DL Debug location for instructions added for the
1220 /// workshare-loop construct itself.
1221 /// \param CLI A descriptor of the canonical loop to workshare.
1222 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1223 /// preheader of the loop.
1224 /// \param SchedType Type of scheduling to be passed to the init function.
1225 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1226 /// the loop.
1227 /// \param Chunk The size of loop chunk considered as a unit when
1228 /// scheduling. If \p nullptr, defaults to 1.
1229 ///
1230 /// \returns Point where to insert code after the workshare construct.
1231 InsertPointOrErrorTy applyDynamicWorkshareLoop(DebugLoc DL,
1232 CanonicalLoopInfo *CLI,
1233 InsertPointTy AllocaIP,
1234 omp::OMPScheduleType SchedType,
1235 bool NeedsBarrier,
1236 Value *Chunk = nullptr);
1237
1238 /// Create alternative version of the loop to support if clause
1239 ///
1240 /// The OpenMP if clause can require generating a second loop. This loop
1241 /// will be executed when the if clause condition is not met. createIfVersion
1242 /// adds a branch instruction to the copied loop if \p ifCond is not met.
1243 ///
1244 /// \param Loop Original loop which should be versioned.
1245 /// \param IfCond Value which corresponds to if clause condition
1246 /// \param VMap Value to value map to define relation between
1247 /// original and copied loop values and loop blocks.
1248 /// \param NamePrefix Optional name prefix for if.then if.else blocks.
1249 void createIfVersion(CanonicalLoopInfo *Loop, Value *IfCond,
1250 ValueToValueMapTy &VMap,
1251 LoopAnalysis &LIA, LoopInfo &LI, llvm::Loop *L,
1252 const Twine &NamePrefix = "");
1253
1254 /// Creates a task duplication function to be passed to kmpc_taskloop.
1255 ///
1256 /// The OpenMP runtime defines this function as taking the destination
1257 /// kmp_task_t, source kmp_task_t, and a lastprivate flag. This function is
1258 /// called on the source and destination tasks after the source task has been
1259 /// duplicated to create the destination task. At this point the destination
1260 /// task has been otherwise set up from the runtime's perspective, but this
1261 /// function is needed to fix up any data for the duplicated task e.g. private
1262 /// variables with non-trivial constructors.
1263 ///
1264 /// \param PrivatesTy The type of the privates structure for the task.
1265 /// \param PrivatesIndex The index inside the privates structure containing
1266 /// the data for the callback.
1267 /// \param DupCB The callback to generate the duplication code. See
1268 /// documentation for \ref TaskDupCallbackTy. This can be
1269 /// nullptr.
1270 Expected<Value *> createTaskDuplicationFunction(Type *PrivatesTy,
1271 int32_t PrivatesIndex,
1272 TaskDupCallbackTy DupCB);
1273
1274public:
1275 /// Modifies the canonical loop to be a workshare loop.
1276 ///
1277 /// This takes a \p LoopInfo representing a canonical loop, such as the one
1278 /// created by \p createCanonicalLoop and emits additional instructions to
1279 /// turn it into a workshare loop. In particular, it calls to an OpenMP
1280 /// runtime function in the preheader to obtain the loop bounds to be used in
1281 /// the current thread, updates the relevant instructions in the canonical
1282 /// loop and calls to an OpenMP runtime finalization function after the loop.
1283 ///
1284 /// The concrete transformation is done by applyStaticWorkshareLoop,
1285 /// applyStaticChunkedWorkshareLoop, or applyDynamicWorkshareLoop, depending
1286 /// on the value of \p SchedKind and \p ChunkSize.
1287 ///
1288 /// \param DL Debug location for instructions added for the
1289 /// workshare-loop construct itself.
1290 /// \param CLI A descriptor of the canonical loop to workshare.
1291 /// \param AllocaIP An insertion point for Alloca instructions usable in the
1292 /// preheader of the loop.
1293 /// \param NeedsBarrier Indicates whether a barrier must be inserted after
1294 /// the loop.
1295 /// \param SchedKind Scheduling algorithm to use.
1296 /// \param ChunkSize The chunk size for the inner loop.
1297 /// \param HasSimdModifier Whether the simd modifier is present in the
1298 /// schedule clause.
1299 /// \param HasMonotonicModifier Whether the monotonic modifier is present in
1300 /// the schedule clause.
1301 /// \param HasNonmonotonicModifier Whether the nonmonotonic modifier is
1302 /// present in the schedule clause.
1303 /// \param HasOrderedClause Whether the (parameterless) ordered clause is
1304 /// present.
1305 /// \param LoopType Information about type of loop worksharing.
1306 /// It corresponds to type of loop workshare OpenMP pragma.
1307 /// \param NoLoop If true, no-loop code is generated.
1308 /// \param HasDistSchedule Defines if the clause being lowered is
1309 /// dist_schedule as this is handled slightly differently
1310 ///
1311 /// \param DistScheduleChunkSize The chunk size for dist_schedule loop
1312 ///
1313 /// \returns Point where to insert code after the workshare construct.
1314 LLVM_ABI InsertPointOrErrorTy applyWorkshareLoop(
1315 DebugLoc DL, CanonicalLoopInfo *CLI, InsertPointTy AllocaIP,
1316 bool NeedsBarrier,
1317 llvm::omp::ScheduleKind SchedKind = llvm::omp::OMP_SCHEDULE_Default,
1318 Value *ChunkSize = nullptr, bool HasSimdModifier = false,
1319 bool HasMonotonicModifier = false, bool HasNonmonotonicModifier = false,
1320 bool HasOrderedClause = false,
1321 omp::WorksharingLoopType LoopType =
1323 bool NoLoop = false, bool HasDistSchedule = false,
1324 Value *DistScheduleChunkSize = nullptr);
1325
1326 /// Tile a loop nest.
1327 ///
1328 /// Tiles the loops of \p Loops by the tile sizes in \p TileSizes. Loops in
1329 /// \p Loops must be perfectly nested, from outermost to innermost loop
1330 /// (i.e. Loops.front() is the outermost loop). The trip count llvm::Value
1331 /// of every loop and every tile sizes must be usable in the outermost
1332 /// loop's preheader. This implies that the loop nest is rectangular.
1333 ///
1334 /// Example:
1335 /// \code
1336 /// for (int i = 0; i < 15; ++i) // Canonical loop "i"
1337 /// for (int j = 0; j < 14; ++j) // Canonical loop "j"
1338 /// body(i, j);
1339 /// \endcode
1340 ///
1341 /// After tiling with Loops={i,j} and TileSizes={5,7}, the loop is changed to
1342 /// \code
1343 /// for (int i1 = 0; i1 < 3; ++i1)
1344 /// for (int j1 = 0; j1 < 2; ++j1)
1345 /// for (int i2 = 0; i2 < 5; ++i2)
1346 /// for (int j2 = 0; j2 < 7; ++j2)
1347 /// body(i1*5+i2, j1*7+j2);
1348 /// \endcode
1349 ///
1350 /// The returned vector contains the loops {i1,j1,i2,j2}. The loops i1 and j1
1351 /// are the floor loops, and the loops i2 and j2 are the tile loops. Tiling also
1352 /// handles non-constant trip counts, non-constant tile sizes and trip counts
1353 /// that are not multiples of the tile size. In the latter case the tile loop
1354 /// of the last floor-loop iteration will have fewer iterations than specified
1355 /// as its tile size.
1356 ///
1357 ///
1358 /// @param DL Debug location for instructions added by tiling, for
1359 /// instance the floor- and tile trip count computation.
1360 /// @param Loops Loops to tile. The CanonicalLoopInfo objects are
1361 /// invalidated by this method, i.e. should not be used after
1362 /// tiling.
1363 /// @param TileSizes For each loop in \p Loops, the tile size for that
1364 /// dimension.
1365 ///
1366 /// \returns A list of generated loops. Contains twice as many loops as the
1367 /// input loop nest; the first half are the floor loops and the
1368 /// second half are the tile loops.
1369 LLVM_ABI std::vector<CanonicalLoopInfo *>
1370 tileLoops(DebugLoc DL, ArrayRef<CanonicalLoopInfo *> Loops,
1371 ArrayRef<Value *> TileSizes);
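/// Hedged usage sketch (LoopI/LoopJ are existing CanonicalLoopInfo objects;
/// the tile sizes shown are illustrative):
/// \code
///   Value *TileI = Builder.getInt32(5);
///   Value *TileJ = Builder.getInt32(7);
///   std::vector<CanonicalLoopInfo *> Tiled =
///       OMPBuilder.tileLoops(DL, {LoopI, LoopJ}, {TileI, TileJ});
///   // Tiled = {floor_i, floor_j, tile_i, tile_j}; the input loops are
///   // invalidated and must not be used afterwards.
/// \endcode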
1372
1373 /// Fully unroll a loop.
1374 ///
1375 /// Instead of unrolling the loop immediately (and duplicating its body
1376 /// instructions), it is deferred to LLVM's LoopUnrollPass by adding loop
1377 /// metadata.
1378 ///
1379 /// \param DL Debug location for instructions added by unrolling.
1380 /// \param Loop The loop to unroll. The loop will be invalidated.
1381 LLVM_ABI void unrollLoopFull(DebugLoc DL, CanonicalLoopInfo *Loop);
1382
1383 /// Fully or partially unroll a loop. How the loop is unrolled is determined
1384 /// using LLVM's LoopUnrollPass.
1385 ///
1386 /// \param DL Debug location for instructions added by unrolling.
1387 /// \param Loop The loop to unroll. The loop will be invalidated.
1388 LLVM_ABI void unrollLoopHeuristic(DebugLoc DL, CanonicalLoopInfo *Loop);
1389
1390 /// Partially unroll a loop.
1391 ///
1392 /// The CanonicalLoopInfo of the unrolled loop for use with chained
1393 /// loop-associated directive can be requested using \p UnrolledCLI. Not
1394 /// needing the CanonicalLoopInfo allows more efficient code generation by
1395 /// deferring the actual unrolling to the LoopUnrollPass using loop metadata.
1396 /// A loop-associated directive applied to the unrolled loop needs to know the
1397 /// new trip count which means that if using a heuristically determined unroll
1398 /// factor (\p Factor == 0), that factor must be computed immediately. We are
1399 /// using the same logic as the LoopUnrollPass to derive the unroll factor,
1400 /// which however assumes that some canonicalization has taken place (e.g.
1401 /// Mem2Reg, LICM, GVN, Inlining, etc.). That is, the heuristic will perform
1402 /// better when the unrolled loop's CanonicalLoopInfo is not needed.
1403 ///
1404 /// \param DL Debug location for instructions added by unrolling.
1405 /// \param Loop The loop to unroll. The loop will be invalidated.
1406 /// \param Factor The factor to unroll the loop by. A factor of 0
1407 /// indicates that a heuristic should be used to determine
1408 /// the unroll-factor.
1409 /// \param UnrolledCLI If non-null, receives the CanonicalLoopInfo of the
1410 /// partially unrolled loop. Otherwise, uses loop metadata
1411 /// to defer unrolling to the LoopUnrollPass.
1413 int32_t Factor,
1414 CanonicalLoopInfo **UnrolledCLI);
1415
1416 /// Add metadata to simd-ize a loop. If IfCond is not nullptr, the loop
1417 /// is cloned. The metadata which prevents vectorization is added to
1418 /// the cloned loop. The cloned loop is executed when IfCond evaluates
1419 /// to false.
1420 ///
1421 /// \param Loop The loop to simd-ize.
1422 /// \param AlignedVars The map which contains pairs of the pointer
1423 /// and its corresponding alignment.
1424 /// \param IfCond The value which corresponds to the if clause
1425 /// condition.
1426 /// \param Order The enum to map order clause.
1427 /// \param Simdlen The Simdlen length to apply to the simd loop.
1428 /// \param Safelen The Safelen length to apply to the simd loop.
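 ///
 /// Example usage (an illustrative sketch, not part of the original header):
 /// assuming \c OMPBuilder is an initialized OpenMPIRBuilder, \c Builder is
 /// its underlying IRBuilder and \c CLI was produced by createCanonicalLoop,
 /// a plain 'simd simdlen(8) safelen(8)' without aligned, if or order
 /// clauses could be applied as follows; all names are illustrative only.
 /// \code
 ///   MapVector<Value *, Value *> AlignedVars; // empty: no 'aligned' clause
 ///   OMPBuilder.applySimd(CLI, AlignedVars, /*IfCond=*/nullptr,
 ///                        omp::OrderKind::OMP_ORDER_unknown,
 ///                        /*Simdlen=*/Builder.getInt32(8),
 ///                        /*Safelen=*/Builder.getInt32(8));
 /// \endcode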
1430 MapVector<Value *, Value *> AlignedVars,
1431 Value *IfCond, omp::OrderKind Order,
1432 ConstantInt *Simdlen, ConstantInt *Safelen);
1433
1434 /// Generator for '#omp flush'
1435 ///
1436 /// \param Loc The location where the flush directive was encountered
1437 LLVM_ABI void createFlush(const LocationDescription &Loc);
1438
1439 /// Generator for '#omp taskwait'
1440 ///
1441 /// \param Loc The location where the taskwait directive was encountered.
1442 LLVM_ABI void createTaskwait(const LocationDescription &Loc);
1443
1444 /// Generator for '#omp taskyield'
1445 ///
1446 /// \param Loc The location where the taskyield directive was encountered.
1447 LLVM_ABI void createTaskyield(const LocationDescription &Loc);
1448
1449 /// A struct to pack the relevant information for an OpenMP depend clause.
1459
1460 /// Generator for `#omp taskloop`
1461 ///
1462 /// \param Loc The location where the taskloop construct was encountered.
1463 /// \param AllocaIP The insertion point to be used for alloca instructions.
1464 /// \param BodyGenCB Callback that will generate the region code.
1465 /// \param LoopInfo Callback that returns the CLI
1466 /// \param LBVal Lower bound value of the loop
1467 /// \param UBVal Upper bound value of the loop
1468 /// \param StepVal Step value of the loop
1469 /// \param Untied True if the task is untied, false if the task is tied.
1470 /// \param IfCond i1 value. If it evaluates to `false`, an undeferred
1471 /// task is generated, and the encountering thread must
1472 /// suspend the current task region, for which execution
1473 /// cannot be resumed until execution of the structured
1474 /// block that is associated with the generated task is
1475 /// completed.
1476 /// \param GrainSize Value of the GrainSize/Num of Tasks if present
1477 /// \param NoGroup False if NoGroup is defined, true if not
1478 /// \param Sched If GrainSize is defined, Sched is 1; if Num Tasks is
1479 /// defined, Sched is 2; otherwise Sched is 0.
1480 /// \param Final i1 value which is `true` if the task is final, `false` if the
1481 /// task is not final.
1482 /// \param Mergeable If the given task is `mergeable`
1483 /// \param Priority `priority-value' specifies the execution order of the
1484 /// tasks generated by the construct
1485 /// \param DupCB The callback to generate the duplication code. See
1486 /// documentation for \ref TaskDupCallbackTy. This can be nullptr.
1487 /// \param TaskContextStructPtrVal If non-null, a pointer to be placed
1488 /// immediately after the {lower bound, upper
1489 /// bound, step} values in the task data.
1490 LLVM_ABI InsertPointOrErrorTy createTaskloop(
1491 const LocationDescription &Loc, InsertPointTy AllocaIP,
1492 BodyGenCallbackTy BodyGenCB,
1494 Value *LBVal, Value *UBVal, Value *StepVal, bool Untied = false,
1495 Value *IfCond = nullptr, Value *GrainSize = nullptr, bool NoGroup = false,
1496 int Sched = 0, Value *Final = nullptr, bool Mergeable = false,
1497 Value *Priority = nullptr, TaskDupCallbackTy DupCB = nullptr,
1498 Value *TaskContextStructPtrVal = nullptr);
1499
1500 /// Generator for `#omp task`
1501 ///
1502 /// \param Loc The location where the task construct was encountered.
1503 /// \param AllocaIP The insertion point to be used for alloca instructions.
1504 /// \param BodyGenCB Callback that will generate the region code.
1505 /// \param Tied True if the task is tied, false if the task is untied.
1506 /// \param Final i1 value which is `true` if the task is final, `false` if the
1507 /// task is not final.
1508 /// \param IfCondition i1 value. If it evaluates to `false`, an undeferred
1509 /// task is generated, and the encountering thread must
1510 /// suspend the current task region, for which execution
1511 /// cannot be resumed until execution of the structured
1512 /// block that is associated with the generated task is
1513 /// completed.
1514 /// \param EventHandle If present, signifies the event handle as part of
1515 /// the detach clause
1516 /// \param Mergeable If the given task is `mergeable`
1517 /// \param Priority `priority-value' specifies the execution order of the
1518 /// tasks generated by the construct
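 ///
 /// Example usage (an illustrative sketch, not part of the original header):
 /// assuming \c OMPBuilder is an initialized OpenMPIRBuilder, \c Builder is
 /// its underlying IRBuilder, and \c Loc / \c AllocaIP have been set up by
 /// the caller, a tied task could be emitted as below. The body callback is
 /// written against the error-returning callback convention used by this
 /// builder; all names are illustrative only.
 /// \code
 ///   auto BodyGenCB = [&](InsertPointTy AllocaIP,
 ///                        InsertPointTy CodeGenIP) -> Error {
 ///     Builder.restoreIP(CodeGenIP);
 ///     // ... emit the structured block of the task here ...
 ///     return Error::success();
 ///   };
 ///   InsertPointOrErrorTy AfterIP =
 ///       OMPBuilder.createTask(Loc, AllocaIP, BodyGenCB, /*Tied=*/true);
 ///   if (!AfterIP)
 ///     return AfterIP.takeError(); // propagate failures to the caller
 ///   Builder.restoreIP(*AfterIP);
 /// \endcode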
1520 createTask(const LocationDescription &Loc, InsertPointTy AllocaIP,
1521 BodyGenCallbackTy BodyGenCB, bool Tied = true,
1522 Value *Final = nullptr, Value *IfCondition = nullptr,
1523 SmallVector<DependData> Dependencies = {}, bool Mergeable = false,
1524 Value *EventHandle = nullptr, Value *Priority = nullptr);
1525
1526 /// Generator for the taskgroup construct
1527 ///
1528 /// \param Loc The location where the taskgroup construct was encountered.
1529 /// \param AllocaIP The insertion point to be used for alloca instructions.
1530 /// \param BodyGenCB Callback that will generate the region code.
1531 LLVM_ABI InsertPointOrErrorTy createTaskgroup(const LocationDescription &Loc,
1532 InsertPointTy AllocaIP,
1533 BodyGenCallbackTy BodyGenCB);
1534
1536 std::function<std::tuple<std::string, uint64_t>()>;
1537
1538 /// Creates a unique info for a target entry when provided a filename and
1539 /// line number from the callback.
1540 ///
1541 /// \param CallBack A callback function which should return the filename the
1542 /// entry resides in as well as the line number for the target entry.
1543 /// \param ParentName The name of the parent the target entry resides in, if
1544 /// any.
1547 vfs::FileSystem &VFS, StringRef ParentName = "");
1548
1549 /// Enum class for the ReductionGen CallBack type to be used.
1551
1552 /// ReductionGen CallBack for Clang
1553 ///
1554 /// \param CodeGenIP InsertPoint for CodeGen.
1555 /// \param Index Index of the ReductionInfo to generate code for.
1556 /// \param LHSPtr Optionally used by Clang to return the LHSPtr it used for
1557 /// codegen, used for fixup later.
1558 /// \param RHSPtr Optionally used by Clang to
1559 /// return the RHSPtr it used for codegen, used for fixup later.
1560 /// \param CurFn Optionally used by Clang to pass in the Current Function as
1561 /// Clang context may be old.
1563 std::function<InsertPointTy(InsertPointTy CodeGenIP, unsigned Index,
1564 Value **LHS, Value **RHS, Function *CurFn)>;
1565
1566 /// ReductionGen CallBack for MLIR
1567 ///
1568 /// \param CodeGenIP InsertPoint for CodeGen.
1569 /// \param LHS Pass in the LHS Value to be used for CodeGen.
1570 /// \param RHS Pass in the RHS Value to be used for CodeGen.
1572 InsertPointTy CodeGenIP, Value *LHS, Value *RHS, Value *&Res)>;
1573
1574 /// Functions used to generate atomic reductions. Such functions take two
1575 /// Values representing pointers to LHS and RHS of the reduction, as well as
1576 /// the element type of these pointers. They are expected to atomically
1577 /// update the LHS to the reduced value.
1579 InsertPointTy, Type *, Value *, Value *)>;
1580
1582 InsertPointTy, Value *ByRefVal, Value *&Res)>;
1583
1584 /// Enum class for reduction evaluation types scalar, complex and aggregate.
1586
1587 /// Information about an OpenMP reduction.
1602
1608
1609 /// Reduction element type, must match pointee type of variable. For by-ref
1610 /// reductions, this would be just an opaque `ptr`.
1612
1613 /// Reduction variable of pointer type.
1615
1616 /// Thread-private partial reduction variable.
1618
1619 /// Reduction evaluation kind - scalar, complex or aggregate.
1621
1622 /// Callback for generating the reduction body. The IR produced by this will
1623 /// be used to combine two values in a thread-safe context, e.g., under
1624 /// lock or within the same thread, and therefore need not be atomic.
1626
1627 /// Clang callback for generating the reduction body. The IR produced by
1628 /// this will be used to combine two values in a thread-safe context, e.g.,
1629 /// under lock or within the same thread, and therefore need not be atomic.
1631
1632 /// Callback for generating the atomic reduction body, may be null. The IR
1633 /// produced by this will be used to atomically combine two values during
1634 /// reduction. If null, the implementation will use the non-atomic version
1635 /// along with the appropriate synchronization mechanisms.
1637
1639
1640 /// For by-ref reductions, we need to keep track of 2 extra types that are
1641 /// potentially different:
1642 /// * The allocated type is the type of the storage allocated by the
1643 /// reduction op's `alloc` region. For example, for allocatables and arrays,
1644 /// this type would be the descriptor/box struct.
1646
1647 /// * The by-ref element type is the type of the actual storage needed for
1648 /// the data of the allocatable or array. For example, a float allocatable
1649 /// would need some float storage to store intermediate reduction
1650 /// results.
1652 };
1653
1654 enum class CopyAction : unsigned {
1655 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1656 // the warp using shuffle instructions.
1658 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1660 };
1661
1667
1668 /// Supporting functions for Reductions CodeGen.
1669private:
1670 /// Get the id of the current thread on the GPU.
1671 Value *getGPUThreadID();
1672
1673 /// Get the GPU warp size.
1674 Value *getGPUWarpSize();
1675
1676 /// Get the id of the warp in the block.
1677 /// We assume that the warp size is 32, which is always the case
1678 /// on the NVPTX device, to generate more efficient code.
1679 Value *getNVPTXWarpID();
1680
1681 /// Get the id of the current lane in the Warp.
1682 /// We assume that the warp size is 32, which is always the case
1683 /// on the NVPTX device, to generate more efficient code.
1684 Value *getNVPTXLaneID();
1685
1686 /// Cast value to the specified type.
1687 Value *castValueToType(InsertPointTy AllocaIP, Value *From, Type *ToType);
1688
1689 /// This function creates calls to one of two shuffle functions to copy
1690 /// variables between lanes in a warp.
1691 Value *createRuntimeShuffleFunction(InsertPointTy AllocaIP, Value *Element,
1692 Type *ElementType, Value *Offset);
1693
1694 /// Function to shuffle over the value from the remote lane.
1695 void shuffleAndStore(InsertPointTy AllocaIP, Value *SrcAddr, Value *DstAddr,
1696 Type *ElementType, Value *Offset, Type *ReductionArrayTy,
1697 bool IsByRefElem);
1698
1699 /// Emit instructions to copy a Reduce list, which contains partially
1700 /// aggregated values, in the specified direction.
1701 Error emitReductionListCopy(
1702 InsertPointTy AllocaIP, CopyAction Action, Type *ReductionArrayTy,
1703 ArrayRef<ReductionInfo> ReductionInfos, Value *SrcBase, Value *DestBase,
1704 ArrayRef<bool> IsByRef,
1705 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr});
1706
1707 /// Emit a helper that reduces data across two OpenMP threads (lanes)
1708 /// in the same warp. It uses shuffle instructions to copy over data from
1709 /// a remote lane's stack. The reduction algorithm performed is specified
1710 /// by the fourth parameter.
1711 ///
1712 /// Algorithm Versions.
1713 /// Full Warp Reduce (argument value 0):
1714 /// This algorithm assumes that all 32 lanes are active and gathers
1715 /// data from these 32 lanes, producing a single resultant value.
1716 /// Contiguous Partial Warp Reduce (argument value 1):
1717 /// This algorithm assumes that only a *contiguous* subset of lanes
1718 /// are active. This happens for the last warp in a parallel region
1719 /// when the user specified num_threads is not an integer multiple of
1720 /// 32. This contiguous subset always starts with the zeroth lane.
1721 /// Partial Warp Reduce (argument value 2):
1722 /// This algorithm gathers data from any number of lanes at any position.
1723 /// All reduced values are stored in the lowest possible lane. The set
1724 /// of problems every algorithm addresses is a super set of those
1725 /// addressable by algorithms with a lower version number. Overhead
1726 /// increases as algorithm version increases.
1727 ///
1728 /// Terminology
1729 /// Reduce element:
1730 /// Reduce element refers to the individual data field with primitive
1731 /// data types to be combined and reduced across threads.
1732 /// Reduce list:
1733 /// Reduce list refers to a collection of local, thread-private
1734 /// reduce elements.
1735 /// Remote Reduce list:
1736 /// Remote Reduce list refers to a collection of remote (relative to
1737 /// the current thread) reduce elements.
1738 ///
1739 /// We distinguish between three states of threads that are important to
1740 /// the implementation of this function.
1741 /// Alive threads:
1742 /// Threads in a warp executing the SIMT instruction, as distinguished from
1743 /// threads that are inactive due to divergent control flow.
1744 /// Active threads:
1745 /// The minimal set of threads that has to be alive upon entry to this
1746 /// function. The computation is correct iff active threads are alive.
1747 /// Some threads are alive but they are not active because they do not
1748 /// contribute to the computation in any useful manner. Turning them off
1749 /// may introduce control flow overheads without any tangible benefits.
1750 /// Effective threads:
1751 /// In order to comply with the argument requirements of the shuffle
1752 /// function, we must keep all lanes holding data alive. But at most
1753 /// half of them perform value aggregation; we refer to this half of
1754 /// threads as effective. The other half is simply handing off their
1755 /// data.
1756 ///
1757 /// Procedure
1758 /// Value shuffle:
1759 /// In this step active threads transfer data from higher lane positions
1760 /// in the warp to lower lane positions, creating Remote Reduce list.
1761 /// Value aggregation:
1762 /// In this step, effective threads combine their thread local Reduce list
1763 /// with Remote Reduce list and store the result in the thread local
1764 /// Reduce list.
1765 /// Value copy:
1766 /// In this step, we deal with the assumption made by algorithm 2
1767 /// (i.e. contiguity assumption). When we have an odd number of lanes
1768 /// active, say 2k+1, only k threads will be effective and therefore k
1769 /// new values will be produced. However, the Reduce list owned by the
1770 /// (2k+1)th thread is ignored in the value aggregation. Therefore
1771 /// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
1772 /// that the contiguity assumption still holds.
1773 ///
1774 /// \param ReductionInfos Array type containing the ReductionOps.
1775 /// \param ReduceFn The reduction function.
1776 /// \param FuncAttrs Optional param to specify any function attributes that
1777 /// need to be copied to the new function.
1778 /// \param IsByRef For each reduction clause, whether the reduction is by-ref
1779 /// or not.
1780 ///
1781 /// \return The ShuffleAndReduce function.
1782 Expected<Function *> emitShuffleAndReduceFunction(
1784 Function *ReduceFn, AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1785
1786 /// Helper function for CreateCanonicalScanLoops to create the input loop
1787 /// via the first generator and the scan loop via the second generator.
1788 /// \param InputLoopGen Callback for generating the loop for the input phase
1789 /// \param ScanLoopGen Callback for generating the loop for the scan phase
1790 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1791 /// `ScanInfoInitialize`.
1792 ///
1793 /// \return error if any produced, else return success.
1794 Error emitScanBasedDirectiveIR(
1795 llvm::function_ref<Error()> InputLoopGen,
1796 llvm::function_ref<Error(LocationDescription Loc)> ScanLoopGen,
1797 ScanInfo *ScanRedInfo);
1798
1799 /// Creates the basic blocks required for scan reduction.
1800 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1801 /// `ScanInfoInitialize`.
1802 void createScanBBs(ScanInfo *ScanRedInfo);
1803
1804 /// Dynamically allocates the buffer needed for scan reduction.
1805 /// \param AllocaIP The IP where possibly-shared pointer of buffer needs to
1806 /// be declared.
1807 /// \param ScanVars Scan Variables.
1808 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1809 /// `ScanInfoInitialize`.
1810 ///
1811 /// \return error if any produced, else return success.
1812 Error emitScanBasedDirectiveDeclsIR(InsertPointTy AllocaIP,
1813 ArrayRef<llvm::Value *> ScanVars,
1814 ArrayRef<llvm::Type *> ScanVarsType,
1815 ScanInfo *ScanRedInfo);
1816
1817 /// Copies the result back to the reduction variable.
1818 /// \param ReductionInfos Array type containing the ReductionOps.
1819 /// \param ScanRedInfo Pointer to the ScanInfo object created using
1820 /// `ScanInfoInitialize`.
1821 ///
1822 /// \return error if any produced, else return success.
1823 Error emitScanBasedDirectiveFinalsIR(
1826
1827 /// This function emits a helper that gathers Reduce lists from the first
1828 /// lane of every active warp to lanes in the first warp.
1829 ///
1830 /// void inter_warp_copy_func(void* reduce_data, num_warps)
1831 /// shared smem[warp_size];
1832 /// For all data entries D in reduce_data:
1833 /// sync
1834 /// If (I am the first lane in each warp)
1835 /// Copy my local D to smem[warp_id]
1836 /// sync
1837 /// if (I am the first warp)
1838 /// Copy smem[thread_id] to my local D
1839 ///
1840 /// \param Loc The insert and source location description.
1841 /// \param ReductionInfos Array type containing the ReductionOps.
1842 /// \param FuncAttrs Optional param to specify any function attributes that
1843 /// need to be copied to the new function.
1844 /// \param IsByRef For each reduction clause, whether the reduction is by-ref
1845 /// or not.
1846 ///
1847 /// \return The InterWarpCopy function.
1849 emitInterWarpCopyFunction(const LocationDescription &Loc,
1850 ArrayRef<ReductionInfo> ReductionInfos,
1851 AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1852
1853 /// This function emits a helper that copies all the reduction variables from
1854 /// the team into the provided global buffer for the reduction variables.
1855 ///
1856 /// void list_to_global_copy_func(void *buffer, int Idx, void *reduce_data)
1857 /// For all data entries D in reduce_data:
1858 /// Copy local D to buffer.D[Idx]
1859 ///
1860 /// \param ReductionInfos Array type containing the ReductionOps.
1861 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1862 /// \param FuncAttrs Optional param to specify any function attributes that
1863 /// need to be copied to the new function.
1864 ///
1865 /// \return The ListToGlobalCopy function.
1867 emitListToGlobalCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1868 Type *ReductionsBufferTy,
1869 AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1870
1871 /// This function emits a helper that copies all the reduction variables from
1872 /// the provided global buffer back into the team's reduction variables.
1873 ///
1874 /// void global_to_list_copy_func(void *buffer, int Idx, void *reduce_data)
1875 /// For all data entries D in reduce_data:
1876 /// Copy buffer.D[Idx] to local D;
1877 ///
1878 /// \param ReductionInfos Array type containing the ReductionOps.
1879 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1880 /// \param FuncAttrs Optional param to specify any function attributes that
1881 /// need to be copied to the new function.
1882 ///
1883 /// \return The GlobalToList function.
1885 emitGlobalToListCopyFunction(ArrayRef<ReductionInfo> ReductionInfos,
1886 Type *ReductionsBufferTy,
1887 AttributeList FuncAttrs, ArrayRef<bool> IsByRef);
1888
1889 /// This function emits a helper that reduces all the reduction variables from
1890 /// the team into the provided global buffer for the reduction variables.
1891 ///
1892 /// void list_to_global_reduce_func(void *buffer, int Idx, void *reduce_data)
1893 /// void *GlobPtrs[];
1894 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1895 /// ...
1896 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1897 /// reduce_function(GlobPtrs, reduce_data);
1898 ///
1899 /// \param ReductionInfos Array type containing the ReductionOps.
1900 /// \param ReduceFn The reduction function.
1901 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1902 /// \param FuncAttrs Optional param to specify any function attributes that
1903 /// need to be copied to the new function.
1904 ///
1905 /// \return The ListToGlobalReduce function.
1907 emitListToGlobalReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1908 Function *ReduceFn, Type *ReductionsBufferTy,
1909 AttributeList FuncAttrs,
1910 ArrayRef<bool> IsByRef);
1911
1912 /// This function emits a helper that reduces all the reduction variables in
1913 /// the provided global buffer into the team's reduction variables.
1914 ///
1915 /// void global_to_list_reduce_func(void *buffer, int Idx, void *reduce_data)
1916 /// void *GlobPtrs[];
1917 /// GlobPtrs[0] = (void*)&buffer.D0[Idx];
1918 /// ...
1919 /// GlobPtrs[N] = (void*)&buffer.DN[Idx];
1920 /// reduce_function(reduce_data, GlobPtrs);
1921 ///
1922 /// \param ReductionInfos Array type containing the ReductionOps.
1923 /// \param ReduceFn The reduction function.
1924 /// \param ReductionsBufferTy The StructTy for the reductions buffer.
1925 /// \param FuncAttrs Optional param to specify any function attributes that
1926 /// need to be copied to the new function.
1927 ///
1928 /// \return The GlobalToListReduce function.
1930 emitGlobalToListReduceFunction(ArrayRef<ReductionInfo> ReductionInfos,
1931 Function *ReduceFn, Type *ReductionsBufferTy,
1932 AttributeList FuncAttrs,
1933 ArrayRef<bool> IsByRef);
1934
1935 /// Get the function name of a reduction function.
1936 std::string getReductionFuncName(StringRef Name) const;
1937
1938 /// Emits reduction function.
1939 /// \param ReducerName Name of the function calling the reduction.
1940 /// \param ReductionInfos Array type containing the ReductionOps.
1941 /// \param ReductionGenCBKind Optional param to specify Clang or MLIR
1942 /// CodeGenCB kind.
1943 /// \param FuncAttrs Optional param to specify any function attributes that
1944 /// need to be copied to the new function.
1945 ///
1946 /// \return The reduction function.
1947 Expected<Function *> createReductionFunction(
1948 StringRef ReducerName, ArrayRef<ReductionInfo> ReductionInfos,
1949 ArrayRef<bool> IsByRef,
1951 AttributeList FuncAttrs = {});
1952
1953public:
1954 ///
1955 /// Design of OpenMP reductions on the GPU
1956 ///
1957 /// Consider a typical OpenMP program with one or more reduction
1958 /// clauses:
1959 ///
1960 /// float foo;
1961 /// double bar;
1962 /// #pragma omp target teams distribute parallel for \
1963 /// reduction(+:foo) reduction(*:bar)
1964 /// for (int i = 0; i < N; i++) {
1965 /// foo += A[i]; bar *= B[i];
1966 /// }
1967 ///
1968 /// where 'foo' and 'bar' are reduced across all OpenMP threads in
1969 /// all teams. In our OpenMP implementation on the NVPTX device an
1970 /// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1971 /// within a team are mapped to CUDA threads within a threadblock.
1972 /// Our goal is to efficiently aggregate values across all OpenMP
1973 /// threads such that:
1974 ///
1975 /// - the compiler and runtime are logically concise, and
1976 /// - the reduction is performed efficiently in a hierarchical
1977 /// manner as follows: within OpenMP threads in the same warp,
1978 /// across warps in a threadblock, and finally across teams on
1979 /// the NVPTX device.
1980 ///
1981 /// Introduction to Decoupling
1982 ///
1983 /// We would like to decouple the compiler and the runtime so that the
1984 /// latter is ignorant of the reduction variables (number, data types)
1985 /// and the reduction operators. This allows a simpler interface
1986 /// and implementation while still attaining good performance.
1987 ///
1988 /// Pseudocode for the aforementioned OpenMP program generated by the
1989 /// compiler is as follows:
1990 ///
1991 /// 1. Create private copies of reduction variables on each OpenMP
1992 /// thread: 'foo_private', 'bar_private'
1993 /// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1994 /// to it and writes the result in 'foo_private' and 'bar_private'
1995 /// respectively.
1996 /// 3. Call the OpenMP runtime on the GPU to reduce within a team
1997 /// and store the result on the team master:
1998 ///
1999 /// __kmpc_nvptx_parallel_reduce_nowait_v2(...,
2000 /// reduceData, shuffleReduceFn, interWarpCpyFn)
2001 ///
2002 /// where:
2003 /// struct ReduceData {
2004 /// float *foo;
2005 /// double *bar;
2006 /// } reduceData
2007 /// reduceData.foo = &foo_private
2008 /// reduceData.bar = &bar_private
2009 ///
2010 /// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2011 /// auxiliary functions generated by the compiler that operate on
2012 /// variables of type 'ReduceData'. They aid the runtime in performing
2013 /// algorithmic steps in a data agnostic manner.
2014 ///
2015 /// 'shuffleReduceFn' is a pointer to a function that reduces data
2016 /// of type 'ReduceData' across two OpenMP threads (lanes) in the
2017 /// same warp. It takes the following arguments as input:
2018 ///
2019 /// a. variable of type 'ReduceData' on the calling lane,
2020 /// b. its lane_id,
2021 /// c. an offset relative to the current lane_id to generate a
2022 /// remote_lane_id. The remote lane contains the second
2023 /// variable of type 'ReduceData' that is to be reduced.
2024 /// d. an algorithm version parameter determining which reduction
2025 /// algorithm to use.
2026 ///
2027 /// 'shuffleReduceFn' retrieves data from the remote lane using
2028 /// efficient GPU shuffle intrinsics and reduces, using the
2029 /// algorithm specified by the 4th parameter, the two operands
2030 /// element-wise. The result is written to the first operand.
2031 ///
2032 /// Different reduction algorithms are implemented in different
2033 /// runtime functions, all calling 'shuffleReduceFn' to perform
2034 /// the essential reduction step. Therefore, based on the 4th
2035 /// parameter, this function behaves slightly differently to
2036 /// cooperate with the runtime to ensure correctness under
2037 /// different circumstances.
2038 ///
2039 /// 'InterWarpCpyFn' is a pointer to a function that transfers
2040 /// reduced variables across warps. It tunnels, through CUDA
2041 /// shared memory, the thread-private data of type 'ReduceData'
2042 /// from lane 0 of each warp to a lane in the first warp.
2043 /// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2044 /// The last team writes the global reduced value to memory.
2045 ///
2046 /// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2047 /// reduceData, shuffleReduceFn, interWarpCpyFn,
2048 /// scratchpadCopyFn, loadAndReduceFn)
2049 ///
2050 /// 'scratchpadCopyFn' is a helper that stores reduced
2051 /// data from the team master to a scratchpad array in
2052 /// global memory.
2053 ///
2054 /// 'loadAndReduceFn' is a helper that loads data from
2055 /// the scratchpad array and reduces it with the input
2056 /// operand.
2057 ///
2058 /// These compiler generated functions hide address
2059 /// calculation and alignment information from the runtime.
2060 /// 5. if ret == 1:
2061 /// The team master of the last team stores the reduced
2062 /// result to the globals in memory.
2063 /// foo += reduceData.foo; bar *= reduceData.bar
2064 ///
2065 ///
2066 /// Warp Reduction Algorithms
2067 ///
2068 /// On the warp level, we have three algorithms implemented in the
2069 /// OpenMP runtime depending on the number of active lanes:
2070 ///
2071 /// Full Warp Reduction
2072 ///
2073 /// The reduce algorithm within a warp where all lanes are active
2074 /// is implemented in the runtime as follows:
2075 ///
2076 /// full_warp_reduce(void *reduce_data,
2077 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2078 /// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2079 /// ShuffleReduceFn(reduce_data, 0, offset, 0);
2080 /// }
2081 ///
2082 /// The algorithm completes in log(2, WARPSIZE) steps.
2083 ///
2084 /// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2085 /// not used; we therefore save instructions by not retrieving lane_id
2086 /// from the corresponding special registers. The 4th parameter, which
2087 /// represents the version of the algorithm being used, is set to 0 to
2088 /// signify full warp reduction.
2089 ///
2090 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2091 ///
2092 /// #reduce_elem refers to an element in the local lane's data structure
2093 /// #remote_elem is retrieved from a remote lane
2094 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2095 /// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2096 ///
2097 /// Contiguous Partial Warp Reduction
2098 ///
2099 /// This reduce algorithm is used within a warp where only the first
2100 /// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2101 /// number of OpenMP threads in a parallel region is not a multiple of
2102 /// WARPSIZE. The algorithm is implemented in the runtime as follows:
2103 ///
2104 /// void
2105 /// contiguous_partial_reduce(void *reduce_data,
2106 /// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2107 /// int size, int lane_id) {
2108 /// int curr_size;
2109 /// int offset;
2110 /// curr_size = size;
2111 /// offset = curr_size/2;
2112 /// while (offset>0) {
2113 /// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2114 /// curr_size = (curr_size+1)/2;
2115 /// offset = curr_size/2;
2116 /// }
2117 /// }
2118 ///
2119 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2120 ///
2121 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2122 /// if (lane_id < offset)
2123 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2124 /// else
2125 /// reduce_elem = remote_elem
2126 ///
2127 /// This algorithm assumes that the data to be reduced are located in a
2128 /// contiguous subset of lanes starting from the first. When there is
2129 /// an odd number of active lanes, the data in the last lane is not
2130 /// aggregated with any other lane's data but is instead copied over.
2131 ///
2132 /// Dispersed Partial Warp Reduction
2133 ///
2134 /// This algorithm is used within a warp when any discontiguous subset of
2135 /// lanes are active. It is used to implement the reduction operation
2136 /// across lanes in an OpenMP simd region or in a nested parallel region.
2137 ///
2138 /// void
2139 /// dispersed_partial_reduce(void *reduce_data,
2140 /// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2141 /// int size, remote_id;
2142 /// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2143 /// do {
2144 /// remote_id = next_active_lane_id_right_after_me();
2145 /// # the above function returns 0 if no active lane
2146 /// # is present right after the current lane.
2147 /// size = number_of_active_lanes_in_this_warp();
2148 /// logical_lane_id /= 2;
2149 /// ShuffleReduceFn(reduce_data, logical_lane_id,
2150 /// remote_id-1-threadIdx.x, 2);
2151 /// } while (logical_lane_id % 2 == 0 && size > 1);
2152 /// }
2153 ///
2154 /// There is no assumption made about the initial state of the reduction.
2155 /// Any number of lanes (>=1) could be active at any position. The reduction
2156 /// result is returned in the first active lane.
2157 ///
2158 /// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2159 ///
2160 /// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2161 /// if (lane_id % 2 == 0 && offset > 0)
2162 /// reduce_elem = reduce_elem REDUCE_OP remote_elem
2163 /// else
2164 /// reduce_elem = remote_elem
2165 ///
2166 ///
2167 /// Intra-Team Reduction
2168 ///
2169 /// This function, as implemented in the runtime call
2170 /// '__kmpc_nvptx_parallel_reduce_nowait_v2', aggregates data across OpenMP
2171 /// threads in a team. It first reduces within a warp using the
2172 /// aforementioned algorithms. We then proceed to gather all such
2173 /// reduced values at the first warp.
2174 ///
2175 /// The runtime makes use of the function 'InterWarpCpyFn', which copies
2176 /// data from each of the "warp masters" (zeroth lane of each warp, where
2177 /// warp-reduced data is held) to the zeroth warp. This step reduces (in
2178 /// a mathematical sense) the problem of reduction across warp masters in
2179 /// a block to the problem of warp reduction.
2180 ///
2181 ///
2182 /// Inter-Team Reduction
2183 ///
2184 /// Once a team has reduced its data to a single value, it is stored in
2185 /// a global scratchpad array. Since each team has a distinct slot, this
2186 /// can be done without locking.
2187 ///
2188 /// The last team to write to the scratchpad array proceeds to reduce the
2189 /// scratchpad array. One or more workers in the last team use the helper
2190 /// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2191 /// the k'th worker reduces every k'th element.
2192 ///
2193 /// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait_v2' to
2194 /// reduce across workers and compute a globally reduced value.
2195 ///
2196 /// \param Loc The location where the reduction was
2197 /// encountered. Must be within the associated
2198 /// directive and after the last local access to the
2199 /// reduction variables.
2200 /// \param AllocaIP An insertion point suitable for allocas usable
2201 /// in reductions.
2202 /// \param CodeGenIP An insertion point suitable for code
2203 /// generation.
2204 /// \param ReductionInfos A list of info on each reduction
2205 /// variable.
2206 /// \param IsNoWait Optional flag set if the reduction is
2207 /// marked as nowait.
2208 /// \param IsByRef For each reduction clause, whether the reduction is by-ref.
2209 /// \param IsTeamsReduction Optional flag set if it is a teams
2210 /// reduction.
2211 /// \param GridValue Optional GPU grid value.
2212 /// \param ReductionBufNum Optional OpenMPCUDAReductionBufNumValue to be
2213 /// used for teams reduction.
2214 /// \param SrcLocInfo Source location information global.
2216 const LocationDescription &Loc, InsertPointTy AllocaIP,
2217 InsertPointTy CodeGenIP, ArrayRef<ReductionInfo> ReductionInfos,
2218 ArrayRef<bool> IsByRef, bool IsNoWait = false,
2219 bool IsTeamsReduction = false,
2221 std::optional<omp::GV> GridValue = {}, unsigned ReductionBufNum = 1024,
2222 Value *SrcLocInfo = nullptr);
2223
2224 // TODO: provide atomic and non-atomic reduction generators for reduction
2225 // operators defined by the OpenMP specification.
2226
2227 /// Generator for '#omp reduction'.
2228 ///
2229 /// Emits the IR instructing the runtime to perform the specific kind of
2230 /// reductions. Expects reduction variables to have been privatized and
2231 /// initialized to reduction-neutral values separately. Emits the calls to
2232 /// runtime functions as well as the reduction function and the basic blocks
2233 /// performing the reduction atomically and non-atomically.
2234 ///
2235 /// The code emitted for the following:
2236 ///
2237 /// \code
2238 /// type var_1;
2239 /// type var_2;
2240 /// #pragma omp <directive> reduction(reduction-op:var_1,var_2)
2241 /// /* body */;
2242 /// \endcode
2243 ///
2244 /// corresponds to the following sketch.
2245 ///
2246 /// \code
2247 /// void _outlined_par() {
2248 /// // N is the number of different reductions.
2249 /// void *red_array[] = {privatized_var_1, privatized_var_2, ...};
2250 /// switch(__kmpc_reduce(..., N, /*size of data in red array*/, red_array,
2251 /// _omp_reduction_func,
2252 /// _gomp_critical_user.reduction.var)) {
2253 /// case 1: {
2254 /// var_1 = var_1 <reduction-op> privatized_var_1;
2255 /// var_2 = var_2 <reduction-op> privatized_var_2;
2256 /// // ...
2257 /// __kmpc_end_reduce(...);
2258 /// break;
2259 /// }
2260 /// case 2: {
2261 /// _Atomic<ReductionOp>(var_1, privatized_var_1);
2262 /// _Atomic<ReductionOp>(var_2, privatized_var_2);
2263 /// // ...
2264 /// break;
2265 /// }
2266 /// default: break;
2267 /// }
2268 /// }
2269 ///
2270 /// void _omp_reduction_func(void **lhs, void **rhs) {
2271 /// *(type *)lhs[0] = *(type *)lhs[0] <reduction-op> *(type *)rhs[0];
2272 /// *(type *)lhs[1] = *(type *)lhs[1] <reduction-op> *(type *)rhs[1];
2273 /// // ...
2274 /// }
2275 /// \endcode
2276 ///
2277 /// \param Loc The location where the reduction was
2278 /// encountered. Must be within the associated
2279 /// directive and after the last local access to the
2280 /// reduction variables.
2281 /// \param AllocaIP An insertion point suitable for allocas usable
2282 /// in reductions.
2283 /// \param ReductionInfos A list of info on each reduction variable.
2284 /// \param IsNoWait A flag set if the reduction is marked as nowait.
2285 /// \param IsByRef For each reduction clause, whether it is performed
2286 /// by reference (true) or by value (false).
2287 /// \param IsTeamsReduction Optional flag set if it is a teams
2288 /// reduction.
2290 const LocationDescription &Loc, InsertPointTy AllocaIP,
2291 ArrayRef<ReductionInfo> ReductionInfos, ArrayRef<bool> IsByRef,
2292 bool IsNoWait = false, bool IsTeamsReduction = false);
2293
2294 ///}
2295
2296 /// Return the insertion point used by the underlying IRBuilder.
2298
2299 /// Update the internal location to \p Loc.
2301 Builder.restoreIP(Loc.IP);
2302 Builder.SetCurrentDebugLocation(Loc.DL);
2303 return Loc.IP.getBlock() != nullptr;
2304 }
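 // Typical call-site pattern for the generators in this builder (an
 // illustrative sketch, not part of the original header): bail out early
 // when the caller passes an unset location, e.g.
 //
 //   if (!updateToLocation(Loc))
 //     return Loc.IP;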
2305
2306 /// Return the function declaration for the runtime function with \p FnID.
2309
2311
2313 ArrayRef<Value *> Args,
2314 StringRef Name = "");
2315
2316 /// Return the (LLVM-IR) string describing the source location \p LocStr.
2318 uint32_t &SrcLocStrSize);
2319
2320 /// Return the (LLVM-IR) string describing the default source location.
2322
2323 /// Return the (LLVM-IR) string describing the source location identified by
2324 /// the arguments.
2326 StringRef FileName, unsigned Line,
2327 unsigned Column,
2328 uint32_t &SrcLocStrSize);
2329
2330 /// Return the (LLVM-IR) string describing the DebugLoc \p DL. Use \p F as
2331 /// fallback if \p DL does not specify the function name.
2333 Function *F = nullptr);
2334
2335 /// Return the (LLVM-IR) string describing the source location \p Loc.
2336 LLVM_ABI Constant *getOrCreateSrcLocStr(const LocationDescription &Loc,
2337 uint32_t &SrcLocStrSize);
2338
2339 /// Return an ident_t* encoding the source location \p SrcLocStr and \p Flags.
2340 /// TODO: Create an enum class for the Reserve2Flags
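 ///
 /// Example usage (an illustrative sketch, not part of the original header):
 /// the common pattern is to build the source-location string first and then
 /// wrap it in an ident_t*, assuming \c OMPBuilder is an initialized
 /// OpenMPIRBuilder and \c Loc is the current LocationDescription.
 /// \code
 ///   uint32_t SrcLocStrSize;
 ///   Constant *SrcLocStr = OMPBuilder.getOrCreateSrcLocStr(Loc, SrcLocStrSize);
 ///   Value *Ident = OMPBuilder.getOrCreateIdent(SrcLocStr, SrcLocStrSize);
 /// \endcode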
2342 uint32_t SrcLocStrSize,
2343 omp::IdentFlag Flags = omp::IdentFlag(0),
2344 unsigned Reserve2Flags = 0);
2345
2346 /// Create a hidden global flag \p Name in the module with initial value \p
2347 /// Value.
2349
2350 /// Emit the llvm.used metadata.
2352
2353 /// Emit the kernel execution mode.
2356
2357 /// Generate control flow and cleanup for cancellation.
2358 ///
2359 /// \param CancelFlag Flag indicating if the cancellation is performed.
2360 /// \param CanceledDirective The kind of directive that is cancelled.
2361 /// \param ExitCB Extra code to be generated in the exit block.
2362 ///
2363 /// \return an error, if any were triggered during execution.
2365 omp::Directive CanceledDirective);
2366
2367 /// Generate a target region entry call.
2368 ///
2369 /// \param Loc The location at which the request originated and is fulfilled.
2370 /// \param AllocaIP The insertion point to be used for alloca instructions.
2371 /// \param Return Return value of the created function returned by reference.
2372 /// \param DeviceID Identifier for the device via the 'device' clause.
2373 /// \param NumTeams Number of teams for the region via the 'num_teams' clause
2374 /// or 0 if unspecified and -1 if there is no 'teams' clause.
2375 /// \param NumThreads Number of threads via the 'thread_limit' clause.
2376 /// \param HostPtr Pointer to the host-side pointer of the target kernel.
2377 /// \param KernelArgs Array of arguments to the kernel.
2378 LLVM_ABI InsertPointTy emitTargetKernel(const LocationDescription &Loc,
2379 InsertPointTy AllocaIP,
2380 Value *&Return, Value *Ident,
2381 Value *DeviceID, Value *NumTeams,
2382 Value *NumThreads, Value *HostPtr,
2383 ArrayRef<Value *> KernelArgs);
2384
2385 /// Generate a flush runtime call.
2386 ///
2387 /// \param Loc The location at which the request originated and is fulfilled.
2388 LLVM_ABI void emitFlush(const LocationDescription &Loc);
2389
2390 /// The finalization stack made up of finalize callbacks currently in-flight,
2391 /// wrapped into FinalizationInfo objects that also reference the finalization
2392 /// target block and the kind of cancellable directive.
2394
2395 /// Return true if the last entry in the finalization stack is of kind \p DK
2396 /// and cancellable.
2397 bool isLastFinalizationInfoCancellable(omp::Directive DK) {
2398 return !FinalizationStack.empty() &&
2399 FinalizationStack.back().IsCancellable &&
2400 FinalizationStack.back().DK == DK;
2401 }
2402
2403 /// Generate a taskwait runtime call.
2404 ///
2405 /// \param Loc The location at which the request originated and is fulfilled.
2406 LLVM_ABI void emitTaskwaitImpl(const LocationDescription &Loc);
2407
2408 /// Generate a taskyield runtime call.
2409 ///
2410 /// \param Loc The location at which the request originated and is fulfilled.
2411 LLVM_ABI void emitTaskyieldImpl(const LocationDescription &Loc);
2412
2413 /// Return the current thread ID.
2414 ///
2415 /// \param Ident The ident (ident_t*) describing the query origin.
2417
2418 /// The OpenMPIRBuilder Configuration
2420
2421 /// The underlying LLVM-IR module
2423
2424 /// The LLVM-IR Builder used to create IR.
2426
2427 /// Map to remember source location strings
2429
2430 /// Map to remember existing ident_t*.
2432
2433 /// Info manager to keep track of target regions.
2435
2436 /// The target triple of the underlying module.
2437 const Triple T;
2438
2439 /// Helper that contains information about regions we need to outline
2440 /// during finalization.
2442 using PostOutlineCBTy = std::function<void(Function &)>;
2447 // TODO: this should be safe to enable by default
2449
2450 /// Collect all blocks in between EntryBB and ExitBB in both the given
2451 /// vector and set.
2453 SmallVectorImpl<BasicBlock *> &BlockVector);
2454
2455 /// Return the function that contains the region to be outlined.
2456 Function *getFunction() const { return EntryBB->getParent(); }
2457 };
2458
2459 /// Collection of regions that need to be outlined during finalization.
2461
2462 /// A collection of candidate target functions whose constant allocas will
2463 /// be raised, if possible, on a call to finalize after all currently
2464 /// enqueued outline infos have been processed.
2466
2467 /// Collection of owned canonical loop objects that eventually need to be
2468 /// freed.
2469 std::forward_list<CanonicalLoopInfo> LoopInfos;
2470
2472 /// Collection of owned ScanInfo objects that eventually need to be freed.
2472 std::forward_list<ScanInfo> ScanInfos;
2473
2474 /// Add a new region that will be outlined later.
2475 void addOutlineInfo(OutlineInfo &&OI) { OutlineInfos.emplace_back(OI); }
2476
2477 /// An ordered map of auto-generated variables to their unique names.
2478 /// It stores variables with the following names: 1) ".gomp_critical_user_" +
2479 /// <critical_section_name> + ".var" for "omp critical" directives; 2)
2480 /// <mangled_name_for_global_var> + ".cache." for cache for threadprivate
2481 /// variables.
2483
2484 /// Computes the size of type in bytes.
2486
2487 // Emit a branch from the current block to the Target block only if
2488 // the current block does not already have a terminator.
2490
2491 // If BB has no use then delete it and return. Else place BB after the current
2492 // block, if possible, or else at the end of the function. Also add a branch
2493 // from current block to BB if current block does not have a terminator.
2494 LLVM_ABI void emitBlock(BasicBlock *BB, Function *CurFn,
2495 bool IsFinished = false);
2496
2497 /// Emits code for OpenMP 'if' clause using specified \a BodyGenCallbackTy
2498 /// Here is the logic:
2499 /// if (Cond) {
2500 /// ThenGen();
2501 /// } else {
2502 /// ElseGen();
2503 /// }
2504 ///
2505 /// \return an error, if any were triggered during execution.
2507 BodyGenCallbackTy ElseGen,
2508 InsertPointTy AllocaIP = {});
2509
2510 /// Create the global variable holding the offload mappings information.
2513 std::string VarName);
2514
2515 /// Create the global variable holding the offload names information.
2518 std::string VarName);
2519
2522 AllocaInst *Args = nullptr;
2524 };
2525
2526 /// Create the alloca instructions used in calls to mapper functions.
2528 InsertPointTy AllocaIP,
2529 unsigned NumOperands,
2531
2532 /// Create the call for the target mapper function.
2533 /// \param Loc The source location description.
2534 /// \param MapperFunc Function to be called.
2535 /// \param SrcLocInfo Source location information global.
2536 /// \param MaptypesArg The argument types.
2537 /// \param MapnamesArg The argument names.
2538 /// \param MapperAllocas The AllocaInst used for the call.
2539 /// \param DeviceID Device ID for the call.
2540 /// \param NumOperands Number of operands in the call.
2542 Function *MapperFunc, Value *SrcLocInfo,
2543 Value *MaptypesArg, Value *MapnamesArg,
2545 int64_t DeviceID, unsigned NumOperands);
2546
2547 /// Container for the arguments used to pass data to the runtime library.
2549 /// The array of base pointer passed to the runtime library.
2551 /// The array of section pointers passed to the runtime library.
2553 /// The array of sizes passed to the runtime library.
2554 Value *SizesArray = nullptr;
2555 /// The array of map types passed to the runtime library for the beginning
2556 /// of the region or for the entire region if there are no separate map
2557 /// types for the region end.
2559 /// The array of map types passed to the runtime library for the end of the
2560 /// region, or nullptr if there are no separate map types for the region
2561 /// end.
2563 /// The array of user-defined mappers passed to the runtime library.
2565 /// The array of original declaration names of mapped pointers sent to the
2566 /// runtime library for debugging.
2568
2569 explicit TargetDataRTArgs() = default;
2578 };
2579
2580 /// Container to pass the default attributes with which a kernel must be
2581 /// launched, used to set kernel attributes and populate associated static
2582 /// structures.
2583 ///
2584 /// For max values, < 0 means unset, == 0 means set but unknown at compile
2585 /// time. The number of max values will be 1 except for the case where
2586 /// ompx_bare is set.
2597
2598 /// Container to pass LLVM IR runtime values or constants related to the
2599 /// number of teams and threads with which the kernel must be launched, as
2600 /// well as the trip count of the loop, if it is an SPMD or Generic-SPMD
2601 /// kernel. These must be defined in the host prior to the call to the kernel
2602 /// launch OpenMP RTL function.
2605 Value *MinTeams = nullptr;
2608
2609 /// 'parallel' construct 'num_threads' clause value, if present and it is an
2610 /// SPMD kernel.
2611 Value *MaxThreads = nullptr;
2612
2613 /// Total number of iterations of the SPMD or Generic-SPMD kernel or null if
2614 /// it is a generic kernel.
2616
2617 /// Device ID value used in the kernel launch.
2618 Value *DeviceID = nullptr;
2619 };
2620
2621 /// Data structure that contains the needed information to construct the
2622 /// kernel args vector.
2624 /// Number of arguments passed to the runtime library.
2625 unsigned NumTargetItems = 0;
2626 /// Arguments passed to the runtime library
2628 /// The number of iterations
2630 /// The number of teams.
2632 /// The number of threads.
2634 /// The size of the dynamic shared memory.
2636 /// True if the kernel has 'no wait' clause.
2637 bool HasNoWait = false;
2638 /// The fallback mechanism for the shared memory.
2641
2642 // Constructors for TargetKernelArgs.
2643 TargetKernelArgs() = default;
2653 };
2654
2655 /// Create the kernel args vector used by emitTargetKernel. This function
2656 /// creates various constant values that are used in the resulting args
2657 /// vector.
2658 LLVM_ABI static void getKernelArgsVector(TargetKernelArgs &KernelArgs,
2659 IRBuilderBase &Builder,
2660 SmallVector<Value *> &ArgsVector);
2661
2662 /// Struct that keeps the information that should be kept throughout
2663 /// a 'target data' region.
2665 /// Set to true if device pointer information has to be obtained.
2666 bool RequiresDevicePointerInfo = false;
2667 /// Set to true if Clang emits separate runtime calls for the beginning and
2668 /// end of the region. These calls might have separate map type arrays.
2669 bool SeparateBeginEndCalls = false;
2670
2671 public:
2673
2676
2677 /// Indicate whether any user-defined mapper exists.
2678 bool HasMapper = false;
2679 /// The total number of pointers passed to the runtime library.
2680 unsigned NumberOfPtrs = 0u;
2681
2682 bool EmitDebug = false;
2683
2684 /// Whether the `target ... data` directive has a `nowait` clause.
2685 bool HasNoWait = false;
2686
2687 explicit TargetDataInfo() = default;
2688 explicit TargetDataInfo(bool RequiresDevicePointerInfo,
2689 bool SeparateBeginEndCalls)
2690 : RequiresDevicePointerInfo(RequiresDevicePointerInfo),
2691 SeparateBeginEndCalls(SeparateBeginEndCalls) {}
2692 /// Clear information about the data arrays.
2695 HasMapper = false;
2696 NumberOfPtrs = 0u;
2697 }
2698 /// Return true if the current target data information has valid arrays.
2699 bool isValid() {
2700 return RTArgs.BasePointersArray && RTArgs.PointersArray &&
2701 RTArgs.SizesArray && RTArgs.MapTypesArray &&
2702 (!HasMapper || RTArgs.MappersArray) && NumberOfPtrs;
2703 }
2704 bool requiresDevicePointerInfo() { return RequiresDevicePointerInfo; }
2705 bool separateBeginEndCalls() { return SeparateBeginEndCalls; }
2706 };
2707
2715
2716 /// This structure contains combined information generated for mappable
2717 /// clauses, including base pointers, pointers, sizes, map types, user-defined
2718 /// mappers, and non-contiguous information.
2719 struct MapInfosTy {
2734
2735 /// Append arrays in \a CurInfo.
2736 void append(MapInfosTy &CurInfo) {
2737 BasePointers.append(CurInfo.BasePointers.begin(),
2738 CurInfo.BasePointers.end());
2739 Pointers.append(CurInfo.Pointers.begin(), CurInfo.Pointers.end());
2740 DevicePointers.append(CurInfo.DevicePointers.begin(),
2741 CurInfo.DevicePointers.end());
2742 Sizes.append(CurInfo.Sizes.begin(), CurInfo.Sizes.end());
2743 Types.append(CurInfo.Types.begin(), CurInfo.Types.end());
2744 Names.append(CurInfo.Names.begin(), CurInfo.Names.end());
2745 NonContigInfo.Dims.append(CurInfo.NonContigInfo.Dims.begin(),
2746 CurInfo.NonContigInfo.Dims.end());
2747 NonContigInfo.Offsets.append(CurInfo.NonContigInfo.Offsets.begin(),
2748 CurInfo.NonContigInfo.Offsets.end());
2749 NonContigInfo.Counts.append(CurInfo.NonContigInfo.Counts.begin(),
2750 CurInfo.NonContigInfo.Counts.end());
2751 NonContigInfo.Strides.append(CurInfo.NonContigInfo.Strides.begin(),
2752 CurInfo.NonContigInfo.Strides.end());
2753 }
2754 };
2756
2757 /// Callback function type for functions emitting the host fallback code that
2758 /// is executed when the kernel launch fails. It takes an insertion point as
2759 /// parameter where the code should be emitted. It returns an insertion point
2760 /// that points right after the emitted code.
2763
2764 // Callback function type for emitting and fetching user defined custom
2765 // mappers.
2767 function_ref<Expected<Function *>(unsigned int)>;
2768
2769 /// Generate a target region entry call and host fallback call.
2770 ///
2771 /// \param Loc The location at which the request originated and is fulfilled.
2772 /// \param OutlinedFnID The outlined function ID.
2773 /// \param EmitTargetCallFallbackCB Call back function to generate host
2774 /// fallback code.
2775 /// \param Args Data structure holding information about the kernel arguments.
2776 /// \param DeviceID Identifier for the device via the 'device' clause.
2777 /// \param RTLoc Source location identifier
2778 /// \param AllocaIP The insertion point to be used for alloca instructions.
2780 const LocationDescription &Loc, Value *OutlinedFnID,
2781 EmitFallbackCallbackTy EmitTargetCallFallbackCB, TargetKernelArgs &Args,
2782 Value *DeviceID, Value *RTLoc, InsertPointTy AllocaIP);
2783
2784 /// Callback type for generating the bodies of device directives that require
2785 /// outer target tasks (e.g. in case of having `nowait` or `depend` clauses).
2786 ///
2787 /// \param DeviceID The ID of the device on which the target region will
2788 /// execute.
2789 /// \param RTLoc Source location identifier
2790 /// \param TargetTaskAllocaIP Insertion point for the alloca block of the
2791 /// generated task.
2792 ///
2793 /// \return an error, if any were triggered during execution.
2795 function_ref<Error(Value *DeviceID, Value *RTLoc,
2796 IRBuilderBase::InsertPoint TargetTaskAllocaIP)>;
2797
2798 /// Generate a target-task for the target construct
2799 ///
2800 /// \param TaskBodyCB Callback to generate the actual body of the target task.
2801 /// \param DeviceID Identifier for the device via the 'device' clause.
2802 /// \param RTLoc Source location identifier
2803 /// \param AllocaIP The insertion point to be used for alloca instructions.
2804 /// \param Dependencies Vector of DependData objects holding information of
2805 /// dependencies as specified by the 'depend' clause.
2806 /// \param HasNoWait True if the target construct had 'nowait' on it, false
2807 /// otherwise.
2809 TargetTaskBodyCallbackTy TaskBodyCB, Value *DeviceID, Value *RTLoc,
2812 const TargetDataRTArgs &RTArgs, bool HasNoWait);
2813
2814 /// Emit the arguments to be passed to the runtime library based on the
2815 /// arrays of base pointers, pointers, sizes, map types, and mappers. If
2816 /// ForEndCall, emit map types to be passed for the end of the region instead
2817 /// of the beginning.
2820 OpenMPIRBuilder::TargetDataInfo &Info, bool ForEndCall = false);
2821
2822 /// Emit an array of struct descriptors to be assigned to the offload args.
2824 InsertPointTy CodeGenIP,
2825 MapInfosTy &CombinedInfo,
2827
2828 /// Emit the arrays used to pass the captures and map information to the
2829 /// offloading runtime library. If there is no map or capture information,
2830 /// return nullptr by reference. Accepts a reference to a MapInfosTy object
2831 /// that contains information generated for mappable clauses,
2832 /// including base pointers, pointers, sizes, map types, user-defined mappers.
2834 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo,
2836 bool IsNonContiguous = false,
2837 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2838
2839 /// Allocates memory for and populates the arrays required for offloading
2840 /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it
2841 /// emits their base addresses as arguments to be passed to the runtime
2842 /// library. In essence, this function is a combination of
2843 /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably
2844 /// be preferred by clients of OpenMPIRBuilder.
2846 InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info,
2847 TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo,
2848 CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false,
2849 bool ForEndCall = false,
2850 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr);
2851
2852 /// Creates offloading entry for the provided entry ID \a ID, address \a
2853 /// Addr, size \a Size, and flags \a Flags.
2855 int32_t Flags, GlobalValue::LinkageTypes,
2856 StringRef Name = "");
2857
2858 /// The kind of errors that can occur when emitting the offload entries and
2859 /// metadata.
2866
2867 /// Callback function type for reporting errors when emitting the offload
2867 /// entries and metadata.
2869 std::function<void(EmitMetadataErrorKind, TargetRegionEntryInfo)>;
2870
2871 // Emit the offloading entries and metadata so that the device codegen side
2872 // can easily figure out what to emit. The produced metadata looks like
2873 // this:
2874 //
2875 // !omp_offload.info = !{!1, ...}
2876 //
2877 // We only generate metadata for functions that contain target regions.
2879 EmitMetadataErrorReportFunctionTy &ErrorReportFunction);
2880
2881public:
2882 /// Generator for __kmpc_copyprivate
2883 ///
2884 /// \param Loc The source location description.
2885 /// \param BufSize Number of elements in the buffer.
2886 /// \param CpyBuf List of pointers to data to be copied.
2887 /// \param CpyFn function to call for copying data.
2888 /// \param DidIt flag variable; 1 for 'single' thread, 0 otherwise.
2889 ///
2890 /// \return The insertion position *after* the CopyPrivate call.
2891
2893 llvm::Value *BufSize,
2894 llvm::Value *CpyBuf,
2895 llvm::Value *CpyFn,
2896 llvm::Value *DidIt);
2897
2898 /// Generator for '#omp single'
2899 ///
2900 /// \param Loc The source location description.
2901 /// \param BodyGenCB Callback that will generate the region code.
2902 /// \param FiniCB Callback to finalize variable copies.
2903 /// \param IsNowait If false, a barrier is emitted.
2904 /// \param CPVars copyprivate variables.
2905 /// \param CPFuncs copy functions to use for each copyprivate variable.
2906 ///
2907 /// \returns The insertion position *after* the single call.
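/// For illustration, a minimal, hedged sketch of emitting a single region with
/// one copyprivate variable (assuming an initialized OpenMPIRBuilder
/// `OMPBuilder`, an IRBuilder `Builder`, body/finalization callbacks
/// `BodyGenCB`/`FiniCB` built as in the createMaster sketch further down, this
/// generator being invoked as `createSingle`, and `PrivPtr`/`CopyFn` being a
/// caller-provided variable and copy helper):
/// \code{.cpp}
///   OpenMPIRBuilder::LocationDescription Loc(Builder);
///   // PrivPtr and CopyFn are assumptions of this sketch, not part of the API.
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createSingle(
///       Loc, BodyGenCB, FiniCB, /*IsNowait=*/false,
///       /*CPVars=*/{PrivPtr}, /*CPFuncs=*/{CopyFn});
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP); // continue emitting after the single region
/// \endcode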
2910 FinalizeCallbackTy FiniCB, bool IsNowait,
2911 ArrayRef<llvm::Value *> CPVars = {},
2912 ArrayRef<llvm::Function *> CPFuncs = {});
2913
2914 /// Generator for '#omp master'
2915 ///
2916 /// \param Loc The insert and source location description.
2917 /// \param BodyGenCB Callback that will generate the region code.
2918 /// \param FiniCB Callback to finalize variable copies.
2919 ///
2920 /// \returns The insertion position *after* the master.
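/// For illustration, a minimal, hedged sketch of driving this generator
/// (assuming an initialized OpenMPIRBuilder `OMPBuilder`, an IRBuilder
/// `Builder`, and the Error-returning body/finalization callback shapes used
/// throughout this class):
/// \code{.cpp}
///   static Error emitMasterRegion(OpenMPIRBuilder &OMPBuilder,
///                                 IRBuilder<> &Builder) {
///     OpenMPIRBuilder::LocationDescription Loc(Builder);
///     auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
///                          OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error {
///       Builder.restoreIP(CodeGenIP);
///       // ... emit the instructions only the master thread executes ...
///       return Error::success();
///     };
///     auto FiniCB = [&](OpenMPIRBuilder::InsertPointTy) -> Error {
///       return Error::success(); // nothing to finalize in this sketch
///     };
///     OpenMPIRBuilder::InsertPointOrErrorTy AfterIP =
///         OMPBuilder.createMaster(Loc, BodyGenCB, FiniCB);
///     if (!AfterIP)
///       return AfterIP.takeError();
///     Builder.restoreIP(*AfterIP); // emit code following the region here
///     return Error::success();
///   }
/// \endcode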
2921 LLVM_ABI InsertPointOrErrorTy createMaster(const LocationDescription &Loc,
2922 BodyGenCallbackTy BodyGenCB,
2923 FinalizeCallbackTy FiniCB);
2924
2925 /// Generator for '#omp masked'
2926 ///
2927 /// \param Loc The insert and source location description.
2928 /// \param BodyGenCB Callback that will generate the region code.
2930 /// \param FiniCB Callback to finalize variable copies.
2930 ///
2931 /// \returns The insertion position *after* the masked.
2932 LLVM_ABI InsertPointOrErrorTy createMasked(const LocationDescription &Loc,
2933 BodyGenCallbackTy BodyGenCB,
2934 FinalizeCallbackTy FiniCB,
2935 Value *Filter);
2936
2937 /// This function performs the scan reduction of the values updated in
2938 /// the input phase. The reduction logic needs to be emitted between the
2939 /// input and scan loops returned by `createCanonicalScanLoops`. The
2940 /// following is the code that is generated; `buffer` and `span` are
2941 /// expected to be populated before executing the generated code.
2942 /// \code{c}
2943 /// for (int k = 0; k != ceil(log2(span)); ++k) {
2944 ///   i = pow(2, k);
2945 ///   for (size_t cnt = last_iter; cnt >= i; --cnt)
2946 /// buffer[cnt] op= buffer[cnt-i];
2947 /// }
2948 /// \endcode
2949 /// \param Loc The insert and source location description.
2950 /// \param ReductionInfos Array type containing the ReductionOps.
2951 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2952 /// `ScanInfoInitialize`.
2953 ///
2954 /// \returns The insertion position *after* the scan reduction.
2956 const LocationDescription &Loc,
2958 ScanInfo *ScanRedInfo);
2959
2960 /// This directive splits and directs the control flow to input-phase blocks
2961 /// or scan-phase blocks based on (1) whether the input loop or the scan loop
2962 /// is executed, and (2) whether an exclusive or inclusive scan is used.
2963 ///
2964 /// \param Loc The insert and source location description.
2965 /// \param AllocaIP The IP where the temporary buffer for scan reduction
2966 /// needs to be allocated.
2967 /// \param ScanVars Scan variables.
/// \param ScanVarsType Types of the scan variables.
2968 /// \param IsInclusive Whether it is an inclusive or exclusive scan.
2969 /// \param ScanRedInfo Pointer to the ScanInfo object created using
2970 /// `ScanInfoInitialize`.
2971 ///
2972 /// \returns The insertion position *after* the scan.
2973 LLVM_ABI InsertPointOrErrorTy createScan(const LocationDescription &Loc,
2974 InsertPointTy AllocaIP,
2975 ArrayRef<llvm::Value *> ScanVars,
2976 ArrayRef<llvm::Type *> ScanVarsType,
2977 bool IsInclusive,
2978 ScanInfo *ScanRedInfo);
2979
2980 /// Generator for '#omp critical'
2981 ///
2982 /// \param Loc The insert and source location description.
2983 /// \param BodyGenCB Callback that will generate the region body code.
2984 /// \param FiniCB Callback to finalize variable copies.
2985 /// \param CriticalName name of the lock used by the critical directive
2986 /// \param HintInst Hint Instruction for hint clause associated with critical
2987 ///
2988 /// \returns The insertion position *after* the critical.
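/// For illustration, a hedged sketch (assuming `OMPBuilder`, `Builder` and
/// `BodyGenCB`/`FiniCB` as in the createMaster sketch above; passing a null
/// hint corresponds to a critical construct without a 'hint' clause):
/// \code{.cpp}
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createCritical(
///       OpenMPIRBuilder::LocationDescription(Builder), BodyGenCB, FiniCB,
///       /*CriticalName=*/"my_critical", /*HintInst=*/nullptr);
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP);
/// \endcode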
2989 LLVM_ABI InsertPointOrErrorTy createCritical(const LocationDescription &Loc,
2990 BodyGenCallbackTy BodyGenCB,
2991 FinalizeCallbackTy FiniCB,
2992 StringRef CriticalName,
2993 Value *HintInst);
2994
2995 /// Generator for '#omp ordered depend (source | sink)'
2996 ///
2997 /// \param Loc The insert and source location description.
2998 /// \param AllocaIP The insertion point to be used for alloca instructions.
2999 /// \param NumLoops The number of loops in depend clause.
3000 /// \param StoreValues The values to be stored in the vector address.
3001 /// \param Name The name of the alloca instruction.
3002 /// \param IsDependSource If true, depend source; otherwise, depend sink.
3003 ///
3004 /// \return The insertion position *after* the ordered.
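/// For illustration, a hedged sketch emitting a doacross 'source' ordering for
/// a two-level loop nest (assuming `OMPBuilder`, `Builder`, an `AllocaIP`, and
/// `IV0`/`IV1` being caller-provided iteration values):
/// \code{.cpp}
///   // IV0 and IV1 are assumptions of this sketch.
///   SmallVector<Value *, 2> StoreValues = {IV0, IV1};
///   auto AfterIP = OMPBuilder.createOrderedDepend(
///       OpenMPIRBuilder::LocationDescription(Builder), AllocaIP,
///       /*NumLoops=*/2, StoreValues, "dep", /*IsDependSource=*/true);
///   // AfterIP is the position after the emitted ordered-depend code.
/// \endcode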
3006 createOrderedDepend(const LocationDescription &Loc, InsertPointTy AllocaIP,
3007 unsigned NumLoops, ArrayRef<llvm::Value *> StoreValues,
3008 const Twine &Name, bool IsDependSource);
3009
3010 /// Generator for '#omp ordered [threads | simd]'
3011 ///
3012 /// \param Loc The insert and source location description.
3013 /// \param BodyGenCB Callback that will generate the region code.
3014 /// \param FiniCB Callback to finalize variable copies.
3015 /// \param IsThreads If true, the construct has a 'threads' clause or no
3016 /// clause; otherwise, it has a 'simd' clause.
3017 ///
3018 /// \returns The insertion position *after* the ordered.
3020 const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
3021 FinalizeCallbackTy FiniCB, bool IsThreads);
3022
3023 /// Generator for '#omp sections'
3024 ///
3025 /// \param Loc The insert and source location description.
3026 /// \param AllocaIP The insertion points to be used for alloca instructions.
3027 /// \param SectionCBs Callbacks that will generate body of each section.
3028 /// \param PrivCB Callback to copy a given variable (think copy constructor).
3029 /// \param FiniCB Callback to finalize variable copies.
3030 /// \param IsCancellable Flag to indicate a cancellable parallel region.
3031 /// \param IsNowait If true, the barrier that ensures all sections are
3032 /// executed before moving forward is not generated.
3033 /// \returns The insertion position *after* the sections.
3035 createSections(const LocationDescription &Loc, InsertPointTy AllocaIP,
3038 bool IsCancellable, bool IsNowait);
3039
3040 /// Generator for '#omp section'
3041 ///
3042 /// \param Loc The insert and source location description.
3043 /// \param BodyGenCB Callback that will generate the region body code.
3044 /// \param FiniCB Callback to finalize variable copies.
3045 /// \returns The insertion position *after* the section.
3046 LLVM_ABI InsertPointOrErrorTy createSection(const LocationDescription &Loc,
3047 BodyGenCallbackTy BodyGenCB,
3048 FinalizeCallbackTy FiniCB);
3049
3050 /// Generator for `#omp teams`
3051 ///
3052 /// \param Loc The location where the teams construct was encountered.
3053 /// \param BodyGenCB Callback that will generate the region code.
3054 /// \param NumTeamsLower Lower bound on the number of teams. If this is
3055 /// nullptr, it is as if the lower bound is specified as equal to the upper
3056 /// bound. If this is non-null, then the upper bound must also be non-null.
3057 /// \param NumTeamsUpper Upper bound on the number of teams.
3058 /// \param ThreadLimit Limit on the number of threads that may participate
3059 /// in a contention group created by each team.
3060 /// \param IfExpr is the integer argument value of the if condition on the
3061 /// teams clause.
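/// For illustration, a hedged sketch requesting exactly eight teams (assuming
/// `OMPBuilder`, `Builder`, and a body callback `BodyGenCB` as in the
/// createMaster sketch above):
/// \code{.cpp}
///   Value *NumTeams = Builder.getInt32(8);
///   OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createTeams(
///       OpenMPIRBuilder::LocationDescription(Builder), BodyGenCB,
///       /*NumTeamsLower=*/NumTeams, /*NumTeamsUpper=*/NumTeams,
///       /*ThreadLimit=*/nullptr, /*IfExpr=*/nullptr);
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP);
/// \endcode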
3062 LLVM_ABI InsertPointOrErrorTy createTeams(const LocationDescription &Loc,
3063 BodyGenCallbackTy BodyGenCB,
3064 Value *NumTeamsLower = nullptr,
3065 Value *NumTeamsUpper = nullptr,
3066 Value *ThreadLimit = nullptr,
3067 Value *IfExpr = nullptr);
3068
3069 /// Generator for `#omp distribute`
3070 ///
3071 /// \param Loc The location where the distribute construct was encountered.
3072 /// \param AllocaIP The insertion points to be used for alloca instructions.
3073 /// \param BodyGenCB Callback that will generate the region code.
3074 LLVM_ABI InsertPointOrErrorTy createDistribute(const LocationDescription &Loc,
3075 InsertPointTy AllocaIP,
3076 BodyGenCallbackTy BodyGenCB);
3077
3078 /// Generate a conditional branch and relevant BasicBlocks through which
3079 /// private threads copy the 'copyin' variables from the master copy to the
3080 /// threadprivate copies.
3081 ///
3082 /// \param IP Insertion block for the copyin conditional
3083 /// \param MasterAddr A pointer to the master variable
3084 /// \param PrivateAddr A pointer to the threadprivate variable
3085 /// \param IntPtrTy Pointer size type
3086 /// \param BranchtoEnd Create a branch between the copyin.not.master blocks
3087 /// and the copy.in.end block
3088 ///
3089 /// \returns The insertion point where the copying operation is to be emitted.
3091 Value *MasterAddr,
3092 Value *PrivateAddr,
3093 llvm::IntegerType *IntPtrTy,
3094 bool BranchtoEnd = true);
3095
3096 /// Create a runtime call for kmpc_Alloc
3097 ///
3098 /// \param Loc The insert and source location description.
3099 /// \param Size Size of allocated memory space
3100 /// \param Allocator Allocator information instruction
3101 /// \param Name Name of call Instruction for OMP_alloc
3102 ///
3103 /// \returns CallInst to the OMP_Alloc call
3104 LLVM_ABI CallInst *createOMPAlloc(const LocationDescription &Loc, Value *Size,
3105 Value *Allocator, std::string Name = "");
3106
3107 /// Create a runtime call for kmpc_free
3108 ///
3109 /// \param Loc The insert and source location description.
3110 /// \param Addr Address of memory space to be freed
3111 /// \param Allocator Allocator information instruction
3112 /// \param Name Name of call Instruction for OMP_Free
3113 ///
3114 /// \returns CallInst to the OMP_Free call
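/// For illustration, a hedged sketch pairing createOMPAlloc with createOMPFree
/// (assuming `OMPBuilder`, `Builder`, a byte-count value `Size`, and
/// `Allocator` holding an allocator handle value computed by the caller):
/// \code{.cpp}
///   OpenMPIRBuilder::LocationDescription Loc(Builder);
///   // Size and Allocator are assumptions of this sketch.
///   CallInst *Ptr = OMPBuilder.createOMPAlloc(Loc, Size, Allocator, "omp_scratch");
///   // ... use the allocation through Ptr ...
///   OMPBuilder.createOMPFree(Loc, Ptr, Allocator, "omp_scratch_free");
/// \endcode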
3115 LLVM_ABI CallInst *createOMPFree(const LocationDescription &Loc, Value *Addr,
3116 Value *Allocator, std::string Name = "");
3117
3118 /// Create a runtime call for kmpc_threadprivate_cached
3119 ///
3120 /// \param Loc The insert and source location description.
3121 /// \param Pointer pointer to data to be cached
3122 /// \param Size size of data to be cached
3123 /// \param Name Name of call Instruction for callinst
3124 ///
3125 /// \returns CallInst to the thread private cache call.
3126 LLVM_ABI CallInst *
3127 createCachedThreadPrivate(const LocationDescription &Loc,
3129 const llvm::Twine &Name = Twine(""));
3130
3131 /// Create a runtime call for __tgt_interop_init
3132 ///
3133 /// \param Loc The insert and source location description.
3134 /// \param InteropVar variable to be allocated
3135 /// \param InteropType type of interop operation
3136 /// \param Device Device to which offloading will occur.
3137 /// \param NumDependences number of dependence variables
3138 /// \param DependenceAddress pointer to dependence variables
3139 /// \param HaveNowaitClause does nowait clause exist
3140 ///
3141 /// \returns CallInst to the __tgt_interop_init call
3142 LLVM_ABI CallInst *createOMPInteropInit(const LocationDescription &Loc,
3143 Value *InteropVar,
3144 omp::OMPInteropType InteropType,
3145 Value *Device, Value *NumDependences,
3146 Value *DependenceAddress,
3147 bool HaveNowaitClause);
3148
3149 /// Create a runtime call for __tgt_interop_destroy
3150 ///
3151 /// \param Loc The insert and source location description.
3152 /// \param InteropVar variable to be allocated
3153 /// \param Device Device to which offloading will occur.
3154 /// \param NumDependences number of dependence variables
3155 /// \param DependenceAddress pointer to dependence variables
3156 /// \param HaveNowaitClause does nowait clause exist
3157 ///
3158 /// \returns CallInst to the __tgt_interop_destroy call
3159 LLVM_ABI CallInst *createOMPInteropDestroy(const LocationDescription &Loc,
3160 Value *InteropVar, Value *Device,
3161 Value *NumDependences,
3162 Value *DependenceAddress,
3163 bool HaveNowaitClause);
3164
3165 /// Create a runtime call for __tgt_interop_use
3166 ///
3167 /// \param Loc The insert and source location description.
3168 /// \param InteropVar variable to be allocated
3169 /// \param Device Device to which offloading will occur.
3170 /// \param NumDependences number of dependence variables
3171 /// \param DependenceAddress pointer to dependence variables
3172 /// \param HaveNowaitClause does nowait clause exist
3173 ///
3174 /// \returns CallInst to the __tgt_interop_use call
3175 LLVM_ABI CallInst *createOMPInteropUse(const LocationDescription &Loc,
3176 Value *InteropVar, Value *Device,
3177 Value *NumDependences,
3178 Value *DependenceAddress,
3179 bool HaveNowaitClause);
3180
3181 /// The `omp target` interface
3182 ///
3183 /// For more information about the usage of this interface,
3184 /// \see openmp/libomptarget/deviceRTLs/common/include/target.h
3185 ///
3186 ///{
3187
3188 /// Create a runtime call for kmpc_target_init
3189 ///
3190 /// \param Loc The insert and source location description.
3191 /// \param Attrs Structure containing the default attributes, including
3192 /// numbers of threads and teams to launch the kernel with.
3194 const LocationDescription &Loc,
3196
3197 /// Create a runtime call for kmpc_target_deinit
3198 ///
3199 /// \param Loc The insert and source location description.
3200 /// \param TeamsReductionDataSize The maximal size of all the reduction data
3201 /// for teams reduction.
3202 /// \param TeamsReductionBufferLength The number of elements (each of up to
3203 /// \p TeamsReductionDataSize size) in the teams reduction buffer.
3204 LLVM_ABI void createTargetDeinit(const LocationDescription &Loc,
3205 int32_t TeamsReductionDataSize = 0,
3206 int32_t TeamsReductionBufferLength = 1024);
3207
3208 ///}
3209
3210 /// Helpers to read/write kernel annotations from the IR.
3211 ///
3212 ///{
3213
3214 /// Read/write the bounds on threads for \p Kernel. Read will return 0 if none
3215 /// is set.
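/// For illustration, a hedged sketch that clamps a kernel's upper thread bound
/// (assuming a target `Triple` `T` and a kernel `Function` `Kernel` provided by
/// the caller):
/// \code{.cpp}
///   auto [LB, UB] = OpenMPIRBuilder::readThreadBoundsForKernel(T, Kernel);
///   if (UB == 0 || UB > 256) // 0 means no bound has been set
///     OpenMPIRBuilder::writeThreadBoundsForKernel(T, Kernel, LB, /*UB=*/256);
/// \endcode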
3216 LLVM_ABI static std::pair<int32_t, int32_t>
3217 readThreadBoundsForKernel(const Triple &T, Function &Kernel);
3218 LLVM_ABI static void writeThreadBoundsForKernel(const Triple &T,
3219 Function &Kernel, int32_t LB,
3220 int32_t UB);
3221
3222 /// Read/write the bounds on teams for \p Kernel. Read will return 0 if none
3223 /// is set.
3224 LLVM_ABI static std::pair<int32_t, int32_t>
3225 readTeamBoundsForKernel(const Triple &T, Function &Kernel);
3226 LLVM_ABI static void writeTeamsForKernel(const Triple &T, Function &Kernel,
3227 int32_t LB, int32_t UB);
3228 ///}
3229
3230private:
3231 // Sets the function attributes expected for the outlined function
3232 void setOutlinedTargetRegionFunctionAttributes(Function *OutlinedFn);
3233
3234 // Creates the function ID/Address for the given outlined function.
3235 // In the case of an embedded device function the address of the function is
3236 // used, in the case of a non-offload function a constant is created.
3237 Constant *createOutlinedFunctionID(Function *OutlinedFn,
3238 StringRef EntryFnIDName);
3239
3240 // Creates the region entry address for the outlined function
3241 Constant *createTargetRegionEntryAddr(Function *OutlinedFunction,
3242 StringRef EntryFnName);
3243
3244public:
3245 /// Callback type used to generate a function with the given name.
3247 std::function<Expected<Function *>(StringRef FunctionName)>;
3248
3249 /// Create a unique name for the entry function using the source location
3250 /// information of the current target region. The name will be something like:
3251 ///
3252 /// __omp_offloading_DD_FFFF_PP_lBB[_CC]
3253 ///
3254 /// where DD_FFFF is an ID unique to the file (device and file IDs), PP is the
3255 /// mangled name of the function that encloses the target region and BB is the
3256 /// line number of the target region. CC is a count added when more than one
3257 /// region is located at the same location.
3258 ///
3259 /// If this target outlined function is not an offload entry, we don't need to
3260 /// register it. This may happen if it is guarded by an if clause that is
3261 /// false at compile time, or if no target architectures have been specified.
3262 ///
3263 /// The created target region ID is used by the runtime library to identify
3264 /// the current target region, so it only has to be unique and not
3265 /// necessarily point to anything. It could be the pointer to the outlined
3266 /// function that implements the target region, but we aren't using that so
3267 /// that the compiler doesn't need to keep it and can therefore inline the
3268 /// host function if proven worthwhile during optimization. On the other
3269 /// hand, if emitting code for the device, the ID has to be the function
3270 /// address so that it can be retrieved from the offloading entry and launched
3271 /// by the runtime library. We also mark the outlined function to have
3272 /// external linkage in case we are emitting code for the device, because
3273 /// these functions will be entry points to the device.
3274 ///
3275 /// \param InfoManager The info manager keeping track of the offload entries
3276 /// \param EntryInfo The entry information about the function
3277 /// \param GenerateFunctionCallback The callback function to generate the code
3278 /// \param OutlinedFn Pointer to the outlined function
3279 /// \param EntryFnIDName Name of the ID to be created
3281 TargetRegionEntryInfo &EntryInfo,
3282 FunctionGenCallback &GenerateFunctionCallback, bool IsOffloadEntry,
3283 Function *&OutlinedFn, Constant *&OutlinedFnID);
3284
3285 /// Registers the given function and sets up the attributes of the function.
3286 /// Returns the FunctionID.
3287 ///
3288 /// \param InfoManager The info manager keeping track of the offload entries
3289 /// \param EntryInfo The entry information about the function
3290 /// \param OutlinedFunction Pointer to the outlined function
3291 /// \param EntryFnName Name of the outlined function
3292 /// \param EntryFnIDName Name of the ID to be created
3295 Function *OutlinedFunction,
3296 StringRef EntryFnName, StringRef EntryFnIDName);
3297
3298 /// Type of BodyGen to use for region codegen
3299 ///
3300 /// Priv: If device pointer privatization is required, emit the body of the
3301 /// region here. It will have to be duplicated: with and without
3302 /// privatization.
3303 /// DupNoPriv: If we need device pointer privatization, we need
3304 /// to emit the body of the region with no privatization in the 'else' branch
3305 /// of the conditional.
3306 /// NoPriv: If we don't require privatization of device
3307 /// pointers, we emit the body in between the runtime calls. This avoids
3308 /// duplicating the body code.
3310
3311 /// Callback type for creating the map infos for the kernel parameters.
3312 /// \param CodeGenIP is the insertion point where code should be generated,
3313 /// if any.
3316
3317private:
3318 /// Emit the array initialization or deletion portion for user-defined mapper
3319 /// code generation. First, it evaluates whether an array section is mapped
3320 /// and whether the \a MapType instructs to delete this section. If \a IsInit
3321 /// is true, and \a MapType indicates to not delete this array, array
3322 /// initialization code is generated. If \a IsInit is false, and \a MapType
3323 /// indicates to delete this array, array deletion code is generated.
3324 void emitUDMapperArrayInitOrDel(Function *MapperFn, llvm::Value *MapperHandle,
3325 llvm::Value *Base, llvm::Value *Begin,
3326 llvm::Value *Size, llvm::Value *MapType,
3327 llvm::Value *MapName, TypeSize ElementSize,
3328 llvm::BasicBlock *ExitBB, bool IsInit);
3329
3330public:
3331 /// Emit the user-defined mapper function. The code generation follows the
3332 /// pattern in the example below.
3333 /// \code
3334 /// void .omp_mapper.<type_name>.<mapper_id>.(void *rt_mapper_handle,
3335 /// void *base, void *begin,
3336 /// int64_t size, int64_t type,
3337 /// void *name = nullptr) {
3338 /// // Allocate space for an array section first or add a base/begin for
3339 /// // pointer dereference.
3340 /// if ((size > 1 || (base != begin && maptype.IsPtrAndObj)) &&
3341 /// !maptype.IsDelete)
3342 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3343 /// size*sizeof(Ty), clearToFromMember(type));
3344 /// // Map members.
3345 /// for (unsigned i = 0; i < size; i++) {
3346 /// // For each component specified by this mapper:
3347 /// for (auto c : begin[i]->all_components) {
3348 /// if (c.hasMapper())
3349 /// (*c.Mapper())(rt_mapper_handle, c.arg_base, c.arg_begin,
3350 /// c.arg_size,
3351 /// c.arg_type, c.arg_name);
3352 /// else
3353 /// __tgt_push_mapper_component(rt_mapper_handle, c.arg_base,
3354 /// c.arg_begin, c.arg_size, c.arg_type,
3355 /// c.arg_name);
3356 /// }
3357 /// }
3358 /// // Delete the array section.
3359 /// if (size > 1 && maptype.IsDelete)
3360 /// __tgt_push_mapper_component(rt_mapper_handle, base, begin,
3361 /// size*sizeof(Ty), clearToFromMember(type));
3362 /// }
3363 /// \endcode
3364 ///
3365 /// \param PrivAndGenMapInfoCB Callback that privatizes code and populates the
3366 /// MapInfos and returns.
3367 /// \param ElemTy DeclareMapper element type.
3368 /// \param FuncName Optional param to specify mapper function name.
3369 /// \param CustomMapperCB Optional callback to generate code related to
3370 /// custom mappers.
3373 InsertPointTy CodeGenIP, llvm::Value *PtrPHI, llvm::Value *BeginArg)>
3374 PrivAndGenMapInfoCB,
3375 llvm::Type *ElemTy, StringRef FuncName,
3376 CustomMapperCallbackTy CustomMapperCB);
3377
3378 /// Generator for '#omp target data'
3379 ///
3380 /// \param Loc The location where the target data construct was encountered.
3381 /// \param AllocaIP The insertion points to be used for alloca instructions.
3382 /// \param CodeGenIP The insertion point at which the target directive code
3383 /// should be placed.
3384 /// \param IsBegin If true then emits begin mapper call otherwise emits
3385 /// end mapper call.
3386 /// \param DeviceID Stores the DeviceID from the device clause.
3387 /// \param IfCond Value which corresponds to the if clause condition.
3388 /// \param Info Stores all information related to the Target Data directive.
3389 /// \param GenMapInfoCB Callback that populates the MapInfos and returns.
3390 /// \param CustomMapperCB Callback to generate code related to
3391 /// custom mappers.
3392 /// \param BodyGenCB Optional Callback to generate the region code.
3393 /// \param DeviceAddrCB Optional callback to generate code related to
3394 /// use_device_ptr and use_device_addr.
3396 const LocationDescription &Loc, InsertPointTy AllocaIP,
3397 InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond,
3399 CustomMapperCallbackTy CustomMapperCB,
3400 omp::RuntimeFunction *MapperFunc = nullptr,
3402 BodyGenTy BodyGenType)>
3403 BodyGenCB = nullptr,
3404 function_ref<void(unsigned int, Value *)> DeviceAddrCB = nullptr,
3405 Value *SrcLocInfo = nullptr);
3406
3408 InsertPointTy AllocaIP, InsertPointTy CodeGenIP)>;
3409
3411 Argument &Arg, Value *Input, Value *&RetVal, InsertPointTy AllocaIP,
3412 InsertPointTy CodeGenIP)>;
3413
3414 /// Generator for '#omp target'
3415 ///
3416 /// \param Loc The location where the target construct was encountered.
3417 /// \param IsOffloadEntry whether it is an offload entry.
3418 /// \param CodeGenIP The insertion point where the call to the outlined
3419 /// function should be emitted.
3420 /// \param Info Stores all information related to the Target directive.
3421 /// \param EntryInfo The entry information about the function.
3422 /// \param DefaultAttrs Structure containing the default attributes, including
3423 /// numbers of threads and teams to launch the kernel with.
3424 /// \param RuntimeAttrs Structure containing the runtime numbers of threads
3425 /// and teams to launch the kernel with.
3426 /// \param IfCond value of the `if` clause.
3427 /// \param Inputs The input values to the region that will be passed
3428 /// as arguments to the outlined function.
3429 /// \param BodyGenCB Callback that will generate the region code.
3430 /// \param ArgAccessorFuncCB Callback that will generate accessors
3431 /// instructions for passed-in target arguments where necessary.
3432 /// \param CustomMapperCB Callback to generate code related to
3433 /// custom mappers.
3434 /// \param Dependencies A vector of DependData objects that carry
3435 /// dependency information as passed in the depend clause
3436 /// \param HasNowait Whether the target construct has a `nowait` clause or
3437 /// not.
3438 /// \param DynCGroupMem The size of the dynamic groupprivate memory for each
3439 /// cgroup.
3440 /// \param DynCGroupMemFallback The fallback mechanism to execute if the requested
3441 /// cgroup memory cannot be provided.
3443 const LocationDescription &Loc, bool IsOffloadEntry,
3446 TargetRegionEntryInfo &EntryInfo,
3447 const TargetKernelDefaultAttrs &DefaultAttrs,
3448 const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond,
3449 SmallVectorImpl<Value *> &Inputs, GenMapInfoCallbackTy GenMapInfoCB,
3450 TargetBodyGenCallbackTy BodyGenCB,
3451 TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB,
3452 CustomMapperCallbackTy CustomMapperCB,
3453 const SmallVector<DependData> &Dependencies, bool HasNowait = false,
3454 Value *DynCGroupMem = nullptr,
3455 omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
3457
3458 /// Returns __kmpc_for_static_init_* runtime function for the specified
3459 /// size \a IVSize and sign \a IVSigned. Will create a distribute call
3460 /// __kmpc_distribute_static_init* if \a IsGPUDistribute is set.
3462 bool IVSigned,
3463 bool IsGPUDistribute);
3464
3465 /// Returns __kmpc_dispatch_init_* runtime function for the specified
3466 /// size \a IVSize and sign \a IVSigned.
3468 bool IVSigned);
3469
3470 /// Returns __kmpc_dispatch_next_* runtime function for the specified
3471 /// size \a IVSize and sign \a IVSigned.
3473 bool IVSigned);
3474
3475 /// Returns __kmpc_dispatch_fini_* runtime function for the specified
3476 /// size \a IVSize and sign \a IVSigned.
3478 bool IVSigned);
3479
3480 /// Returns __kmpc_dispatch_deinit runtime function.
3482
3483 /// Declarations for LLVM-IR types (simple, array, function and structure) are
3484 /// generated below. Their names are defined and used in OpenMPKinds.def. Here
3485 /// we provide the declarations; the initializeTypes function will provide the
3486 /// values.
3487 ///
3488 ///{
3489#define OMP_TYPE(VarName, InitValue) Type *VarName = nullptr;
3490#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
3491 ArrayType *VarName##Ty = nullptr; \
3492 PointerType *VarName##PtrTy = nullptr;
3493#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
3494 FunctionType *VarName = nullptr; \
3495 PointerType *VarName##Ptr = nullptr;
3496#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
3497 StructType *VarName = nullptr; \
3498 PointerType *VarName##Ptr = nullptr;
3499#include "llvm/Frontend/OpenMP/OMPKinds.def"
3500
3501 ///}
3502
3503private:
3504 /// Create all simple and struct types exposed by the runtime and remember
3505 /// the llvm::PointerTypes of them for easy access later.
3506 void initializeTypes(Module &M);
3507
3508 /// Common interface for generating entry calls for OMP Directives.
3509 /// If the directive has a region/body, it will set the insertion
3510 /// point to the body.
3511 ///
3512 /// \param OMPD Directive to generate entry blocks for
3513 /// \param EntryCall Call to the entry OMP Runtime Function
3514 /// \param ExitBB block where the region ends.
3515 /// \param Conditional indicate if the entry call result will be used
3516 /// to evaluate a conditional of whether a thread will execute
3517 /// body code or not.
3518 ///
3519 /// \return The insertion position in exit block
3520 InsertPointTy emitCommonDirectiveEntry(omp::Directive OMPD, Value *EntryCall,
3521 BasicBlock *ExitBB,
3522 bool Conditional = false);
3523
3524 /// Common interface to finalize the region
3525 ///
3526 /// \param OMPD Directive to generate exiting code for
3527 /// \param FinIP Insertion point for emitting Finalization code and exit call.
3528 /// This block must not contain any non-finalization code.
3529 /// \param ExitCall Call to the ending OMP Runtime Function
3530 /// \param HasFinalize indicate if the directive will require finalization
3531 /// and has a finalization callback in the stack that
3532 /// should be called.
3533 ///
3534 /// \return The insertion position in exit block
3535 InsertPointOrErrorTy emitCommonDirectiveExit(omp::Directive OMPD,
3536 InsertPointTy FinIP,
3537 Instruction *ExitCall,
3538 bool HasFinalize = true);
3539
3540 /// Common Interface to generate OMP inlined regions
3541 ///
3542 /// \param OMPD Directive to generate inlined region for
3543 /// \param EntryCall Call to the entry OMP Runtime Function
3544 /// \param ExitCall Call to the ending OMP Runtime Function
3545 /// \param BodyGenCB Body code generation callback.
3546 /// \param FiniCB Finalization Callback. Will be called when finalizing region
3547 /// \param Conditional indicate if the entry call result will be used
3548 /// to evaluate a conditional of whether a thread will execute
3549 /// body code or not.
3550 /// \param HasFinalize indicate if the directive will require finalization
3551 /// and has a finalization callback in the stack that
3552 /// should be called.
3553 /// \param IsCancellable if HasFinalize is set to true, indicate if the
3554 /// directive should be cancellable.
3555 /// \return The insertion point after the region
3557 EmitOMPInlinedRegion(omp::Directive OMPD, Instruction *EntryCall,
3558 Instruction *ExitCall, BodyGenCallbackTy BodyGenCB,
3559 FinalizeCallbackTy FiniCB, bool Conditional = false,
3560 bool HasFinalize = true, bool IsCancellable = false);
3561
3562 /// Get the platform-specific name separator.
3563 /// \param Parts different parts of the final name that needs separation
3564 /// \param FirstSeparator First separator used between the initial two
3565 /// parts of the name.
3566 /// \param Separator Separator used between all of the remaining consecutive
3567 /// parts of the name
3568 static std::string getNameWithSeparators(ArrayRef<StringRef> Parts,
3569 StringRef FirstSeparator,
3570 StringRef Separator);
3571
3572 /// Returns corresponding lock object for the specified critical region
3573 /// name. If the lock object does not exist it is created, otherwise the
3574 /// reference to the existing copy is returned.
3575 /// \param CriticalName Name of the critical region.
3576 ///
3577 Value *getOMPCriticalRegionLock(StringRef CriticalName);
3578
3579 /// Callback type for Atomic Expression update
3580 /// ex:
3581 /// \code{.cpp}
3582 /// unsigned x = 0;
3583 /// #pragma omp atomic update
3584 /// x = Expr(x_old); //Expr() is any legal operation
3585 /// \endcode
3586 ///
3587 /// \param XOld the value of the atomic memory address to use for update
3588 /// \param IRB reference to the IRBuilder to use
3589 ///
3590 /// \returns Value to update X to.
3591 using AtomicUpdateCallbackTy =
3592 const function_ref<Expected<Value *>(Value *XOld, IRBuilder<> &IRB)>;
3593
3594private:
3595 enum AtomicKind { Read, Write, Update, Capture, Compare };
3596
3597 /// Determine whether to emit flush or not
3598 ///
3599 /// \param Loc The insert and source location description.
3600 /// \param AO The required atomic ordering
3601 /// \param AK The OpenMP atomic operation kind used.
3602 ///
3603 /// \returns whether a flush was emitted or not
3604 bool checkAndEmitFlushAfterAtomic(const LocationDescription &Loc,
3605 AtomicOrdering AO, AtomicKind AK);
3606
3607 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3608 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3609 /// Only Scalar data types.
3610 ///
3611 /// \param AllocaIP The insertion point to be used for alloca
3612 /// instructions.
3613 /// \param X The target atomic pointer to be updated
3614 /// \param XElemTy The element type of the atomic pointer.
3615 /// \param Expr The value to update X with.
3616 /// \param AO Atomic ordering of the generated atomic
3617 /// instructions.
3618 /// \param RMWOp The binary operation used for the update. If the
3619 /// operation is not supported by atomicRMW, or belongs to
3620 /// {FADD, FSUB, BAD_BINOP},
3621 /// then a `cmpExch`-based atomic will be generated.
3622 /// \param UpdateOp Code generator for complex expressions that cannot be
3623 /// expressed through atomicrmw instruction.
3624 /// \param VolatileX True if \a X is volatile.
3625 /// \param IsXBinopExpr True if \a X is the left-hand side of the binary
3626 /// operation on the right-hand side of the update expression,
3627 /// false otherwise (e.g. true for X = X BinOp Expr).
3628 ///
3629 /// \returns A pair of the old value of X before the update, and the value
3630 /// used for the update.
3631 Expected<std::pair<Value *, Value *>>
3632 emitAtomicUpdate(InsertPointTy AllocaIP, Value *X, Type *XElemTy, Value *Expr,
3634 AtomicUpdateCallbackTy &UpdateOp, bool VolatileX,
3635 bool IsXBinopExpr, bool IsIgnoreDenormalMode,
3636 bool IsFineGrainedMemory, bool IsRemoteMemory);
3637
3638 /// Emit the binary op. described by \p RMWOp, using \p Src1 and \p Src2 .
3639 ///
3640 /// \return The emitted instruction.
3641 Value *emitRMWOpAsInstruction(Value *Src1, Value *Src2,
3642 AtomicRMWInst::BinOp RMWOp);
3643
3644 bool IsFinalized;
3645
3646public:
3647 /// A struct to pack relevant information while generating atomic Ops.
3649 Value *Var = nullptr;
3650 Type *ElemTy = nullptr;
3651 bool IsSigned = false;
3652 bool IsVolatile = false;
3653 };
3654
3655 /// Emit atomic Read for : V = X --- Only Scalar data types.
3656 ///
3657 /// \param Loc The insert and source location description.
3658 /// \param X The target pointer to be atomically read
3659 /// \param V Memory address where to store atomically read
3660 /// value
3661 /// \param AO Atomic ordering of the generated atomic
3662 /// instructions.
3663 /// \param AllocaIP Insert point for allocas
3664 ///
3665 /// \return Insertion point after generated atomic read IR.
3668 AtomicOrdering AO,
3669 InsertPointTy AllocaIP);
3670
3671 /// Emit atomic write for : X = Expr --- Only Scalar data types.
3672 ///
3673 /// \param Loc The insert and source location description.
3674 /// \param X The target pointer to be atomically written to
3675 /// \param Expr The value to store.
3676 /// \param AO Atomic ordering of the generated atomic
3677 /// instructions.
3678 /// \param AllocaIP Insert point for allocas
3679 ///
3680 /// \return Insertion point after generated atomic Write IR.
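/// For illustration, a hedged sketch of an atomic store of the constant 42
/// (assuming `OMPBuilder`, `Builder`, an `AllocaIP`, a caller-provided pointer
/// `XPtr` to the atomic location, and this generator being invoked as
/// `createAtomicWrite`):
/// \code{.cpp}
///   OpenMPIRBuilder::AtomicOpValue X;
///   X.Var = XPtr;                    // XPtr is an assumption of this sketch
///   X.ElemTy = Builder.getInt32Ty();
///   auto AfterIP = OMPBuilder.createAtomicWrite(
///       OpenMPIRBuilder::LocationDescription(Builder), X,
///       /*Expr=*/Builder.getInt32(42), AtomicOrdering::Monotonic, AllocaIP);
///   // AfterIP is the position after the emitted atomic write.
/// \endcode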
3682 AtomicOpValue &X, Value *Expr,
3683 AtomicOrdering AO,
3684 InsertPointTy AllocaIP);
3685
3686 /// Emit atomic update for constructs: X = X BinOp Expr ,or X = Expr BinOp X
3687 /// For complex Operations: X = UpdateOp(X) => CmpExch X, old_X, UpdateOp(X)
3688 /// Only Scalar data types.
3689 ///
3690 /// \param Loc The insert and source location description.
3691 /// \param AllocaIP The insertion point to be used for alloca instructions.
3692 /// \param X The target atomic pointer to be updated
3693 /// \param Expr The value to update X with.
3694 /// \param AO Atomic ordering of the generated atomic instructions.
3695 /// \param RMWOp The binary operation used for the update. If the operation
3696 /// is not supported by atomicRMW, or belongs to
3697 /// {FADD, FSUB, BAD_BINOP}, then a `cmpExch`-based
3698 /// atomic will be generated.
3699 /// \param UpdateOp Code generator for complex expressions that cannot be
3700 /// expressed through atomicrmw instruction.
3701 /// \param IsXBinopExpr True if \a X is the left-hand side of the binary
3702 /// operation on the right-hand side of the update expression,
3703 /// false otherwise (e.g. true for X = X BinOp Expr).
3704 ///
3705 /// \return Insertion point after generated atomic update IR.
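/// For illustration, a hedged sketch of '#pragma omp atomic update' for
/// 'x = x + 1' (assuming `OMPBuilder`, `Builder`, `AllocaIP`, an AtomicOpValue
/// `X` set up as in the atomic write sketch above, and this generator being
/// invoked as `createAtomicUpdate`):
/// \code{.cpp}
///   auto UpdateOp = [](Value *XOld, IRBuilder<> &IRB) -> Expected<Value *> {
///     // Only used when the update cannot be lowered to a plain atomicrmw.
///     return IRB.CreateAdd(XOld, IRB.getInt32(1));
///   };
///   auto AfterIP = OMPBuilder.createAtomicUpdate(
///       OpenMPIRBuilder::LocationDescription(Builder), AllocaIP, X,
///       /*Expr=*/Builder.getInt32(1), AtomicOrdering::Monotonic,
///       AtomicRMWInst::Add, UpdateOp, /*IsXBinopExpr=*/true);
///   if (!AfterIP)
///     return AfterIP.takeError();
///   Builder.restoreIP(*AfterIP);
/// \endcode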
3708 Value *Expr, AtomicOrdering AO, AtomicRMWInst::BinOp RMWOp,
3709 AtomicUpdateCallbackTy &UpdateOp, bool IsXBinopExpr,
3710 bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
3711 bool IsRemoteMemory = false);
3712
3713 /// Emit atomic update for constructs: --- Only Scalar data types
3714 /// V = X; X = X BinOp Expr ,
3715 /// X = X BinOp Expr; V = X,
3716 /// V = X; X = Expr BinOp X,
3717 /// X = Expr BinOp X; V = X,
3718 /// V = X; X = UpdateOp(X),
3719 /// X = UpdateOp(X); V = X,
3720 ///
3721 /// \param Loc The insert and source location description.
3722 /// \param AllocaIP The insertion point to be used for alloca instructions.
3723 /// \param X The target atomic pointer to be updated
3724 /// \param V Memory address where to store captured value
3725 /// \param Expr The value to update X with.
3726 /// \param AO Atomic ordering of the generated atomic instructions
3727 /// \param RMWOp The binary operation used for the update. If the
3728 /// operation is not supported by atomicRMW, or belongs to
3729 /// {FADD, FSUB, BAD_BINOP}, then a cmpExch-based
3730 /// atomic will be generated.
3731 /// \param UpdateOp Code generator for complex expressions that cannot be
3732 /// expressed through atomicrmw instruction.
3733 /// \param UpdateExpr true if X is an in place update of the form
3734 /// X = X BinOp Expr or X = Expr BinOp X
3735 /// \param IsXBinopExpr True if X is the left-hand side of the binary
3736 /// operation on the right-hand side of the update expression,
3737 /// false otherwise (e.g. true for X = X BinOp Expr).
3738 /// \param IsPostfixUpdate true if original value of 'x' must be stored in
3739 /// 'v', not an updated one.
3740 ///
3741 /// \return Insertion point after generated atomic capture IR.
3744 AtomicOpValue &V, Value *Expr, AtomicOrdering AO,
3745 AtomicRMWInst::BinOp RMWOp, AtomicUpdateCallbackTy &UpdateOp,
3746 bool UpdateExpr, bool IsPostfixUpdate, bool IsXBinopExpr,
3747 bool IsIgnoreDenormalMode = false, bool IsFineGrainedMemory = false,
3748 bool IsRemoteMemory = false);
3749
3750 /// Emit atomic compare for constructs: --- Only scalar data types
3751 /// cond-expr-stmt:
3752 /// x = x ordop expr ? expr : x;
3753 /// x = expr ordop x ? expr : x;
3754 /// x = x == e ? d : x;
3755 /// x = e == x ? d : x; (this one is not in the spec)
3756 /// cond-update-stmt:
3757 /// if (x ordop expr) { x = expr; }
3758 /// if (expr ordop x) { x = expr; }
3759 /// if (x == e) { x = d; }
3760 /// if (e == x) { x = d; } (this one is not in the spec)
3761 /// conditional-update-capture-atomic:
3762 /// v = x; cond-update-stmt; (IsPostfixUpdate=true, IsFailOnly=false)
3763 /// cond-update-stmt; v = x; (IsPostfixUpdate=false, IsFailOnly=false)
3764 /// if (x == e) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3765 /// IsFailOnly=true)
3766 /// r = x == e; if (r) { x = d; } (IsPostfixUpdate=false, IsFailOnly=false)
3767 /// r = x == e; if (r) { x = d; } else { v = x; } (IsPostfixUpdate=false,
3768 /// IsFailOnly=true)
3769 ///
3770 /// \param Loc The insert and source location description.
3771 /// \param X The target atomic pointer to be updated.
3772 /// \param V Memory address where to store captured value (for
3773 /// compare capture only).
3774 /// \param R Memory address where to store comparison result
3775 /// (for compare capture with '==' only).
3776 /// \param E The expected value ('e') for forms that use an
3777 /// equality comparison or an expression ('expr') for
3778 /// forms that use 'ordop' (logically an atomic maximum or
3779 /// minimum).
3780 /// \param D The desired value for forms that use an equality
3781 /// comparison. For forms that use 'ordop', it should be
3782 /// \p nullptr.
3783 /// \param AO Atomic ordering of the generated atomic instructions.
3784 /// \param Op Atomic compare operation. It can only be ==, <, or >.
3785 /// \param IsXBinopExpr True if the conditional statement is in the form where
3786 /// x is on LHS. It only matters for < or >.
3787 /// \param IsPostfixUpdate True if original value of 'x' must be stored in
3788 /// 'v', not an updated one (for compare capture
3789 /// only).
3790 /// \param IsFailOnly True if the original value of 'x' is stored to 'v'
3791 /// only when the comparison fails. This is only valid for
3792 /// the case where the comparison is '=='.
3793 ///
3794 /// \return Insertion point after generated atomic capture IR.
3799 bool IsXBinopExpr, bool IsPostfixUpdate, bool IsFailOnly);
3803 omp::OMPAtomicCompareOp Op, bool IsXBinopExpr, bool IsPostfixUpdate,
3804 bool IsFailOnly, AtomicOrdering Failure);
3805
3806 /// Create the control flow structure of a canonical OpenMP loop.
3807 ///
3808 /// The emitted loop will be disconnected, i.e. no edge to the loop's
3809 /// preheader and no terminator in the AfterBB. The OpenMPIRBuilder's
3810 /// IRBuilder location is not preserved.
3811 ///
3812 /// \param DL DebugLoc used for the instructions in the skeleton.
3813 /// \param TripCount Value to be used for the trip count.
3814 /// \param F Function in which to insert the BasicBlocks.
3815 /// \param PreInsertBefore Where to insert BBs that execute before the body,
3816 /// typically the body itself.
3817 /// \param PostInsertBefore Where to insert BBs that execute after the body.
3818 /// \param Name Base name used to derive BB
3819 /// and instruction names.
3820 ///
3821 /// \returns The CanonicalLoopInfo that represents the emitted loop.
3823 Function *F,
3824 BasicBlock *PreInsertBefore,
3825 BasicBlock *PostInsertBefore,
3826 const Twine &Name = {});
3827 /// OMP Offload Info Metadata name string
3828 const std::string ompOffloadInfoName = "omp_offload.info";
3829
3830 /// Loads all the offload entries information from the host IR
3831 /// metadata. This function is only meant to be used with device code
3832 /// generation.
3833 ///
3834 /// \param M Module to load metadata info from. The module passed may be
3835 /// loaded from a bitcode file, i.e., different from the OpenMPIRBuilder::M module.
3837
3838 /// Loads all the offload entries information from the host IR
3839 /// metadata read from the file passed in as the HostFilePath argument. This
3840 /// function is only meant to be used with device code generation.
3841 ///
3842 /// \param HostFilePath The path to the host IR file,
3843 /// used to load in offload metadata for the device, allowing host and device
3844 /// to maintain the same metadata mapping.
3846 StringRef HostFilePath);
3847
3848 /// Gets (if a variable with the given name already exists) or creates an
3849 /// internal global variable with the specified Name. The created variable has
3850 /// CommonLinkage by default and is initialized with a null value.
3851 /// \param Ty Type of the global variable. If the variable already exists, the
3852 /// type must be the same.
3853 /// \param Name Name of the variable.
3856 std::optional<unsigned> AddressSpace = {});
3857};
3858
3859/// Class to represent the control flow structure of an OpenMP canonical loop.
3860///
3861/// The control-flow structure is standardized for easy consumption by
3862/// directives associated with loops. For instance, the worksharing-loop
3863/// construct may change this control flow such that each loop iteration is
3864/// executed on only one thread. The constraints of a canonical loop in brief
3865/// are:
3866///
3867/// * The number of loop iterations must have been computed before entering the
3868/// loop.
3869///
3870/// * Has an (unsigned) logical induction variable that starts at zero and
3871/// increments by one.
3872///
3873/// * The loop's CFG itself has no side-effects. The OpenMP specification
3874/// itself allows side-effects, but the order in which they happen, including
3875/// how often or whether at all, is unspecified. We expect that the frontend
3876/// will emit those side-effect instructions somewhere (e.g. before the loop)
3877/// such that the CanonicalLoopInfo itself can be side-effect free.
3878///
3879/// Keep in mind that CanonicalLoopInfo is meant to only describe a repeated
3880/// execution of a loop body that satisfies these constraints. It does NOT
3881/// represent arbitrary SESE regions that happen to contain a loop. Do not use
3882/// CanonicalLoopInfo for such purposes.
3883///
3884/// The control flow can be described as follows:
3885///
3886/// Preheader
3887/// |
3888/// /-> Header
3889/// | |
3890/// | Cond---\
3891/// | | |
3892/// | Body |
3893/// | | | |
3894/// | <...> |
3895/// | | | |
3896/// \--Latch |
3897/// |
3898/// Exit
3899/// |
3900/// After
3901///
3902/// The loop is thought to start at PreheaderIP (at the Preheader's terminator,
3903/// inclusive) and end at AfterIP (at the After block's first instruction, exclusive).
3904/// That is, instructions in the Preheader and After blocks (except the
3905/// Preheader's terminator) are out of CanonicalLoopInfo's control and may have
3906/// side-effects. Typically, the Preheader is used to compute the loop's trip
3907/// count. The instructions from BodyIP (at the Body block's first instruction,
3908/// exclusive) until the Latch are also considered outside CanonicalLoopInfo's
3909/// control and thus can have side-effects. The body block is the single entry
3910/// point into the loop body, which may contain arbitrary control flow as long
3911/// as all control paths eventually branch to the Latch block.
3912///
3913/// TODO: Consider adding another standardized BasicBlock between Body CFG and
3914/// Latch to guarantee that there is only a single edge to the latch. It would
3915/// make loop transformations easier by not needing to consider multiple
3916/// predecessors of the latch (see redirectAllPredecessorsTo) and would give us
3917/// an equivalent to PreheaderIP, AfterIP and BodyIP for inserting code that
3918/// executes after each body iteration.
3919///
3920/// There must be no loop-carried dependencies through llvm::Values. This is
3921/// equivalent to saying that the Latch has no PHINode and that the Header's
3922/// only PHINode is for the induction variable.
3923///
3924/// All code in Header, Cond, Latch and Exit (plus the terminator of the
3925/// Preheader) is CanonicalLoopInfo's responsibility and its build-up is checked
3926/// by assertOK(). It is expected not to be modified unless the
3927/// CanonicalLoopInfo is explicitly modified through a method that applies an OpenMP
3928/// loop-associated construct such as applyWorkshareLoop, tileLoops, unrollLoop,
3929/// etc. These methods usually invalidate the CanonicalLoopInfo and re-use its
3930/// basic blocks. After invalidation, the CanonicalLoopInfo must not be used
3931/// anymore as its underlying control flow may not exist anymore.
3932/// Loop-transformation methods such as tileLoops, collapseLoops and unrollLoop
3933/// may also return a new CanonicalLoopInfo that can be passed to other
3934/// loop-associated construct implementing methods. These loop-transforming
3935/// methods may either create a new CanonicalLoopInfo (usually using
3936/// createLoopSkeleton) and invalidate the input CanonicalLoopInfo, or reuse and
3937/// modify one of the input CanonicalLoopInfos and return it as representing the
3938/// modified loop. What is done is an implementation detail of the
3939/// transformation-implementing method, and callers should always assume that the
3940/// CanonicalLoopInfo passed to it is invalidated and a new object is returned.
3941/// Returned CanonicalLoopInfos have the same structure and guarantees as the one
3942/// created by createCanonicalLoop, such that transforming methods do not have
3943/// to special-case where the CanonicalLoopInfo originated from.
3944///
3945/// Generally, methods consuming CanonicalLoopInfo do not need an
3946/// OpenMPIRBuilder::InsertPointTy as argument, but use the locations of the
3947/// CanonicalLoopInfo to insert new or modify existing instructions. Unless
3948/// documented otherwise, methods consuming CanonicalLoopInfo do not invalidate
3949/// any InsertPoint that is outside CanonicalLoopInfo's control. Specifically,
3950/// any InsertPoint in the Preheader or After block can still be used after
3951/// calling such a method.
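/// A hedged sketch of how a consumer typically fills in the body of a
/// canonical loop (assuming an IRBuilder `Builder` and a valid
/// CanonicalLoopInfo `CLI` obtained from createCanonicalLoop, using the
/// accessors documented on the class below):
/// \code{.cpp}
///   Builder.restoreIP(CLI->getBodyIP());
///   Value *IV = CLI->getIndVar(); // logical iteration number, starting at 0
///   // ... emit the per-iteration instructions using IV here ...
///   Builder.restoreIP(CLI->getAfterIP()); // continue emitting after the loop
/// \endcode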
3952///
3953/// TODO: Provide mechanisms for exception handling and cancellation points.
3954///
3955/// Defined outside OpenMPIRBuilder because nested classes cannot be
3956/// forward-declared, e.g. to avoid having to include the entire OMPIRBuilder.h.
3958 friend class OpenMPIRBuilder;
3959
3960private:
3961 BasicBlock *Header = nullptr;
3962 BasicBlock *Cond = nullptr;
3963 BasicBlock *Latch = nullptr;
3964 BasicBlock *Exit = nullptr;
3965
3966 // Hold the MLIR value for the `lastiter` of the canonical loop.
3967 Value *LastIter = nullptr;
3968
3969 /// Add the control blocks of this loop to \p BBs.
3970 ///
3971 /// This does not include any block from the body, including the one returned
3972 /// by getBody().
3973 ///
3974 /// FIXME: This currently includes the Preheader and After blocks even though
3975 /// their content is (mostly) not under CanonicalLoopInfo's control.
3976 /// Re-evaluate whether this makes sense.
3977 void collectControlBlocks(SmallVectorImpl<BasicBlock *> &BBs);
3978
3979 /// Sets the number of loop iterations to the given value. This value must be
3980 /// valid in the condition block (i.e., defined in the preheader) and is
3981 /// interpreted as an unsigned integer.
3982 void setTripCount(Value *TripCount);
3983
3984 /// Replace all uses of the canonical induction variable in the loop body with
3985 /// a new one.
3986 ///
3987 /// The intended use case is to update the induction variable for an updated
3988 /// iteration space such that it can stay normalized in the 0...tripcount-1
3989 /// range.
3990 ///
3991 /// The \p Updater is called with the (presumably updated) current normalized
3992 /// induction variable and is expected to return the value that uses of the
3993 /// pre-updated induction values should use instead, typically dependent on
3994 /// the new induction variable. This is a lambda (instead of e.g. just passing
3995 /// the new value) to be able to distinguish the uses of the pre-updated
3996 /// induction variable from uses of the induction variable that compute the
3997 /// updated induction variable value.
3998 void mapIndVar(llvm::function_ref<Value *(Instruction *)> Updater);
3999
4000public:
4001 /// Sets the last iteration variable for this loop.
4002 void setLastIter(Value *IterVar) { LastIter = std::move(IterVar); }
4003
4004 /// Returns the last iteration variable for this loop.
4005 /// Certain use-cases (like translation of the linear clause) may access
4006 /// this variable even after a loop transformation. Hence, this getter is
4007 /// deliberately not guarded by `isValid`. It is the responsibility of the
4008 /// caller to ensure this functionality is not invoked on a non-outlined
4009 /// CanonicalLoopInfo object (in which case `setLastIter` will never be
4010 /// invoked and `LastIter` will be `nullptr` by default).
4011 Value *getLastIter() { return LastIter; }
4012
4013 /// Returns whether this object currently represents the IR of a loop. If
4014 /// returning false, it may have been consumed by a loop transformation or not
4015 /// been initialized. Do not use it in this case.
4016 bool isValid() const { return Header; }
4017
4018 /// The preheader ensures that there is only a single edge entering the loop.
4019 /// Code that must be executed before any loop iteration can be emitted here,
4020 /// such as computing the loop trip count and begin lifetime markers. Code in
4021 /// the preheader is not considered part of the canonical loop.
4023
4024 /// The header is the entry for each iteration. In the canonical control flow,
4025 /// it only contains the PHINode for the induction variable.
4027 assert(isValid() && "Requires a valid canonical loop");
4028 return Header;
4029 }
4030
4031 /// The condition block computes whether there is another loop iteration. If
4032 /// yes, branches to the body; otherwise to the exit block.
4034 assert(isValid() && "Requires a valid canonical loop");
4035 return Cond;
4036 }
4037
4038 /// The body block is the single entry for a loop iteration and not controlled
4039 /// by CanonicalLoopInfo. It can contain arbitrary control flow but must
4040 /// eventually branch to the \p Latch block.
4042 assert(isValid() && "Requires a valid canonical loop");
4043 return cast<BranchInst>(Cond->getTerminator())->getSuccessor(0);
4044 }
4045
4046 /// Reaching the latch indicates the end of the loop body code. In the
4047 /// canonical control flow, it only contains the increment of the induction
4048 /// variable.
4050 assert(isValid() && "Requires a valid canonical loop");
4051 return Latch;
4052 }
4053
4054 /// Reaching the exit indicates no more iterations are being executed.
4056 assert(isValid() && "Requires a valid canonical loop");
4057 return Exit;
4058 }
4059
4060 /// The after block is intended for clean-up code such as lifetime end
4061 /// markers. It is separate from the exit block to ensure, analogous to the
4062 /// preheader, that it has just a single entry edge and is free from PHI
4063 /// nodes should there be multiple loop exits (such as from break
4064 /// statements/cancellations).
4066 assert(isValid() && "Requires a valid canonical loop");
4067 return Exit->getSingleSuccessor();
4068 }
4069
4070 /// Returns the llvm::Value containing the number of loop iterations. It must
4071 /// be valid in the preheader and always interpreted as an unsigned integer of
4072 /// any bit-width.
4074 assert(isValid() && "Requires a valid canonical loop");
4075 Instruction *CmpI = &Cond->front();
4076 assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
4077 return CmpI->getOperand(1);
4078 }
4079
4080 /// Returns the instruction representing the current logical induction
4081 /// variable. Always unsigned, always starting at 0 with an increment of one.
4083 assert(isValid() && "Requires a valid canonical loop");
4084 Instruction *IndVarPHI = &Header->front();
4085 assert(isa<PHINode>(IndVarPHI) && "First inst must be the IV PHI");
4086 return IndVarPHI;
4087 }
4088
4089 /// Return the type of the induction variable (and the trip count).
4091 assert(isValid() && "Requires a valid canonical loop");
4092 return getIndVar()->getType();
4093 }
4094
4095 /// Return the insertion point for user code before the loop.
4097 assert(isValid() && "Requires a valid canonical loop");
4098 BasicBlock *Preheader = getPreheader();
4099 return {Preheader, std::prev(Preheader->end())};
4100 };
4101
4102 /// Return the insertion point for user code in the body.
4104 assert(isValid() && "Requires a valid canonical loop");
4105 BasicBlock *Body = getBody();
4106 return {Body, Body->begin()};
4107 };
4108
4109 /// Return the insertion point for user code after the loop.
4111 assert(isValid() && "Requires a valid canonical loop");
4112 BasicBlock *After = getAfter();
4113 return {After, After->begin()};
4114 };
4115
4117 assert(isValid() && "Requires a valid canonical loop");
4118 return Header->getParent();
4119 }
4120
4121 /// Consistency self-check.
4122 LLVM_ABI void assertOK() const;
4123
4124 /// Invalidate this loop. That is, the underlying IR does not fulfill the
4125 /// requirements of an OpenMP canonical loop anymore.
4126 LLVM_ABI void invalidate();
4127};
4128
4129/// ScanInfo holds the information to assist in lowering of Scan reduction.
4130/// Before lowering, the body of the for loop specifying scan reduction is
4131/// expected to have the following structure
4132///
4133/// Loop Body Entry
4134/// |
4135/// Code before the scan directive
4136/// |
4137/// Scan Directive
4138/// |
4139/// Code after the scan directive
4140/// |
4141/// Loop Body Exit
4142/// When `createCanonicalScanLoops` is executed, its bodyGen callback
4143/// transforms the body to:
4144///
4145/// Loop Body Entry
4146/// |
4147/// OMPScanDispatch
4148///
4149/// OMPBeforeScanBlock
4150/// |
4151/// OMPScanLoopExit
4152/// |
4153/// Loop Body Exit
4154///
4155/// The insert point is updated to the first insert point of OMPBeforeScanBlock,
4156/// which dominates the control flow of the code generated until the scan
4157/// directive is encountered; OMPAfterScanBlock dominates the control flow of
4158/// the code generated after the scan directive is encountered. The successor of
4159/// OMPScanDispatch can be OMPBeforeScanBlock or OMPAfterScanBlock, based on
4160/// 1. whether it is the input phase or the scan phase, and 2. whether the scan
4161/// is exclusive or inclusive. This jump is added when `createScan` is executed.
4162/// While the input loop is generated, `OMPAfterScanBlock` succeeds
4163/// `OMPScanDispatch` for an inclusive scan and `OMPBeforeScanBlock` succeeds
4164/// `OMPScanDispatch` for an exclusive scan; the opposite holds for the scan
4165/// loop. At the end of the input loop the temporary buffer is populated, and at
4166/// the beginning of the scan loop the temporary buffer is read. After the scan
4167/// directive is encountered, the insertion point is updated to `OMPAfterScanBlock`,
4168/// as it is expected to dominate the code after the scan directive. Both the
4169/// before and after scan blocks are succeeded by `OMPScanLoopExit`.
4170/// Temporary buffer allocations are done in the `ScanLoopInit` block before the
4171/// for-loop is lowered. The results are copied back to the reduction variable
4172/// in the `ScanLoopFinish` block.
4173class ScanInfo {
4174public:
4175 /// Dominates the body of the loop before the scan directive
4176 llvm::BasicBlock *OMPBeforeScanBlock = nullptr;
4177
4178 /// Dominates the body of the loop after the scan directive
4179 llvm::BasicBlock *OMPAfterScanBlock = nullptr;
4180
4181 /// Controls the flow to before or after scan blocks
4182 llvm::BasicBlock *OMPScanDispatch = nullptr;
4183
4184 /// Exit block of loop body
4185 llvm::BasicBlock *OMPScanLoopExit = nullptr;
4186
4187 /// Block before loop body where scan initializations are done
4188 llvm::BasicBlock *OMPScanInit = nullptr;
4189
4190 /// Block after loop body where scan finalizations are done
4191 llvm::BasicBlock *OMPScanFinish = nullptr;
4192
4193 /// If true, indicates that the input phase is being lowered; otherwise the
4194 /// scan phase is being lowered
4195 bool OMPFirstScanLoop = false;
4196
4197 /// Maps the private reduction variable to the pointer of the temporary
4198 /// buffer
4199 llvm::SmallDenseMap<llvm::Value *, llvm::Value *> *ScanBuffPtrs;
4200
4201 /// Keeps track of the value of the iteration variable for the input/scan
4202 /// loop, to be used for scan directive lowering
4203 llvm::Value *IV = nullptr;
4204
4205 /// Stores the span of the canonical loop being lowered, to be used for
4206 /// temporary buffer allocation or finalization.
4207 llvm::Value *Span = nullptr;
4208
4212 ScanInfo(ScanInfo &) = delete;
4213 ScanInfo &operator=(const ScanInfo &) = delete;
4214
4215 ~ScanInfo() { delete (ScanBuffPtrs); }
4216};
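The fields above are not normally populated by hand: a frontend obtains a ScanInfo from OpenMPIRBuilder::scanInfoInitialize() and threads it through createCanonicalScanLoops and createScan, which fill in and consume these blocks. The following is a hypothetical sketch only, reusing the includes and `using namespace llvm` from the previous example; emitInclusiveScanLoop, ScanVar, ScanVarTy and Stop are illustrative names, and error handling is abbreviated.

// Hypothetical sketch: a single ScanInfo is shared by the generated input
// loop, the scan loop, and the createScan call that wires OMPScanDispatch,
// OMPBeforeScanBlock, OMPAfterScanBlock and OMPScanLoopExit together.
static Error emitInclusiveScanLoop(OpenMPIRBuilder &OMPBuilder,
                                   IRBuilder<> &Builder,
                                   OpenMPIRBuilder::InsertPointTy AllocaIP,
                                   Value *ScanVar, Type *ScanVarTy,
                                   Value *Stop) {
  Expected<ScanInfo *> SIOrErr = OMPBuilder.scanInfoInitialize();
  if (!SIOrErr)
    return SIOrErr.takeError();
  ScanInfo *SI = *SIOrErr;

  // Invoked for both the input loop and the scan loop; the createScan call
  // inside it splits the body into the before-scan and after-scan regions.
  auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy CodeGenIP,
                       Value *IV) -> Error {
    Builder.restoreIP(CodeGenIP);
    // ... code before the scan directive (updates the private copy) ...
    OpenMPIRBuilder::InsertPointOrErrorTy AfterIP = OMPBuilder.createScan(
        OpenMPIRBuilder::LocationDescription(Builder), AllocaIP, {ScanVar},
        {ScanVarTy}, /*IsInclusive=*/true, SI);
    if (!AfterIP)
      return AfterIP.takeError();
    Builder.restoreIP(*AfterIP);
    // ... code after the scan directive (reads the scanned value) ...
    return Error::success();
  };

  // Emits both loops; the temporary buffer is written at the end of the input
  // loop and read at the beginning of the scan loop, as described above.
  auto LoopsOrErr = OMPBuilder.createCanonicalScanLoops(
      OpenMPIRBuilder::LocationDescription(Builder), BodyGenCB,
      /*Start=*/Builder.getInt32(0), Stop, /*Step=*/Builder.getInt32(1),
      /*IsSigned=*/false, /*InclusiveStop=*/false, /*ComputeIP=*/{}, "scan",
      SI);
  if (!LoopsOrErr)
    return LoopsOrErr.takeError();
  return Error::success();
}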
4217
4218} // end namespace llvm
4219
4220#endif // LLVM_FRONTEND_OPENMP_OMPIRBUILDER_H