LLVM 22.0.0git
NVPTXSubtarget.h
Go to the documentation of this file.
1//=====-- NVPTXSubtarget.h - Define Subtarget for the NVPTX ---*- C++ -*--====//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file declares the NVPTX specific subclass of TargetSubtarget.
10//
11//===----------------------------------------------------------------------===//
12
13#ifndef LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
14#define LLVM_LIB_TARGET_NVPTX_NVPTXSUBTARGET_H
15
16#include "NVPTX.h"
17#include "NVPTXFrameLowering.h"
18#include "NVPTXISelLowering.h"
19#include "NVPTXInstrInfo.h"
20#include "NVPTXRegisterInfo.h"
22#include "llvm/IR/DataLayout.h"
24#include <string>
25
26#define GET_SUBTARGETINFO_HEADER
27#include "NVPTXGenSubtargetInfo.inc"
28
29namespace llvm {
30
32 virtual void anchor();
33 std::string TargetName;
34
35 // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
36 unsigned PTXVersion;
37
38 // Full SM version x.y is represented as 100*x+10*y+feature, e.g. 3.1 == 310
39 // sm_90a == 901
40 unsigned int FullSmVersion;
41
42 // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31. Derived from
43 // FullSmVersion.
44 unsigned int SmVersion;
45
46 NVPTXInstrInfo InstrInfo;
48 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
49
50 // NVPTX does not have a call stack frame, but it needs an NVPTX-specific
51 // FrameLowering class because TargetFrameLowering is abstract.
52 NVPTXFrameLowering FrameLowering;
53
54public:
55 /// This constructor initializes the data members to match that
56 /// of the specified module.
57 ///
58 NVPTXSubtarget(const Triple &TT, const std::string &CPU,
59 const std::string &FS, const NVPTXTargetMachine &TM);
60
61 ~NVPTXSubtarget() override;
62
// Returns a pointer to the FrameLowering member held by this subtarget.
63 const TargetFrameLowering *getFrameLowering() const override {
64 return &FrameLowering;
65 }
// Returns a pointer to the InstrInfo member held by this subtarget.
66 const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
// Returns the address of the register info obtained from
// InstrInfo.getRegisterInfo(); the subtarget keeps no separate register-info
// member of its own in the declarations shown here.
67 const NVPTXRegisterInfo *getRegisterInfo() const override {
68 return &InstrInfo.getRegisterInfo();
69 }
// Returns the address of TLInfo.
// NOTE(review): the declaration of TLInfo is not visible in this listing;
// presumably it is an NVPTXTargetLowering data member -- confirm.
70 const NVPTXTargetLowering *getTargetLowering() const override {
71 return &TLInfo;
72 }
73
74 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
75
76 // Checks PTX version and family-specific and architecture-specific SM
77 // versions. For example, sm_100{f/a} and any future variants in the same
78 // family will match for any PTX version greater than or equal to
79 // `PTXVersion`.
80 bool hasPTXWithFamilySMs(unsigned PTXVersion,
81 ArrayRef<unsigned> SMVersions) const;
82 // Checks PTX version and architecture-specific SM versions.
83 // For example, sm_100{a} will match for any PTX version greater than or equal
84 // to `PTXVersion`.
85 bool hasPTXWithAccelSMs(unsigned PTXVersion,
86 ArrayRef<unsigned> SMVersions) const;
87
88 bool has256BitVectorLoadStore(unsigned AS) const {
89 return SmVersion >= 100 && PTXVersion >= 88 &&
91 }
// Version-gated feature predicates. Each one compares SmVersion and/or
// PTXVersion against a minimum, using the 10*x+y encoding documented on those
// fields: e.g. `SmVersion >= 53` means sm_53 or newer, and `PTXVersion >= 63`
// means PTX 6.3 or newer. Predicates that test both values require both
// minimums to hold. allowFP16Math() is only declared in this run; its
// definition is not part of this header listing.
92 bool hasAtomAddF64() const { return SmVersion >= 60; }
93 bool hasAtomScope() const { return SmVersion >= 60; }
94 bool hasAtomBitwise64() const { return SmVersion >= 32; }
95 bool hasAtomMinMax64() const { return SmVersion >= 32; }
96 bool hasAtomCas16() const { return SmVersion >= 70 && PTXVersion >= 63; }
97 bool hasAtomSwap128() const { return SmVersion >= 90 && PTXVersion >= 83; }
98 bool hasClusters() const { return SmVersion >= 90 && PTXVersion >= 78; }
99 bool hasLDG() const { return SmVersion >= 32; }
100 bool hasHWROT32() const { return SmVersion >= 32; }
101 bool hasFP16Math() const { return SmVersion >= 53; }
102 bool hasBF16Math() const { return SmVersion >= 80; }
103 bool allowFP16Math() const;
// Unlike its neighbours, hasMaskOperator() depends on the PTX version only.
104 bool hasMaskOperator() const { return PTXVersion >= 71; }
105 bool hasNoReturn() const { return SmVersion >= 30 && PTXVersion >= 64; }
106 // Does SM & PTX support memory orderings (weak and atomic: relaxed, acquire,
107 // release, acq_rel, sc) ?
// True only when targeting sm_70 or newer together with PTX 6.0 or newer.
108 bool hasMemoryOrdering() const { return SmVersion >= 70 && PTXVersion >= 60; }
109 // Does SM & PTX support .acquire and .release qualifiers for fence?
111 return SmVersion >= 90 && PTXVersion >= 86;
112 }
113 // Does SM & PTX support atomic relaxed MMIO operations ?
// True only when targeting sm_70 or newer together with PTX 8.2 or newer.
114 bool hasRelaxedMMIO() const { return SmVersion >= 70 && PTXVersion >= 82; }
// True only when targeting sm_61 or newer together with PTX 5.0 or newer
// (both values use the 10*x+y encoding).
115 bool hasDotInstructions() const {
116 return SmVersion >= 61 && PTXVersion >= 50;
117 }
118 // Tcgen05 instructions in Blackwell family
120 bool HasTcgen05 = false;
121 unsigned MinPTXVersion = 86;
122 switch (FullSmVersion) {
123 default:
124 break;
125 case 1003: // sm_100a
126 case 1013: // sm_101a
127 HasTcgen05 = true;
128 break;
129 case 1103: // sm_110a
130 HasTcgen05 = true;
131 MinPTXVersion = 90;
132 break;
133 case 1033: // sm_103a
134 HasTcgen05 = true;
135 MinPTXVersion = 88;
136 break;
137 }
138
139 return HasTcgen05 && PTXVersion >= MinPTXVersion;
140 }
141
142 // Checks support for the following instructions:
143 // - tcgen05.ld/st
144 // - tcgen05.alloc/dealloc/relinquish
145 // - tcgen05.cp
146 // - tcgen05.fence/wait
147 // - tcgen05.commit
149 // sm_101 renamed to sm_110 in PTX 9.0
150 return hasPTXWithFamilySMs(90, {100, 110}) ||
151 hasPTXWithFamilySMs(88, {100, 101}) ||
152 hasPTXWithAccelSMs(86, {100, 101});
153 }
154
155 // Checks tcgen05.shift instruction support.
157 // sm_101 renamed to sm_110 in PTX 9.0
158 return hasPTXWithAccelSMs(90, {100, 110, 103}) ||
159 hasPTXWithAccelSMs(88, {100, 101, 103}) ||
160 hasPTXWithAccelSMs(86, {100, 101});
161 }
162
164 return FullSmVersion == 1003 && PTXVersion >= 86;
165 }
166 // f32x2 instructions in Blackwell family
167 bool hasF32x2Instructions() const;
168
169 // TMA G2S copy with cta_group::1/2 support
171 // TODO: Update/tidy-up after the family-conditional support arrives
172 switch (FullSmVersion) {
173 case 1003:
174 case 1013:
175 return PTXVersion >= 86;
176 case 1033:
177 return PTXVersion >= 88;
178 default:
179 return false;
180 }
181 }
182
183 // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
184 // terminates a basic block. Instead, it would assume that control flow
185 // continued to the next instruction. The next instruction could be in the
186 // block that's lexically below it. This would lead to phantom CFG edges
187 // being created within ptxas. This issue was fixed in CUDA 12.3. Thus, when
188 // targeting PTX ISA version 8.3 or later we can confidently say that the bug
189 // will not be present. Returns true for any PTX version below 8.3.
190 bool hasPTXASUnreachableBug() const { return PTXVersion < 83; }
// True only when targeting sm_70 or newer together with PTX 7.7 or newer.
191 bool hasCvtaParam() const { return SmVersion >= 70 && PTXVersion >= 77; }
// Returns FullSmVersion unchanged, i.e. in the 100*x+10*y+feature encoding.
192 unsigned int getFullSmVersion() const { return FullSmVersion; }
// Returns the SM version in the 10*x+y encoding. It is recomputed from
// FullSmVersion by integer division by 10, which drops the trailing feature
// digit, rather than read from the SmVersion member.
193 unsigned int getSmVersion() const { return getFullSmVersion() / 10; }
// Returns FullSmVersion / 100 (integer division), i.e. the `x` component of
// the 100*x+10*y+feature encoding; both the minor digit and the feature digit
// are discarded.
194 unsigned int getSmFamilyVersion() const { return getFullSmVersion() / 100; }
195 // GPUs with "a" suffix have architecture-accelerated features that are
196 // supported on the specified architecture only, hence such targets do not
197 // follow the onion layer model. hasArchAccelFeatures() allows distinguishing
198 // such GPU variants from the base GPU architecture.
199 // - false represents non-accelerated architecture.
200 // - true represents architecture-accelerated variant.
// The check tests the low bit of FullSmVersion, which is set for the "a"
// encodings that appear in this class (e.g. 901 for sm_90a, 1003 for sm_100a),
// and additionally requires PTX 8.0 or newer.
201 bool hasArchAccelFeatures() const {
202 return (getFullSmVersion() & 1) && PTXVersion >= 80;
203 }
204 // GPUs with 'f' suffix have architecture-accelerated features which are
205 // portable across all future architectures under the same SM major. For example,
206 // sm_100f features will work for sm_10X*f*/sm_10X*a* future architectures.
207 // - false represents non-family-specific architecture.
208 // - true represents family-specific variant.
210 return getFullSmVersion() % 10 == 2 ? PTXVersion >= 88
212 }
213 // If the user did not provide a target we default to the `sm_30` target.
// Returns a copy of TargetName, or the literal "sm_30" when TargetName is
// empty. Use hasTargetName() to tell the two cases apart.
214 std::string getTargetName() const {
215 return TargetName.empty() ? "sm_30" : TargetName;
216 }
// True when TargetName is non-empty, i.e. when getTargetName() would return
// the stored name rather than its "sm_30" fallback.
217 bool hasTargetName() const { return !TargetName.empty(); }
218
219 bool hasNativeBF16Support(int Opcode) const;
220
221 // Get maximum value of required alignments among the supported data types.
222 // From the PTX ISA doc, section 8.2.3:
223 // The memory consistency model relates operations executed on memory
224 // locations with scalar data-types, which have a maximum size and alignment
225 // of 64 bits. Memory operations with a vector data-type are modelled as a
226 // set of equivalent memory operations with a scalar data-type, executed in
227 // an unspecified order on the elements in the vector.
// NOTE(review): the constant 8 is presumably in bytes (8 bytes == the 64 bits
// quoted above) -- confirm the unit against callers.
228 unsigned getMaxRequiredAlignment() const { return 8; }
229 // Get the smallest cmpxchg word size that the hardware supports.
// The value is a fixed 32 (bits, per the function name) and does not depend
// on SmVersion or PTXVersion.
230 unsigned getMinCmpXchgSizeInBits() const { return 32; }
231
// Returns PTXVersion unchanged, i.e. in the 10*x+y encoding (PTX 3.1 == 31).
232 unsigned getPTXVersion() const { return PTXVersion; }
233
236
237 void failIfClustersUnsupported(std::string const &FailureMessage) const;
238};
239
240} // End llvm namespace
241
242#endif
NVPTX address space definition.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:41
bool hasCpAsyncBulkTensorCTAGroupSupport() const
const NVPTXInstrInfo * getInstrInfo() const override
void failIfClustersUnsupported(std::string const &FailureMessage) const
bool hasPTXWithAccelSMs(unsigned PTXVersion, ArrayRef< unsigned > SMVersions) const
std::string getTargetName() const
unsigned getMaxRequiredAlignment() const
bool hasAtomMinMax64() const
bool hasTcgen05InstSupport() const
bool hasAtomAddF64() const
bool hasSplitAcquireAndReleaseFences() const
bool hasClusters() const
bool hasMaskOperator() const
const NVPTXTargetLowering * getTargetLowering() const override
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
unsigned getMinCmpXchgSizeInBits() const
unsigned getPTXVersion() const
bool hasCvtaParam() const
~NVPTXSubtarget() override
bool hasNativeBF16Support(int Opcode) const
const NVPTXRegisterInfo * getRegisterInfo() const override
unsigned int getFullSmVersion() const
unsigned int getSmVersion() const
bool hasDotInstructions() const
bool hasTcgen05MMAScaleInputDImm() const
bool hasFamilySpecificFeatures() const
bool hasTcgen05Instructions() const
bool hasAtomBitwise64() const
bool hasPTXWithFamilySMs(unsigned PTXVersion, ArrayRef< unsigned > SMVersions) const
bool hasTcgen05ShiftSupport() const
bool hasRelaxedMMIO() const
bool hasTargetName() const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
unsigned int getSmFamilyVersion() const
const TargetFrameLowering * getFrameLowering() const override
bool hasAtomScope() const
bool hasAtomCas16() const
NVPTXSubtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const NVPTXTargetMachine &TM)
This constructor initializes the data members to match that of the specified module.
bool hasMemoryOrdering() const
bool hasArchAccelFeatures() const
NVPTXSubtarget & initializeSubtargetDependencies(StringRef CPU, StringRef FS)
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool has256BitVectorLoadStore(unsigned AS) const
bool hasPTXASUnreachableBug() const
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Information about stack frame layout on the target.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
This is an optimization pass for GlobalISel generic memory operations.